unprint 0.16.3 → 0.16.4-beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -0
- package/package.json +5 -4
- package/playwright.config.js +81 -0
- package/src/app.js +141 -33
- package/tests/browser.js +42 -0
package/README.md
CHANGED
|
@@ -202,11 +202,19 @@ Extracts the CSS `url()` background from a style attribute. Alias for `query.sty
|
|
|
202
202
|
### HTTP request
|
|
203
203
|
* `unprint.get(url, [options])`
|
|
204
204
|
* `unprint.post(url, body, [options])`
|
|
205
|
+
* `unprint.request(url, body, [options], [method])`
|
|
205
206
|
|
|
206
207
|
Options
|
|
207
208
|
* `select`: Pre-query and initialize a specific element on the page
|
|
208
209
|
* `selectAll`: Pre-query and initialize multiple specific elements on the page
|
|
209
210
|
|
|
211
|
+
Use Playwright with Chromium (experimental)
|
|
212
|
+
* `unprint.browserRequest(url, [options])`
|
|
213
|
+
* `unprint.closeAllBrowsers()`
|
|
214
|
+
|
|
215
|
+
Additional options
|
|
216
|
+
* `browser`: Options object passed to Playwright's `page.goto()`
|
|
217
|
+
|
|
210
218
|
Returns
|
|
211
219
|
```javascript
|
|
212
220
|
{
|
package/package.json
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unprint",
|
|
3
|
-
"version": "0.16.3",
|
|
3
|
+
"version": "0.16.4-beta",
|
|
4
4
|
"description": "Simplify common web scraping tasks while staying in control of the data.",
|
|
5
5
|
"main": "src/app.js",
|
|
6
|
-
"scripts": {
|
|
7
|
-
"test": "echo \"Error: no test specified\" && exit 1"
|
|
8
|
-
},
|
|
6
|
+
"scripts": {},
|
|
9
7
|
"repository": {
|
|
10
8
|
"type": "git",
|
|
11
9
|
"url": "git+https://github.com/ThePendulum/unprint.git"
|
|
@@ -30,10 +28,13 @@
|
|
|
30
28
|
"eslint-config-airbnb-base": "^15.0.0",
|
|
31
29
|
"jsdom": "^17.0.0",
|
|
32
30
|
"moment-timezone": "^0.5.34",
|
|
31
|
+
"patchright": "^1.56.1",
|
|
33
32
|
"srcset": "^4.0.0",
|
|
34
33
|
"tunnel": "^0.0.6"
|
|
35
34
|
},
|
|
36
35
|
"devDependencies": {
|
|
36
|
+
"@playwright/test": "^1.56.1",
|
|
37
|
+
"@types/node": "^24.10.0",
|
|
37
38
|
"express": "^4.18.1"
|
|
38
39
|
}
|
|
39
40
|
}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
// @ts-check
import { defineConfig, devices } from '@playwright/test';

// Environment variables could be loaded from a file with dotenv
// (https://github.com/motdotla/dotenv); that is intentionally not wired up here.

// CI runs are stricter: no stray `test.only`, retries enabled, single worker.
const isCI = Boolean(process.env.CI);

// Project name -> Playwright device descriptor for the desktop browsers under test.
// Mobile viewports (e.g. 'Pixel 5', 'iPhone 12') or branded channels
// (e.g. 'msedge', 'chrome') can be added as further projects when needed.
const desktopBrowsers = [
  ['chromium', 'Desktop Chrome'],
  ['firefox', 'Desktop Firefox'],
  ['webkit', 'Desktop Safari'],
];

/**
 * Playwright test runner configuration.
 * @see https://playwright.dev/docs/test-configuration
 */
export default defineConfig({
  testDir: './e2e',
  // Run tests in files in parallel.
  fullyParallel: true,
  // Fail the build on CI if test.only was accidentally left in the source code.
  forbidOnly: isCI,
  // Retry on CI only.
  retries: isCI ? 2 : 0,
  // Opt out of parallel tests on CI.
  workers: isCI ? 1 : undefined,
  // See https://playwright.dev/docs/test-reporters
  reporter: 'html',
  // Settings shared by all projects; see https://playwright.dev/docs/api/class-testoptions.
  // No baseURL is set, so tests navigate with absolute URLs.
  use: {
    // Collect a trace when retrying a failed test; see https://playwright.dev/docs/trace-viewer
    trace: 'on-first-retry',
  },
  projects: desktopBrowsers.map(([name, device]) => ({
    name,
    use: { ...devices[device] },
  })),
  // A `webServer` entry can be added here to start a local dev server before the tests.
});
|
|
81
|
+
|
package/src/app.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
const { JSDOM, VirtualConsole } = require('jsdom');
|
|
4
|
+
const { chromium } = require('patchright');
|
|
4
5
|
const EventEmitter = require('events');
|
|
5
6
|
const http = require('http');
|
|
6
7
|
const https = require('https');
|
|
@@ -1041,7 +1042,143 @@ function setProxy(instance, options, url) {
|
|
|
1041
1042
|
|
|
1042
1043
|
return false;
|
|
1043
1044
|
}
|
|
1045
|
+
|
|
1046
|
+
const clients = new Map();
|
|
1047
|
+
|
|
1044
1048
|
/* eslint-enable no-param-reassign */
|
|
1049
|
+
// Launches that are still in flight, keyed by scope, so concurrent callers
// share a single browser instead of each starting their own.
const pendingClients = new Map();

/**
 * Returns the browser client for a scope, launching it on first use.
 *
 * A client is `{ context, browser }`: a Chromium browser launched with
 * `headless: false` and one browser context using the `unprint` user agent.
 * Ready clients are cached in `clients`; calls made for the same scope while
 * the launch is still in progress await the same launch.
 *
 * @param {string} scope - Key identifying which shared browser to use.
 * @returns {Promise<{context: object, browser: object}>} The client for the scope.
 */
async function getBrowserInstance(scope) {
	if (clients.has(scope)) {
		return clients.get(scope);
	}

	if (pendingClients.has(scope)) {
		return pendingClients.get(scope);
	}

	const pending = (async () => {
		const browser = await chromium.launch({
			headless: false,
		});

		try {
			const context = await browser.newContext({
				userAgent: 'unprint',
			});

			const client = { context, browser };

			clients.set(scope, client);

			return client;
		} catch (error) {
			// Don't leave a browser process behind that nobody holds a reference to.
			await browser.close();
			throw error;
		}
	})();

	pendingClients.set(scope, pending);

	try {
		return await pending;
	} finally {
		// Success is now served from `clients`; a failure may be retried by the next call.
		pendingClients.delete(scope);
	}
}
|
|
1068
|
+
|
|
1069
|
+
/**
 * Closes every browser held in `clients` and empties the cache.
 *
 * The cache is cleared before closing so that a later request launches a
 * fresh browser instead of being handed one that has already been closed.
 *
 * @returns {Promise<void>} Resolves once all browsers have closed.
 */
async function closeAllBrowsers() {
	const openClients = Array.from(clients.values());

	clients.clear();

	await Promise.all(openClients.map((client) => client.browser.close()));
}
|
|
1072
|
+
|
|
1073
|
+
/**
 * Builds the result object for a successful response.
 *
 * Every result carries `ok: true`, the status, status text, headers and the
 * raw response (as both `response` and `res`). On top of that:
 * - a JSON/JavaScript content type with object data adds `data`;
 * - otherwise, with `options.extract` disabled, nothing is added;
 * - otherwise the data is initialized into a query context, using
 *   `options.selectAll` when set and `options.select` when not.
 *
 * @param {object} res - Response with `status`, `statusText`, `headers` and `data`.
 * @param {object} options - Merged request options (`extract`, `select`, `selectAll`).
 * @param {object} meta - Request details.
 * @param {string} meta.url - Requested URL, passed to the context as `origin`.
 * @param {object} meta.customOptions - Caller options forwarded to the context.
 * @returns {object} The curated result.
 */
function curateResponse(res, options, { url, customOptions }) {
	const result = {
		ok: true,
		status: res.status,
		statusText: res.statusText,
		headers: res.headers,
		response: res,
		res,
	};

	const structuredTypes = ['application/json', 'application/javascript'];
	const contentType = res.headers['content-type'];
	const hasStructuredType = structuredTypes.some((type) => contentType?.includes(type));

	if (hasStructuredType && typeof res.data === 'object') {
		return { ...result, data: res.data };
	}

	if (!options.extract) {
		return result;
	}

	const contextOptions = { ...customOptions, origin: url };

	if (options.selectAll) {
		return {
			...result,
			context: initAll(res.data, options.selectAll, contextOptions),
		};
	}

	return {
		...result,
		context: init(res.data, options.select, contextOptions),
	};
}
|
|
1108
|
+
|
|
1109
|
+
/**
 * Loads a URL in a browser page and returns the rendered document.
 *
 * The request is scheduled through the limiter for the URL. It opens a new
 * page in the browser context of `options.scope`, navigates to the URL, and
 * on a 2xx status waits for the load state, runs the optional `control`
 * callback and curates the page content like a regular request. The page is
 * always closed afterwards, including on non-2xx responses and on errors.
 *
 * @param {string} url - Address to navigate to.
 * @param {object} [customOptions] - Request options, merged over the global options.
 * @param {string} [customOptions.scope='main'] - Which shared browser instance to use.
 * @param {object} [customOptions.browser] - Options spread into `page.goto()`.
 * @param {Function} [customOptions.control] - Async callback invoked as
 *   `control(page, { context, browser })` before the content is read.
 * @param {boolean} [customOptions.extract=true] - Whether to initialize a query context.
 * @returns {Promise<object>} `{ ok: false, status, statusText, headers, response, res }`
 *   for a non-2xx status, otherwise the curated response.
 * @throws {Error} If the navigation yields no response, or if navigation or
 *   `control` fails.
 */
async function browserRequest(url, customOptions = {}) {
	const options = merge.all([{
		timeout: 1000,
		extract: true,
		scope: 'main',
		url,
	}, globalOptions, customOptions]);

	const { limiter, interval, concurrency } = getLimiter(url, options);

	const feedbackBase = {
		url,
		method: 'get',
		interval,
		concurrency,
		isProxied: false,
		options,
	};

	return limiter.schedule(async () => {
		const { context, browser } = await getBrowserInstance(options.scope);
		const page = await context.newPage();

		try {
			const res = await page.goto(url, {
				...options.browser,
			});

			// Playwright resolves goto() with null for navigations that produce no
			// main-resource response (e.g. about:blank or a hash-only change).
			if (!res) {
				throw new Error(`No response received while navigating to ${url}`);
			}

			const status = res.status();
			const statusText = res.statusText();
			const headers = await res.allHeaders();

			if (!(status >= 200 && status < 300)) {
				handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText})`), 'HTTP_NOT_OK');

				events.emit('requestError', {
					...feedbackBase,
					status,
					statusText,
				});

				return {
					ok: false,
					status,
					statusText,
					headers,
					response: res,
					res,
				};
			}

			events.emit('requestSuccess', {
				...feedbackBase,
				status,
				statusText,
			});

			await page.waitForLoadState();

			if (customOptions.control) {
				await customOptions.control(page, { context, browser });
			}

			events.emit('controlSuccess', feedbackBase);

			const data = await page.content();

			return curateResponse({
				data,
				status,
				statusText,
				headers,
			}, options, { url, customOptions });
		} finally {
			// The browser is shared per scope and stays open; only this page is released.
			await page.close();
		}
	});
}
|
|
1045
1182
|
|
|
1046
1183
|
async function request(url, body, customOptions = {}, method = 'GET') {
|
|
1047
1184
|
const options = merge.all([{
|
|
@@ -1099,45 +1236,13 @@ async function request(url, body, customOptions = {}, method = 'GET') {
|
|
|
1099
1236
|
};
|
|
1100
1237
|
}
|
|
1101
1238
|
|
|
1102
|
-
const base = {
|
|
1103
|
-
ok: true,
|
|
1104
|
-
status: res.status,
|
|
1105
|
-
statusText: res.statusText,
|
|
1106
|
-
headers: res.headers,
|
|
1107
|
-
response: res,
|
|
1108
|
-
res,
|
|
1109
|
-
};
|
|
1110
|
-
|
|
1111
1239
|
events.emit('requestSuccess', {
|
|
1112
1240
|
...feedbackBase,
|
|
1113
1241
|
status: res.status,
|
|
1114
1242
|
statusText: res.statusText,
|
|
1115
1243
|
});
|
|
1116
1244
|
|
|
1117
|
-
|
|
1118
|
-
return {
|
|
1119
|
-
...base,
|
|
1120
|
-
data: res.data,
|
|
1121
|
-
};
|
|
1122
|
-
}
|
|
1123
|
-
|
|
1124
|
-
if (!options.extract) {
|
|
1125
|
-
return base;
|
|
1126
|
-
}
|
|
1127
|
-
|
|
1128
|
-
const contextOptions = {
|
|
1129
|
-
...customOptions,
|
|
1130
|
-
origin: url,
|
|
1131
|
-
};
|
|
1132
|
-
|
|
1133
|
-
const context = options.selectAll
|
|
1134
|
-
? initAll(res.data, options.selectAll, contextOptions)
|
|
1135
|
-
: init(res.data, options.select, contextOptions);
|
|
1136
|
-
|
|
1137
|
-
return {
|
|
1138
|
-
...base,
|
|
1139
|
-
context,
|
|
1140
|
-
};
|
|
1245
|
+
return curateResponse(res, options, { url, customOptions });
|
|
1141
1246
|
}
|
|
1142
1247
|
|
|
1143
1248
|
async function get(url, options) {
|
|
@@ -1164,6 +1269,9 @@ module.exports = {
|
|
|
1164
1269
|
get,
|
|
1165
1270
|
post,
|
|
1166
1271
|
request,
|
|
1272
|
+
browserRequest,
|
|
1273
|
+
browser: browserRequest,
|
|
1274
|
+
closeAllBrowsers,
|
|
1167
1275
|
initialize: init,
|
|
1168
1276
|
initializeAll: initAll,
|
|
1169
1277
|
init,
|
package/tests/browser.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const unprint = require('../src/app');
|
|
4
|
+
|
|
5
|
+
// Resolves after the given number of milliseconds.
const delay = (ms) => new Promise((resolve) => {
	setTimeout(resolve, ms);
});

/**
 * Manual smoke test for the browser requests.
 *
 * Starts one slow request, starts a second one a second later while the
 * first is still loading, then fetches a page and logs the text of its `h2`
 * elements. Browsers are closed even when a request fails, so the process
 * is not left waiting on an open browser.
 *
 * @returns {Promise<void>} Resolves when all requests finished and browsers closed.
 */
async function initTest() {
	try {
		await Promise.all([
			unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=5000', {
				headless: false,
				async control(_page) {
					// no page interaction needed
				},
			}),
			delay(1000).then(() => unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=2000', {
				headless: false,
				async control(_page) {
					// no page interaction needed
				},
			})),
		]);

		const res = await unprint.browser('https://www.scrapingcourse.com/', {
			headless: false,
			async control(_page) {
				// no page interaction needed
			},
		});

		const cards = res.context.query.contents('h2');

		console.log('CARD TITLES', cards);
	} finally {
		await unprint.closeAllBrowsers();
	}
}

// Report failures and signal them through the exit code instead of leaving
// an unhandled rejection.
initTest().catch((error) => {
	console.error(error);
	process.exitCode = 1;
});
|