unprint 0.16.3 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -2
- package/package.json +5 -4
- package/playwright.config.js +81 -0
- package/src/app.js +143 -33
- package/tests/browser.js +42 -0
package/README.md
CHANGED
|
@@ -202,10 +202,26 @@ Extracts the CSS `url()` background from a style attribute. Alias for `query.sty
|
|
|
202
202
|
### HTTP request
|
|
203
203
|
* `unprint.get(url, [options])`
|
|
204
204
|
* `unprint.post(url, body, [options])`
|
|
205
|
+
* `unprint.request(url, body, [options], [method])`
|
|
205
206
|
|
|
206
207
|
Options
|
|
207
|
-
* `select`: Pre-query and initialize a specific element on the page
|
|
208
|
-
* `selectAll`: Pre-query and initialize multiple specific element on the page
|
|
208
|
+
* `select`: Pre-query and initialize a specific element on the page.
|
|
209
|
+
* `selectAll`: Pre-query and initialize multiple specific elements on the page.
|
|
210
|
+
|
|
211
|
+
Use Playwright with Chromium (experimental)
|
|
212
|
+
* `unprint.browserRequest(url, [options])`
|
|
213
|
+
* `unprint.closeAllBrowsers()`
|
|
214
|
+
|
|
215
|
+
Additional options
|
|
216
|
+
* `scope`: Browser instance to (re)use, default `main`.
|
|
217
|
+
* `browser`: Options object passed to Playwright's `launch`. Only applied when the scope's browser is first launched, so changing it requires a new scope.
|
|
218
|
+
* `browser.headless`: Headless mode, set to `false` to launch visible browser, default `true`.
|
|
219
|
+
* `context`: Options object passed to Playwright's `newContext`. Only applied when the scope's context is first created, so changing it requires a new scope.
|
|
220
|
+
* `page`: Options object passed to Playwright's `goto`.
|
|
221
|
+
|
|
222
|
+
This requires you to install the Chromium executable:
|
|
223
|
+
* `sudo npx patchright install-deps`
|
|
224
|
+
* `npx patchright install`
|
|
209
225
|
|
|
210
226
|
Returns
|
|
211
227
|
```javascript
|
package/package.json
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unprint",
|
|
3
|
-
"version": "0.16.3",
|
|
3
|
+
"version": "0.17.0",
|
|
4
4
|
"description": "Simplify common web scraping tasks while staying in control of the data.",
|
|
5
5
|
"main": "src/app.js",
|
|
6
|
-
"scripts": {
|
|
7
|
-
"test": "echo \"Error: no test specified\" && exit 1"
|
|
8
|
-
},
|
|
6
|
+
"scripts": {},
|
|
9
7
|
"repository": {
|
|
10
8
|
"type": "git",
|
|
11
9
|
"url": "git+https://github.com/ThePendulum/unprint.git"
|
|
@@ -30,10 +28,13 @@
|
|
|
30
28
|
"eslint-config-airbnb-base": "^15.0.0",
|
|
31
29
|
"jsdom": "^17.0.0",
|
|
32
30
|
"moment-timezone": "^0.5.34",
|
|
31
|
+
"patchright": "^1.56.1",
|
|
33
32
|
"srcset": "^4.0.0",
|
|
34
33
|
"tunnel": "^0.0.6"
|
|
35
34
|
},
|
|
36
35
|
"devDependencies": {
|
|
36
|
+
"@playwright/test": "^1.56.1",
|
|
37
|
+
"@types/node": "^24.10.0",
|
|
37
38
|
"express": "^4.18.1"
|
|
38
39
|
}
|
|
39
40
|
}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
import { defineConfig, devices } from '@playwright/test';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Read environment variables from file.
|
|
6
|
+
* https://github.com/motdotla/dotenv
|
|
7
|
+
*/
|
|
8
|
+
// import dotenv from 'dotenv';
|
|
9
|
+
// import path from 'path';
|
|
10
|
+
// dotenv.config({ path: path.resolve(__dirname, '.env') });
|
|
11
|
+
|
|
12
|
+
// Several settings below are stricter on CI; evaluate the flag once.
const isCI = Boolean(process.env.CI);

// Desktop browser projects, as [project name, Playwright device descriptor] pairs.
// Mobile viewports (e.g. 'Pixel 5', 'iPhone 12') or branded channels (e.g. 'msedge', 'chrome')
// can be added as extra projects when needed.
const desktopProjects = [
	['chromium', 'Desktop Chrome'],
	['firefox', 'Desktop Firefox'],
	['webkit', 'Desktop Safari'],
];

/**
 * Playwright test runner configuration.
 *
 * @see https://playwright.dev/docs/test-configuration
 */
export default defineConfig({
	testDir: './e2e',
	// run tests in files in parallel
	fullyParallel: true,
	// fail the CI build if test.only was accidentally left in the source
	forbidOnly: isCI,
	// retry on CI only
	retries: isCI ? 2 : 0,
	// opt out of parallel workers on CI
	workers: isCI ? 1 : undefined,
	// see https://playwright.dev/docs/test-reporters
	reporter: 'html',
	// shared settings for all projects, see https://playwright.dev/docs/api/class-testoptions
	use: {
		// collect a trace when retrying a failed test, see https://playwright.dev/docs/trace-viewer
		trace: 'on-first-retry',
	},
	projects: desktopProjects.map(([name, device]) => ({
		name,
		use: { ...devices[device] },
	})),
});
|
|
81
|
+
|
package/src/app.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
const { JSDOM, VirtualConsole } = require('jsdom');
|
|
4
|
+
const { chromium } = require('patchright');
|
|
4
5
|
const EventEmitter = require('events');
|
|
5
6
|
const http = require('http');
|
|
6
7
|
const https = require('https');
|
|
@@ -1041,7 +1042,145 @@ function setProxy(instance, options, url) {
|
|
|
1041
1042
|
|
|
1042
1043
|
return false;
|
|
1043
1044
|
}
|
|
1045
|
+
|
|
1046
|
+
const clients = new Map();
|
|
1047
|
+
|
|
1044
1048
|
/* eslint-enable no-param-reassign */
|
|
1049
|
+
// Launches that are still in flight, keyed by scope, so concurrent callers share one browser.
const pendingClients = new Map();

/**
 * Return the browser client for a scope, launching Chromium on first use.
 *
 * The launch and context options are only applied when the scope is first created;
 * later calls for the same scope return the cached client unchanged.
 *
 * @param {string} scope - Key identifying the browser instance to (re)use.
 * @param {object} [options] - Request options.
 * @param {object} [options.browser] - Options spread into `chromium.launch`, `headless` defaults to `true`.
 * @param {object} [options.context] - Options spread into `browser.newContext`, `userAgent` defaults to `unprint`.
 * @returns {Promise<{ context: object, browser: object }>} The cached or newly created client.
 */
async function getBrowserInstance(scope, options = {}) {
	if (clients.has(scope)) {
		return clients.get(scope);
	}

	// Another caller is already launching this scope; wait for that launch instead of starting
	// a second browser that would overwrite the first and never be closed.
	if (pendingClients.has(scope)) {
		return pendingClients.get(scope);
	}

	const launch = (async () => {
		const browser = await chromium.launch({
			headless: true,
			...options.browser,
		});

		try {
			const context = await browser.newContext({
				userAgent: 'unprint',
				...options.context,
			});

			const client = { context, browser };

			clients.set(scope, client);

			return client;
		} catch (error) {
			// Don't leave an orphaned browser behind; the context error is the one worth reporting,
			// so a failure to close is deliberately ignored here.
			await browser.close().catch(() => {});

			throw error;
		}
	})();

	pendingClients.set(scope, launch);

	try {
		return await launch;
	} finally {
		// Whether the launch succeeded (now cached in `clients`) or failed (retry allowed), it is no longer pending.
		pendingClients.delete(scope);
	}
}
|
|
1070
|
+
|
|
1071
|
+
/**
 * Close every cached browser and forget the cached clients.
 *
 * @returns {Promise<void>} Resolves once all browsers have been closed.
 */
async function closeAllBrowsers() {
	const openClients = Array.from(clients.values());

	// Drop the cache entries first, so a later request launches a fresh browser instead of reusing a closed one.
	clients.clear();

	await Promise.all(openClients.map((client) => client.browser.close()));
}
|
|
1074
|
+
|
|
1075
|
+
/**
 * Shape a raw response into the result object handed back to callers.
 *
 * @param {object} res - Response exposing `status`, `statusText`, `headers` and `data`.
 * @param {object} options - Merged request options; `extract`, `select` and `selectAll` are read here.
 * @param {object} meta - Request details.
 * @param {string} meta.url - Requested URL, passed to the query context as `origin`.
 * @param {object} meta.customOptions - Caller-supplied options, forwarded to the query context.
 * @returns {object} The base result, extended with `data` for object payloads served as JSON/JavaScript,
 * or with `context` when extraction is enabled.
 */
function curateResponse(res, options, { url, customOptions }) {
	const { status, statusText, headers } = res;

	const curated = {
		ok: true,
		status,
		statusText,
		headers,
		response: res,
		res,
	};

	const contentType = headers['content-type'];
	const hasStructuredType = ['application/json', 'application/javascript'].some((type) => contentType?.includes(type));

	// payloads that already arrived as an object are returned as-is instead of being queried
	if (hasStructuredType && typeof res.data === 'object') {
		return { ...curated, data: res.data };
	}

	if (!options.extract) {
		return curated;
	}

	const contextOptions = { ...customOptions, origin: url };

	if (options.selectAll) {
		return {
			...curated,
			context: initAll(res.data, options.selectAll, contextOptions),
		};
	}

	return {
		...curated,
		context: init(res.data, options.select, contextOptions),
	};
}
|
|
1110
|
+
|
|
1111
|
+
/**
 * Load a URL in a Chromium page and curate the result through `curateResponse`.
 *
 * The call is scheduled on the limiter returned by `getLimiter`. The page is always closed
 * before the promise settles, including when the response is not OK or a step throws.
 *
 * @param {string} url - Address to navigate to.
 * @param {object} [customOptions] - Options merged over the defaults and the global options.
 * @param {string} [customOptions.scope='main'] - Browser instance to (re)use.
 * @param {object} [customOptions.page] - Options spread into `page.goto`.
 * @param {Function} [customOptions.control] - Async callback invoked as `control(page, { context, browser })`
 * after the page has loaded and before its content is read.
 * @returns {Promise<object>} The curated result, or an object with `ok: false` when the status is outside 200-299.
 */
async function browserRequest(url, customOptions = {}) {
	// NOTE(review): `timeout` is merged into the options but is not forwarded to `page.goto` — confirm intent.
	const options = merge.all([{
		timeout: 1000,
		extract: true,
		scope: 'main',
		url,
	}, globalOptions, customOptions]);

	const { limiter, interval, concurrency } = getLimiter(url, options);

	const feedbackBase = {
		url,
		method: 'get',
		interval,
		concurrency,
		isProxied: false,
		options,
	};

	return limiter.schedule(async () => {
		const { context, browser } = await getBrowserInstance(options.scope, options);
		const page = await context.newPage();

		try {
			const res = await page.goto(url, {
				...options.page,
			});

			const status = res.status();
			const statusText = res.statusText();
			const headers = await res.allHeaders();

			if (!(status >= 200 && status < 300)) {
				// Playwright responses expose no `data` property, so only the status is reported.
				handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText})`), 'HTTP_NOT_OK');

				events.emit('requestError', {
					...feedbackBase,
					status,
					statusText,
				});

				return {
					ok: false,
					status,
					statusText,
					headers,
					response: res,
					res,
				};
			}

			events.emit('requestSuccess', feedbackBase);

			await page.waitForLoadState();

			if (customOptions.control) {
				await customOptions.control(page, { context, browser });
			}

			events.emit('controlSuccess', feedbackBase);

			const data = await page.content();

			return curateResponse({
				data,
				status,
				statusText,
				headers,
			}, options, { url, customOptions });
		} finally {
			// The context is shared per scope; close the page on every path so failed requests don't pile up open pages.
			await page.close();
		}
	});
}
|
|
1045
1184
|
|
|
1046
1185
|
async function request(url, body, customOptions = {}, method = 'GET') {
|
|
1047
1186
|
const options = merge.all([{
|
|
@@ -1099,45 +1238,13 @@ async function request(url, body, customOptions = {}, method = 'GET') {
|
|
|
1099
1238
|
};
|
|
1100
1239
|
}
|
|
1101
1240
|
|
|
1102
|
-
const base = {
|
|
1103
|
-
ok: true,
|
|
1104
|
-
status: res.status,
|
|
1105
|
-
statusText: res.statusText,
|
|
1106
|
-
headers: res.headers,
|
|
1107
|
-
response: res,
|
|
1108
|
-
res,
|
|
1109
|
-
};
|
|
1110
|
-
|
|
1111
1241
|
events.emit('requestSuccess', {
|
|
1112
1242
|
...feedbackBase,
|
|
1113
1243
|
status: res.status,
|
|
1114
1244
|
statusText: res.statusText,
|
|
1115
1245
|
});
|
|
1116
1246
|
|
|
1117
|
-
|
|
1118
|
-
return {
|
|
1119
|
-
...base,
|
|
1120
|
-
data: res.data,
|
|
1121
|
-
};
|
|
1122
|
-
}
|
|
1123
|
-
|
|
1124
|
-
if (!options.extract) {
|
|
1125
|
-
return base;
|
|
1126
|
-
}
|
|
1127
|
-
|
|
1128
|
-
const contextOptions = {
|
|
1129
|
-
...customOptions,
|
|
1130
|
-
origin: url,
|
|
1131
|
-
};
|
|
1132
|
-
|
|
1133
|
-
const context = options.selectAll
|
|
1134
|
-
? initAll(res.data, options.selectAll, contextOptions)
|
|
1135
|
-
: init(res.data, options.select, contextOptions);
|
|
1136
|
-
|
|
1137
|
-
return {
|
|
1138
|
-
...base,
|
|
1139
|
-
context,
|
|
1140
|
-
};
|
|
1247
|
+
return curateResponse(res, options, { url, customOptions });
|
|
1141
1248
|
}
|
|
1142
1249
|
|
|
1143
1250
|
async function get(url, options) {
|
|
@@ -1164,6 +1271,9 @@ module.exports = {
|
|
|
1164
1271
|
get,
|
|
1165
1272
|
post,
|
|
1166
1273
|
request,
|
|
1274
|
+
browserRequest,
|
|
1275
|
+
browser: browserRequest,
|
|
1276
|
+
closeAllBrowsers,
|
|
1167
1277
|
initialize: init,
|
|
1168
1278
|
initializeAll: initAll,
|
|
1169
1279
|
init,
|
package/tests/browser.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const unprint = require('../src/app');
|
|
4
|
+
|
|
5
|
+
// Options shared by every request in this script. `headless` is a launch option and has to be
// nested under `browser`; as a top-level option it is ignored. Launch options are only applied
// when the scope's browser is first launched.
const requestOptions = {
	browser: {
		headless: false,
	},
	async control() {
		// placeholder, only exercises the control hook
	},
};

/**
 * Manual smoke test for the browser requests: two overlapping slow requests sharing the default scope,
 * followed by a request whose `h2` contents are printed.
 */
async function initTest() {
	try {
		await Promise.all([
			unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=5000', requestOptions),
			// start the second request while the first one is still loading
			new Promise((resolve) => {
				setTimeout(resolve, 1000);
			}).then(() => unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=2000', requestOptions)),
		]);

		const res = await unprint.browser('https://www.scrapingcourse.com/', requestOptions);
		const cards = res.context.query.contents('h2');

		console.log('CARD TITLES', cards);
	} finally {
		// close the browsers even when a request fails, otherwise the process keeps running
		await unprint.closeAllBrowsers();
	}
}

initTest().catch((error) => {
	console.error(error);
	process.exitCode = 1;
});
|