unprint 0.16.2 → 0.16.4-beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -0
- package/package.json +5 -4
- package/playwright.config.js +81 -0
- package/src/app.js +150 -36
- package/tests/browser.js +42 -0
- package/tests/init.js +1 -0
package/README.md
CHANGED
|
@@ -59,6 +59,9 @@ The selector can be a CSS selector, an XPath selector starting with `/` or `(`,
|
|
|
59
59
|
#### Querying multiple elements
|
|
60
60
|
Most methods can be used in plural, returning an array of results, i.e. `query.elements()`, `query.dates()`.
|
|
61
61
|
|
|
62
|
+
Options
|
|
63
|
+
* `filterDuplicates`: When an array of selectors results in the same element being selected multiple times, ensure each element is only returned once, default `true`.
|
|
64
|
+
|
|
62
65
|
#### Query an element
|
|
63
66
|
* `query.element([selector], [options])`
|
|
64
67
|
|
|
@@ -199,11 +202,19 @@ Extracts the CSS `url()` background from a style attribute. Alias for `query.sty
|
|
|
199
202
|
### HTTP request
|
|
200
203
|
* `unprint.get(url, [options])`
|
|
201
204
|
* `unprint.post(url, body, [options])`
|
|
205
|
+
* `unprint.request(url, body, [options], [method])`
|
|
202
206
|
|
|
203
207
|
Options
|
|
204
208
|
* `select`: Pre-query and initialize a specific element on the page
|
|
205
209
|
* `selectAll`: Pre-query and initialize multiple specific elements on the page
|
|
206
210
|
|
|
211
|
+
Use Playwright with Chromium (experimental)
|
|
212
|
+
* `unprint.browserRequest(url, [options])`
|
|
213
|
+
* `unprint.closeAllBrowsers()`
|
|
214
|
+
|
|
215
|
+
Additional options
|
|
216
|
+
* `browser`: Options object passed to Playwright
|
|
217
|
+
|
|
207
218
|
Returns
|
|
208
219
|
```javascript
|
|
209
220
|
{
|
package/package.json
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unprint",
|
|
3
|
-
"version": "0.16.2",
|
|
3
|
+
"version": "0.16.4-beta",
|
|
4
4
|
"description": "Simplify common web scraping tasks while staying in control of the data.",
|
|
5
5
|
"main": "src/app.js",
|
|
6
|
-
"scripts": {
|
|
7
|
-
"test": "echo \"Error: no test specified\" && exit 1"
|
|
8
|
-
},
|
|
6
|
+
"scripts": {},
|
|
9
7
|
"repository": {
|
|
10
8
|
"type": "git",
|
|
11
9
|
"url": "git+https://github.com/ThePendulum/unprint.git"
|
|
@@ -30,10 +28,13 @@
|
|
|
30
28
|
"eslint-config-airbnb-base": "^15.0.0",
|
|
31
29
|
"jsdom": "^17.0.0",
|
|
32
30
|
"moment-timezone": "^0.5.34",
|
|
31
|
+
"patchright": "^1.56.1",
|
|
33
32
|
"srcset": "^4.0.0",
|
|
34
33
|
"tunnel": "^0.0.6"
|
|
35
34
|
},
|
|
36
35
|
"devDependencies": {
|
|
36
|
+
"@playwright/test": "^1.56.1",
|
|
37
|
+
"@types/node": "^24.10.0",
|
|
37
38
|
"express": "^4.18.1"
|
|
38
39
|
}
|
|
39
40
|
}
|
package/playwright.config.js
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
import { defineConfig, devices } from '@playwright/test';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Read environment variables from file.
|
|
6
|
+
* https://github.com/motdotla/dotenv
|
|
7
|
+
*/
|
|
8
|
+
// import dotenv from 'dotenv';
|
|
9
|
+
// import path from 'path';
|
|
10
|
+
// dotenv.config({ path: path.resolve(__dirname, '.env') });
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* @see https://playwright.dev/docs/test-configuration
|
|
14
|
+
*/
|
|
15
|
+
export default defineConfig({
|
|
16
|
+
testDir: './e2e',
|
|
17
|
+
/* Run tests in files in parallel */
|
|
18
|
+
fullyParallel: true,
|
|
19
|
+
/* Fail the build on CI if you accidentally left test.only in the source code. */
|
|
20
|
+
forbidOnly: !!process.env.CI,
|
|
21
|
+
/* Retry on CI only */
|
|
22
|
+
retries: process.env.CI ? 2 : 0,
|
|
23
|
+
/* Opt out of parallel tests on CI. */
|
|
24
|
+
workers: process.env.CI ? 1 : undefined,
|
|
25
|
+
/* Reporter to use. See https://playwright.dev/docs/test-reporters */
|
|
26
|
+
reporter: 'html',
|
|
27
|
+
/* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */
|
|
28
|
+
use: {
|
|
29
|
+
/* Base URL to use in actions like `await page.goto('')`. */
|
|
30
|
+
// baseURL: 'http://localhost:3000',
|
|
31
|
+
|
|
32
|
+
/* Collect trace when retrying the failed test. See https://playwright.dev/docs/trace-viewer */
|
|
33
|
+
trace: 'on-first-retry',
|
|
34
|
+
},
|
|
35
|
+
|
|
36
|
+
/* Configure projects for major browsers */
|
|
37
|
+
projects: [
|
|
38
|
+
{
|
|
39
|
+
name: 'chromium',
|
|
40
|
+
use: { ...devices['Desktop Chrome'] },
|
|
41
|
+
},
|
|
42
|
+
|
|
43
|
+
{
|
|
44
|
+
name: 'firefox',
|
|
45
|
+
use: { ...devices['Desktop Firefox'] },
|
|
46
|
+
},
|
|
47
|
+
|
|
48
|
+
{
|
|
49
|
+
name: 'webkit',
|
|
50
|
+
use: { ...devices['Desktop Safari'] },
|
|
51
|
+
},
|
|
52
|
+
|
|
53
|
+
/* Test against mobile viewports. */
|
|
54
|
+
// {
|
|
55
|
+
// name: 'Mobile Chrome',
|
|
56
|
+
// use: { ...devices['Pixel 5'] },
|
|
57
|
+
// },
|
|
58
|
+
// {
|
|
59
|
+
// name: 'Mobile Safari',
|
|
60
|
+
// use: { ...devices['iPhone 12'] },
|
|
61
|
+
// },
|
|
62
|
+
|
|
63
|
+
/* Test against branded browsers. */
|
|
64
|
+
// {
|
|
65
|
+
// name: 'Microsoft Edge',
|
|
66
|
+
// use: { ...devices['Desktop Edge'], channel: 'msedge' },
|
|
67
|
+
// },
|
|
68
|
+
// {
|
|
69
|
+
// name: 'Google Chrome',
|
|
70
|
+
// use: { ...devices['Desktop Chrome'], channel: 'chrome' },
|
|
71
|
+
// },
|
|
72
|
+
],
|
|
73
|
+
|
|
74
|
+
/* Run your local dev server before starting the tests */
|
|
75
|
+
// webServer: {
|
|
76
|
+
// command: 'npm run start',
|
|
77
|
+
// url: 'http://localhost:3000',
|
|
78
|
+
// reuseExistingServer: !process.env.CI,
|
|
79
|
+
// },
|
|
80
|
+
});
|
|
81
|
+
|
package/src/app.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
const { JSDOM, VirtualConsole } = require('jsdom');
|
|
4
|
+
const { chromium } = require('patchright');
|
|
4
5
|
const EventEmitter = require('events');
|
|
5
6
|
const http = require('http');
|
|
6
7
|
const https = require('https');
|
|
@@ -102,14 +103,20 @@ function queryElement(context, selectors, _customOptions) {
|
|
|
102
103
|
return target || null;
|
|
103
104
|
}
|
|
104
105
|
|
|
105
|
-
function queryElements(context, selectors,
|
|
106
|
+
function queryElements(context, selectors, customOptions = {}) {
|
|
106
107
|
if (!selectors) {
|
|
107
108
|
return context.element;
|
|
108
109
|
}
|
|
109
110
|
|
|
110
|
-
const
|
|
111
|
+
const options = customOptions;
|
|
112
|
+
const targets = [].concat(selectors).reduce((acc, selector) => acc.concat(getElements(context, selector, false)), []).filter(Boolean);
|
|
111
113
|
|
|
112
|
-
|
|
114
|
+
if (options.filterDuplicates === false) {
|
|
115
|
+
return targets || [];
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
// findIndex always finds first index, if current index is not the first index, it's a dupe
|
|
119
|
+
return targets.filter((target, index, array) => index === array.findIndex((dupe) => target === dupe));
|
|
113
120
|
}
|
|
114
121
|
|
|
115
122
|
function queryExistence(context, selector, customOptions) {
|
|
@@ -1035,7 +1042,143 @@ function setProxy(instance, options, url) {
|
|
|
1035
1042
|
|
|
1036
1043
|
return false;
|
|
1037
1044
|
}
|
|
1045
|
+
|
|
1046
|
+
const clients = new Map();
|
|
1047
|
+
|
|
1038
1048
|
/* eslint-enable no-param-reassign */
|
|
1049
|
+
async function getBrowserInstance(scope) {
|
|
1050
|
+
if (clients.has(scope)) {
|
|
1051
|
+
return clients.get(scope);
|
|
1052
|
+
}
|
|
1053
|
+
|
|
1054
|
+
const browser = await chromium.launch({
|
|
1055
|
+
headless: false,
|
|
1056
|
+
});
|
|
1057
|
+
|
|
1058
|
+
const context = await browser.newContext({
|
|
1059
|
+
userAgent: 'unprint',
|
|
1060
|
+
});
|
|
1061
|
+
|
|
1062
|
+
const client = { context, browser };
|
|
1063
|
+
|
|
1064
|
+
clients.set(scope, client);
|
|
1065
|
+
|
|
1066
|
+
return client;
|
|
1067
|
+
}
|
|
1068
|
+
|
|
1069
|
+
async function closeAllBrowsers() {
|
|
1070
|
+
await Promise.all(Array.from(clients.values()).map(async (client) => client.browser.close()));
|
|
1071
|
+
}
|
|
1072
|
+
|
|
1073
|
+
function curateResponse(res, options, { url, customOptions }) {
|
|
1074
|
+
const base = {
|
|
1075
|
+
ok: true,
|
|
1076
|
+
status: res.status,
|
|
1077
|
+
statusText: res.statusText,
|
|
1078
|
+
headers: res.headers,
|
|
1079
|
+
response: res,
|
|
1080
|
+
res,
|
|
1081
|
+
};
|
|
1082
|
+
|
|
1083
|
+
if (['application/json', 'application/javascript'].some((type) => res.headers['content-type']?.includes(type)) && typeof res.data === 'object') {
|
|
1084
|
+
return {
|
|
1085
|
+
...base,
|
|
1086
|
+
data: res.data,
|
|
1087
|
+
};
|
|
1088
|
+
}
|
|
1089
|
+
|
|
1090
|
+
if (!options.extract) {
|
|
1091
|
+
return base;
|
|
1092
|
+
}
|
|
1093
|
+
|
|
1094
|
+
const contextOptions = {
|
|
1095
|
+
...customOptions,
|
|
1096
|
+
origin: url,
|
|
1097
|
+
};
|
|
1098
|
+
|
|
1099
|
+
const context = options.selectAll
|
|
1100
|
+
? initAll(res.data, options.selectAll, contextOptions)
|
|
1101
|
+
: init(res.data, options.select, contextOptions);
|
|
1102
|
+
|
|
1103
|
+
return {
|
|
1104
|
+
...base,
|
|
1105
|
+
context,
|
|
1106
|
+
};
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
async function browserRequest(url, customOptions = {}) {
|
|
1110
|
+
const options = merge.all([{
|
|
1111
|
+
timeout: 1000,
|
|
1112
|
+
extract: true,
|
|
1113
|
+
scope: 'main',
|
|
1114
|
+
url,
|
|
1115
|
+
}, globalOptions, customOptions]);
|
|
1116
|
+
|
|
1117
|
+
const { limiter, interval, concurrency } = getLimiter(url, options);
|
|
1118
|
+
|
|
1119
|
+
const feedbackBase = {
|
|
1120
|
+
url,
|
|
1121
|
+
method: 'get',
|
|
1122
|
+
interval,
|
|
1123
|
+
concurrency,
|
|
1124
|
+
isProxied: false,
|
|
1125
|
+
options,
|
|
1126
|
+
};
|
|
1127
|
+
|
|
1128
|
+
return limiter.schedule(async () => {
|
|
1129
|
+
const { context, browser } = await getBrowserInstance(options.scope);
|
|
1130
|
+
const page = await context.newPage();
|
|
1131
|
+
|
|
1132
|
+
const res = await page.goto(url, {
|
|
1133
|
+
...options.browser,
|
|
1134
|
+
});
|
|
1135
|
+
|
|
1136
|
+
const status = res.status();
|
|
1137
|
+
const statusText = res.statusText();
|
|
1138
|
+
const headers = await res.allHeaders();
|
|
1139
|
+
|
|
1140
|
+
if (!(status >= 200 && status < 300)) {
|
|
1141
|
+
handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText}): ${res.data}`), 'HTTP_NOT_OK');
|
|
1142
|
+
|
|
1143
|
+
events.emit('requestError', {
|
|
1144
|
+
...feedbackBase,
|
|
1145
|
+
status,
|
|
1146
|
+
statusText,
|
|
1147
|
+
});
|
|
1148
|
+
|
|
1149
|
+
return {
|
|
1150
|
+
ok: false,
|
|
1151
|
+
status,
|
|
1152
|
+
statusText,
|
|
1153
|
+
headers,
|
|
1154
|
+
response: res,
|
|
1155
|
+
res,
|
|
1156
|
+
};
|
|
1157
|
+
}
|
|
1158
|
+
|
|
1159
|
+
events.emit('requestSuccess', feedbackBase);
|
|
1160
|
+
|
|
1161
|
+
await page.waitForLoadState();
|
|
1162
|
+
|
|
1163
|
+
if (customOptions.control) {
|
|
1164
|
+
await customOptions.control(page, { context, browser });
|
|
1165
|
+
}
|
|
1166
|
+
|
|
1167
|
+
events.emit('controlSuccess', feedbackBase);
|
|
1168
|
+
|
|
1169
|
+
const data = await page.content();
|
|
1170
|
+
|
|
1171
|
+
await page.close();
|
|
1172
|
+
// await browser.close();
|
|
1173
|
+
|
|
1174
|
+
return curateResponse({
|
|
1175
|
+
data,
|
|
1176
|
+
status,
|
|
1177
|
+
statusText,
|
|
1178
|
+
headers,
|
|
1179
|
+
}, options, { url, customOptions });
|
|
1180
|
+
});
|
|
1181
|
+
}
|
|
1039
1182
|
|
|
1040
1183
|
async function request(url, body, customOptions = {}, method = 'GET') {
|
|
1041
1184
|
const options = merge.all([{
|
|
@@ -1093,45 +1236,13 @@ async function request(url, body, customOptions = {}, method = 'GET') {
|
|
|
1093
1236
|
};
|
|
1094
1237
|
}
|
|
1095
1238
|
|
|
1096
|
-
const base = {
|
|
1097
|
-
ok: true,
|
|
1098
|
-
status: res.status,
|
|
1099
|
-
statusText: res.statusText,
|
|
1100
|
-
headers: res.headers,
|
|
1101
|
-
response: res,
|
|
1102
|
-
res,
|
|
1103
|
-
};
|
|
1104
|
-
|
|
1105
1239
|
events.emit('requestSuccess', {
|
|
1106
1240
|
...feedbackBase,
|
|
1107
1241
|
status: res.status,
|
|
1108
1242
|
statusText: res.statusText,
|
|
1109
1243
|
});
|
|
1110
1244
|
|
|
1111
|
-
|
|
1112
|
-
return {
|
|
1113
|
-
...base,
|
|
1114
|
-
data: res.data,
|
|
1115
|
-
};
|
|
1116
|
-
}
|
|
1117
|
-
|
|
1118
|
-
if (!options.extract) {
|
|
1119
|
-
return base;
|
|
1120
|
-
}
|
|
1121
|
-
|
|
1122
|
-
const contextOptions = {
|
|
1123
|
-
...customOptions,
|
|
1124
|
-
origin: url,
|
|
1125
|
-
};
|
|
1126
|
-
|
|
1127
|
-
const context = options.selectAll
|
|
1128
|
-
? initAll(res.data, options.selectAll, contextOptions)
|
|
1129
|
-
: init(res.data, options.select, contextOptions);
|
|
1130
|
-
|
|
1131
|
-
return {
|
|
1132
|
-
...base,
|
|
1133
|
-
context,
|
|
1134
|
-
};
|
|
1245
|
+
return curateResponse(res, options, { url, customOptions });
|
|
1135
1246
|
}
|
|
1136
1247
|
|
|
1137
1248
|
async function get(url, options) {
|
|
@@ -1158,6 +1269,9 @@ module.exports = {
|
|
|
1158
1269
|
get,
|
|
1159
1270
|
post,
|
|
1160
1271
|
request,
|
|
1272
|
+
browserRequest,
|
|
1273
|
+
browser: browserRequest,
|
|
1274
|
+
closeAllBrowsers,
|
|
1161
1275
|
initialize: init,
|
|
1162
1276
|
initializeAll: initAll,
|
|
1163
1277
|
init,
|
package/tests/browser.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const unprint = require('../src/app');
|
|
4
|
+
|
|
5
|
+
async function initTest() {
|
|
6
|
+
await Promise.all([
|
|
7
|
+
unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=5000', {
|
|
8
|
+
headless: false,
|
|
9
|
+
async control(_page) {
|
|
10
|
+
//
|
|
11
|
+
},
|
|
12
|
+
}),
|
|
13
|
+
new Promise((resolve) => {
|
|
14
|
+
setTimeout(() => {
|
|
15
|
+
resolve();
|
|
16
|
+
}, 1000);
|
|
17
|
+
}).then(async () => {
|
|
18
|
+
await unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=2000', {
|
|
19
|
+
headless: false,
|
|
20
|
+
async control(_page) {
|
|
21
|
+
//
|
|
22
|
+
},
|
|
23
|
+
});
|
|
24
|
+
}),
|
|
25
|
+
]);
|
|
26
|
+
|
|
27
|
+
const res = await unprint.browser('https://www.scrapingcourse.com/', {
|
|
28
|
+
// await unprint.browser('https://www.scrapingcourse.com/', {
|
|
29
|
+
headless: false,
|
|
30
|
+
async control(_page) {
|
|
31
|
+
//
|
|
32
|
+
},
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
const cards = res.context.query.contents('h2');
|
|
36
|
+
|
|
37
|
+
console.log('CARD TITLES', cards);
|
|
38
|
+
|
|
39
|
+
await unprint.closeAllBrowsers();
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
initTest();
|
package/tests/init.js
CHANGED
|
@@ -47,6 +47,7 @@ async function initTest() {
|
|
|
47
47
|
console.log('number indexed', res.context.query.number('.number', { match: /(\d+)/, matchIndex: 1 }));
|
|
48
48
|
console.log('data', res.context.query.json('#json'));
|
|
49
49
|
console.log('items', res.context.query.contents('.item'));
|
|
50
|
+
console.log('items css xpath array', res.context.query.contents(['.item', '//li[contains(@class, "number")]']));
|
|
50
51
|
console.log('link', res.context.query.url('#link'));
|
|
51
52
|
console.log('links', res.context.query.urls('.link'));
|
|
52
53
|
console.log('text', res.context.query.text('.text'));
|