unprint 0.16.2 → 0.16.4-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -59,6 +59,9 @@ The selector can be a CSS selector, an XPath selector starting with `/` or `(`,
59
59
  #### Querying multiple elements
60
60
  Most methods can be used in plural, returning an array of results, i.e. `query.elements()`, `query.dates()`.
61
61
 
62
+ Options
63
+ * `filterDuplicates`: When an array of selectors results in the same element being selected multiple times, ensure each element is only returned once, default `true`.
64
+
62
65
  #### Query an element
63
66
  * `query.element([selector], [options])`
64
67
 
@@ -199,11 +202,19 @@ Extracts the CSS `url()` background from a style attribute. Alias for `query.sty
199
202
  ### HTTP request
200
203
  * `unprint.get(url, [options])`
201
204
  * `unprint.post(url, body, [options])`
205
+ * `unprint.request(url, body, [options], [method])`
202
206
 
203
207
  Options
204
208
  * `select`: Pre-query and initialize a specific element on the page
205
209
  * `selectAll`: Pre-query and initialize multiple specific elements on the page
206
210
 
211
+ Use Playwright with Chromium (experimental)
212
+ * `unprint.browserRequest(url, [options])`
213
+ * `unprint.closeAllBrowsers()`
214
+
215
+ Additional options
216
+ * `browser`: Options object passed to Playwright's `page.goto()` call
217
+
207
218
  Returns
208
219
  ```javascript
209
220
  {
package/package.json CHANGED
@@ -1,11 +1,9 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.16.2",
3
+ "version": "0.16.4-beta",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
- "scripts": {
7
- "test": "echo \"Error: no test specified\" && exit 1"
8
- },
6
+ "scripts": {},
9
7
  "repository": {
10
8
  "type": "git",
11
9
  "url": "git+https://github.com/ThePendulum/unprint.git"
@@ -30,10 +28,13 @@
30
28
  "eslint-config-airbnb-base": "^15.0.0",
31
29
  "jsdom": "^17.0.0",
32
30
  "moment-timezone": "^0.5.34",
31
+ "patchright": "^1.56.1",
33
32
  "srcset": "^4.0.0",
34
33
  "tunnel": "^0.0.6"
35
34
  },
36
35
  "devDependencies": {
36
+ "@playwright/test": "^1.56.1",
37
+ "@types/node": "^24.10.0",
37
38
  "express": "^4.18.1"
38
39
  }
39
40
  }
@@ -0,0 +1,81 @@
1
+ // @ts-check
2
+ import { defineConfig, devices } from '@playwright/test';
3
+
4
+ /**
5
+ * Read environment variables from file.
6
+ * https://github.com/motdotla/dotenv
7
+ */
8
+ // import dotenv from 'dotenv';
9
+ // import path from 'path';
10
+ // dotenv.config({ path: path.resolve(__dirname, '.env') });
11
+
12
+ /**
13
+ * @see https://playwright.dev/docs/test-configuration
14
+ */
15
+ export default defineConfig({
16
+ testDir: './e2e',
17
+ /* Run tests in files in parallel */
18
+ fullyParallel: true,
19
+ /* Fail the build on CI if you accidentally left test.only in the source code. */
20
+ forbidOnly: !!process.env.CI,
21
+ /* Retry on CI only */
22
+ retries: process.env.CI ? 2 : 0,
23
+ /* Opt out of parallel tests on CI. */
24
+ workers: process.env.CI ? 1 : undefined,
25
+ /* Reporter to use. See https://playwright.dev/docs/test-reporters */
26
+ reporter: 'html',
27
+ /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */
28
+ use: {
29
+ /* Base URL to use in actions like `await page.goto('')`. */
30
+ // baseURL: 'http://localhost:3000',
31
+
32
+ /* Collect trace when retrying the failed test. See https://playwright.dev/docs/trace-viewer */
33
+ trace: 'on-first-retry',
34
+ },
35
+
36
+ /* Configure projects for major browsers */
37
+ projects: [
38
+ {
39
+ name: 'chromium',
40
+ use: { ...devices['Desktop Chrome'] },
41
+ },
42
+
43
+ {
44
+ name: 'firefox',
45
+ use: { ...devices['Desktop Firefox'] },
46
+ },
47
+
48
+ {
49
+ name: 'webkit',
50
+ use: { ...devices['Desktop Safari'] },
51
+ },
52
+
53
+ /* Test against mobile viewports. */
54
+ // {
55
+ // name: 'Mobile Chrome',
56
+ // use: { ...devices['Pixel 5'] },
57
+ // },
58
+ // {
59
+ // name: 'Mobile Safari',
60
+ // use: { ...devices['iPhone 12'] },
61
+ // },
62
+
63
+ /* Test against branded browsers. */
64
+ // {
65
+ // name: 'Microsoft Edge',
66
+ // use: { ...devices['Desktop Edge'], channel: 'msedge' },
67
+ // },
68
+ // {
69
+ // name: 'Google Chrome',
70
+ // use: { ...devices['Desktop Chrome'], channel: 'chrome' },
71
+ // },
72
+ ],
73
+
74
+ /* Run your local dev server before starting the tests */
75
+ // webServer: {
76
+ // command: 'npm run start',
77
+ // url: 'http://localhost:3000',
78
+ // reuseExistingServer: !process.env.CI,
79
+ // },
80
+ });
81
+
package/src/app.js CHANGED
@@ -1,6 +1,7 @@
1
1
  'use strict';
2
2
 
3
3
  const { JSDOM, VirtualConsole } = require('jsdom');
4
+ const { chromium } = require('patchright');
4
5
  const EventEmitter = require('events');
5
6
  const http = require('http');
6
7
  const https = require('https');
@@ -102,14 +103,20 @@ function queryElement(context, selectors, _customOptions) {
102
103
  return target || null;
103
104
  }
104
105
 
105
- function queryElements(context, selectors, _customOptions) {
106
+ function queryElements(context, selectors, customOptions = {}) {
106
107
  if (!selectors) {
107
108
  return context.element;
108
109
  }
109
110
 
110
- const targets = [].concat(selectors).reduce((acc, selector) => acc || getElements(context, selector, false), null);
111
+ const options = customOptions;
112
+ const targets = [].concat(selectors).reduce((acc, selector) => acc.concat(getElements(context, selector, false)), []).filter(Boolean);
111
113
 
112
- return targets || [];
114
+ if (options.filterDuplicates === false) {
115
+ return targets || [];
116
+ }
117
+
118
+ // findIndex always finds first index, if current index is not the first index, it's a dupe
119
+ return targets.filter((target, index, array) => index === array.findIndex((dupe) => target === dupe));
113
120
  }
114
121
 
115
122
  function queryExistence(context, selector, customOptions) {
@@ -1035,7 +1042,143 @@ function setProxy(instance, options, url) {
1035
1042
 
1036
1043
  return false;
1037
1044
  }
1045
+
1046
+ const clients = new Map();
1047
+
1038
1048
  /* eslint-enable no-param-reassign */
1049
+ async function getBrowserInstance(scope) {
1050
+ if (clients.has(scope)) {
1051
+ return clients.get(scope);
1052
+ }
1053
+
1054
+ const browser = await chromium.launch({
1055
+ headless: false,
1056
+ });
1057
+
1058
+ const context = await browser.newContext({
1059
+ userAgent: 'unprint',
1060
+ });
1061
+
1062
+ const client = { context, browser };
1063
+
1064
+ clients.set(scope, client);
1065
+
1066
+ return client;
1067
+ }
1068
+
1069
+ async function closeAllBrowsers() {
1070
+ await Promise.all(Array.from(clients.values()).map(async (client) => client.browser.close()));
1071
+ }
1072
+
1073
+ function curateResponse(res, options, { url, customOptions }) {
1074
+ const base = {
1075
+ ok: true,
1076
+ status: res.status,
1077
+ statusText: res.statusText,
1078
+ headers: res.headers,
1079
+ response: res,
1080
+ res,
1081
+ };
1082
+
1083
+ if (['application/json', 'application/javascript'].some((type) => res.headers['content-type']?.includes(type)) && typeof res.data === 'object') {
1084
+ return {
1085
+ ...base,
1086
+ data: res.data,
1087
+ };
1088
+ }
1089
+
1090
+ if (!options.extract) {
1091
+ return base;
1092
+ }
1093
+
1094
+ const contextOptions = {
1095
+ ...customOptions,
1096
+ origin: url,
1097
+ };
1098
+
1099
+ const context = options.selectAll
1100
+ ? initAll(res.data, options.selectAll, contextOptions)
1101
+ : init(res.data, options.select, contextOptions);
1102
+
1103
+ return {
1104
+ ...base,
1105
+ context,
1106
+ };
1107
+ }
1108
+
1109
+ async function browserRequest(url, customOptions = {}) {
1110
+ const options = merge.all([{
1111
+ timeout: 1000,
1112
+ extract: true,
1113
+ scope: 'main',
1114
+ url,
1115
+ }, globalOptions, customOptions]);
1116
+
1117
+ const { limiter, interval, concurrency } = getLimiter(url, options);
1118
+
1119
+ const feedbackBase = {
1120
+ url,
1121
+ method: 'get',
1122
+ interval,
1123
+ concurrency,
1124
+ isProxied: false,
1125
+ options,
1126
+ };
1127
+
1128
+ return limiter.schedule(async () => {
1129
+ const { context, browser } = await getBrowserInstance(options.scope);
1130
+ const page = await context.newPage();
1131
+
1132
+ const res = await page.goto(url, {
1133
+ ...options.browser,
1134
+ });
1135
+
1136
+ const status = res.status();
1137
+ const statusText = res.statusText();
1138
+ const headers = await res.allHeaders();
1139
+
1140
+ if (!(status >= 200 && status < 300)) {
1141
+ handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText}): ${res.data}`), 'HTTP_NOT_OK');
1142
+
1143
+ events.emit('requestError', {
1144
+ ...feedbackBase,
1145
+ status,
1146
+ statusText,
1147
+ });
1148
+
1149
+ return {
1150
+ ok: false,
1151
+ status,
1152
+ statusText,
1153
+ headers,
1154
+ response: res,
1155
+ res,
1156
+ };
1157
+ }
1158
+
1159
+ events.emit('requestSuccess', feedbackBase);
1160
+
1161
+ await page.waitForLoadState();
1162
+
1163
+ if (customOptions.control) {
1164
+ await customOptions.control(page, { context, browser });
1165
+ }
1166
+
1167
+ events.emit('controlSuccess', feedbackBase);
1168
+
1169
+ const data = await page.content();
1170
+
1171
+ await page.close();
1172
+ // await browser.close();
1173
+
1174
+ return curateResponse({
1175
+ data,
1176
+ status,
1177
+ statusText,
1178
+ headers,
1179
+ }, options, { url, customOptions });
1180
+ });
1181
+ }
1039
1182
 
1040
1183
  async function request(url, body, customOptions = {}, method = 'GET') {
1041
1184
  const options = merge.all([{
@@ -1093,45 +1236,13 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1093
1236
  };
1094
1237
  }
1095
1238
 
1096
- const base = {
1097
- ok: true,
1098
- status: res.status,
1099
- statusText: res.statusText,
1100
- headers: res.headers,
1101
- response: res,
1102
- res,
1103
- };
1104
-
1105
1239
  events.emit('requestSuccess', {
1106
1240
  ...feedbackBase,
1107
1241
  status: res.status,
1108
1242
  statusText: res.statusText,
1109
1243
  });
1110
1244
 
1111
- if (['application/json', 'application/javascript'].some((type) => res.headers['content-type'].includes(type)) && typeof res.data === 'object') {
1112
- return {
1113
- ...base,
1114
- data: res.data,
1115
- };
1116
- }
1117
-
1118
- if (!options.extract) {
1119
- return base;
1120
- }
1121
-
1122
- const contextOptions = {
1123
- ...customOptions,
1124
- origin: url,
1125
- };
1126
-
1127
- const context = options.selectAll
1128
- ? initAll(res.data, options.selectAll, contextOptions)
1129
- : init(res.data, options.select, contextOptions);
1130
-
1131
- return {
1132
- ...base,
1133
- context,
1134
- };
1245
+ return curateResponse(res, options, { url, customOptions });
1135
1246
  }
1136
1247
 
1137
1248
  async function get(url, options) {
@@ -1158,6 +1269,9 @@ module.exports = {
1158
1269
  get,
1159
1270
  post,
1160
1271
  request,
1272
+ browserRequest,
1273
+ browser: browserRequest,
1274
+ closeAllBrowsers,
1161
1275
  initialize: init,
1162
1276
  initializeAll: initAll,
1163
1277
  init,
@@ -0,0 +1,42 @@
1
+ 'use strict';
2
+
3
+ const unprint = require('../src/app');
4
+
5
+ async function initTest() {
6
+ await Promise.all([
7
+ unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=5000', {
8
+ headless: false,
9
+ async control(_page) {
10
+ //
11
+ },
12
+ }),
13
+ new Promise((resolve) => {
14
+ setTimeout(() => {
15
+ resolve();
16
+ }, 1000);
17
+ }).then(async () => {
18
+ await unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=2000', {
19
+ headless: false,
20
+ async control(_page) {
21
+ //
22
+ },
23
+ });
24
+ }),
25
+ ]);
26
+
27
+ const res = await unprint.browser('https://www.scrapingcourse.com/', {
28
+ // await unprint.browser('https://www.scrapingcourse.com/', {
29
+ headless: false,
30
+ async control(_page) {
31
+ //
32
+ },
33
+ });
34
+
35
+ const cards = res.context.query.contents('h2');
36
+
37
+ console.log('CARD TITLES', cards);
38
+
39
+ await unprint.closeAllBrowsers();
40
+ }
41
+
42
+ initTest();
package/tests/init.js CHANGED
@@ -47,6 +47,7 @@ async function initTest() {
47
47
  console.log('number indexed', res.context.query.number('.number', { match: /(\d+)/, matchIndex: 1 }));
48
48
  console.log('data', res.context.query.json('#json'));
49
49
  console.log('items', res.context.query.contents('.item'));
50
+ console.log('items css xpath array', res.context.query.contents(['.item', '//li[contains(@class, "number")]']));
50
51
  console.log('link', res.context.query.url('#link'));
51
52
  console.log('links', res.context.query.urls('.link'));
52
53
  console.log('text', res.context.query.text('.text'));