unprint 0.16.3 → 0.16.4-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -202,11 +202,19 @@ Extracts the CSS `url()` background from a style attribute. Alias for `query.sty
202
202
  ### HTTP request
203
203
  * `unprint.get(url, [options])`
204
204
  * `unprint.post(url, body, [options])`
205
+ * `unprint.request(url, body, [options], [method])`
205
206
 
206
207
  Options
207
208
  * `select`: Pre-query and initialize a specific element on the page
208
209
  * `selectAll`: Pre-query and initialize multiple specific elements on the page
209
210
 
211
+ Use Playwright with Chromium (experimental)
212
+ * `unprint.browserRequest(url, [options])`
213
+ * `unprint.closeAllBrowsers()`
214
+
215
+ Additional options
216
+ * `browser`: Options object passed to Playwright's `page.goto()` navigation call
217
+
210
218
  Returns
211
219
  ```javascript
212
220
  {
package/package.json CHANGED
@@ -1,11 +1,9 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.16.3",
3
+ "version": "0.16.4-beta",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
- "scripts": {
7
- "test": "echo \"Error: no test specified\" && exit 1"
8
- },
6
+ "scripts": {},
9
7
  "repository": {
10
8
  "type": "git",
11
9
  "url": "git+https://github.com/ThePendulum/unprint.git"
@@ -30,10 +28,13 @@
30
28
  "eslint-config-airbnb-base": "^15.0.0",
31
29
  "jsdom": "^17.0.0",
32
30
  "moment-timezone": "^0.5.34",
31
+ "patchright": "^1.56.1",
33
32
  "srcset": "^4.0.0",
34
33
  "tunnel": "^0.0.6"
35
34
  },
36
35
  "devDependencies": {
36
+ "@playwright/test": "^1.56.1",
37
+ "@types/node": "^24.10.0",
37
38
  "express": "^4.18.1"
38
39
  }
39
40
  }
@@ -0,0 +1,81 @@
1
+ // @ts-check
2
+ import { defineConfig, devices } from '@playwright/test';
3
+
4
+ /**
5
+ * Read environment variables from file.
6
+ * https://github.com/motdotla/dotenv
7
+ */
8
+ // import dotenv from 'dotenv';
9
+ // import path from 'path';
10
+ // dotenv.config({ path: path.resolve(__dirname, '.env') });
11
+
12
+ /**
13
+ * @see https://playwright.dev/docs/test-configuration
14
+ */
15
// Whether the CI environment variable is set; toggles the stricter CI-only settings below.
const isCI = Boolean(process.env.CI);

// Playwright project name -> device descriptor used for that project.
const desktopProfiles = {
  chromium: 'Desktop Chrome',
  firefox: 'Desktop Firefox',
  webkit: 'Desktop Safari',
};

/**
 * Playwright test runner configuration.
 * @see https://playwright.dev/docs/test-configuration
 */
export default defineConfig({
  testDir: './e2e',
  // Test files run in parallel locally; on CI a single worker is used instead.
  fullyParallel: true,
  // A stray `test.only` fails the run on CI.
  forbidOnly: isCI,
  // Retries are only enabled on CI.
  retries: isCI ? 2 : 0,
  workers: isCI ? 1 : undefined,
  reporter: 'html',
  // Settings shared by every project: record a trace on the first retry of a failed test.
  use: { trace: 'on-first-retry' },
  // One project per desktop browser profile.
  projects: Object.entries(desktopProfiles).map(([name, profile]) => ({
    name,
    use: { ...devices[profile] },
  })),
});
81
+
package/src/app.js CHANGED
@@ -1,6 +1,7 @@
1
1
  'use strict';
2
2
 
3
3
  const { JSDOM, VirtualConsole } = require('jsdom');
4
+ const { chromium } = require('patchright');
4
5
  const EventEmitter = require('events');
5
6
  const http = require('http');
6
7
  const https = require('https');
@@ -1041,7 +1042,143 @@ function setProxy(instance, options, url) {
1041
1042
 
1042
1043
  return false;
1043
1044
  }
1045
+
1046
// Browser clients keyed by scope. Each value is the *promise* of a
// `{ context, browser }` pair, so callers that arrive while a launch is still
// in flight share that launch instead of starting a second browser.
const clients = new Map();

/* eslint-enable no-param-reassign */

/**
 * Return the browser client for a scope, launching it on first use.
 *
 * The pending launch is cached synchronously, before the first await, so
 * concurrent calls with the same scope resolve to the same client. A failed
 * launch is evicted from the cache so a later call can retry.
 *
 * @param {*} scope - Cache key identifying the browser instance to reuse.
 * @returns {Promise<{ context: object, browser: object }>} The shared client.
 */
async function getBrowserInstance(scope) {
	const pending = clients.get(scope);

	if (pending) {
		return pending;
	}

	const launching = (async () => {
		const browser = await chromium.launch({
			headless: false,
		});

		try {
			const context = await browser.newContext({
				userAgent: 'unprint',
			});

			return { context, browser };
		} catch (error) {
			// The browser is already running at this point; don't leave it orphaned.
			await browser.close();
			throw error;
		}
	})();

	clients.set(scope, launching);

	try {
		return await launching;
	} catch (error) {
		// Only evict our own entry; the scope may have been re-populated meanwhile.
		if (clients.get(scope) === launching) {
			clients.delete(scope);
		}

		throw error;
	}
}

/**
 * Close every cached browser and empty the cache.
 *
 * The cache is cleared before closing so that a request issued afterwards
 * launches a fresh browser rather than receiving a closed one.
 *
 * @returns {Promise<void>} Resolves once all browsers have been closed.
 */
async function closeAllBrowsers() {
	const pendingClients = Array.from(clients.values());

	clients.clear();

	await Promise.all(pendingClients.map(async (launching) => {
		let client;

		try {
			client = await launching;
		} catch (error) {
			// The launch failed and was already reported to its caller; nothing to close.
			return;
		}

		await client.browser.close();
	}));
}
1072
+
1073
/**
 * Shape a successful response into the result object handed back to callers.
 *
 * - When the content type mentions JSON or JavaScript and the payload is an
 *   object, the payload is returned as `data`.
 * - Otherwise, when `options.extract` is falsy, only the base fields are returned.
 * - Otherwise the payload is initialized into a query context via `initAll`
 *   (when `options.selectAll` is set) or `init`, and returned as `context`.
 *
 * @param {object} res - Response carrying `status`, `statusText`, `headers` and `data`.
 * @param {object} options - Merged request options (`extract`, `select`, `selectAll`).
 * @param {object} meta
 * @param {string} meta.url - Request URL, passed on as the context `origin`.
 * @param {object} meta.customOptions - Caller options forwarded to `init` / `initAll`.
 * @returns {object} Base fields plus either `data` or `context`.
 */
function curateResponse(res, options, { url, customOptions }) {
	const {
		status,
		statusText,
		headers,
		data,
	} = res;

	const result = {
		ok: true,
		status,
		statusText,
		headers,
		response: res,
		res,
	};

	const contentType = headers['content-type'];
	const structuredTypes = ['application/json', 'application/javascript'];
	const hasStructuredType = structuredTypes.some((type) => contentType?.includes(type));

	if (hasStructuredType && typeof data === 'object') {
		return { ...result, data };
	}

	if (!options.extract) {
		return result;
	}

	const contextOptions = { ...customOptions, origin: url };

	let context;

	if (options.selectAll) {
		context = initAll(data, options.selectAll, contextOptions);
	} else {
		context = init(data, options.select, contextOptions);
	}

	return { ...result, context };
}
1108
+
1109
/**
 * Load a URL in a shared browser context and return a curated response.
 *
 * The navigation is scheduled through the limiter obtained from `getLimiter`.
 * A new page is opened in the browser context for `options.scope` and is
 * always closed again, whether the request succeeds, returns a non-OK status
 * or throws.
 *
 * @param {string} url - Address to navigate to.
 * @param {object} [customOptions] - Per-call options, merged over the defaults
 *   and `globalOptions`. `scope` selects the browser instance, `browser` is
 *   spread into the `page.goto()` options, and `control(page, { context, browser })`
 *   is awaited after the page has loaded, before its content is read.
 * @returns {Promise<object>} On a 2xx status, the result of `curateResponse`
 *   for the page content; otherwise `{ ok: false, status, statusText, headers, response, res }`.
 */
async function browserRequest(url, customOptions = {}) {
	const options = merge.all([{
		timeout: 1000,
		extract: true,
		scope: 'main',
		url,
	}, globalOptions, customOptions]);

	const { limiter, interval, concurrency } = getLimiter(url, options);

	const feedbackBase = {
		url,
		method: 'get',
		interval,
		concurrency,
		isProxied: false,
		options,
	};

	return limiter.schedule(async () => {
		const { context, browser } = await getBrowserInstance(options.scope);
		const page = await context.newPage();

		try {
			const res = await page.goto(url, {
				...options.browser,
			});

			// Playwright resolves `goto` with null for navigations that yield no
			// main resource response; fail with a clear message instead of a TypeError.
			if (!res) {
				throw new Error(`No response received from ${url}`);
			}

			const status = res.status();
			const statusText = res.statusText();
			const headers = await res.allHeaders();

			if (!(status >= 200 && status < 300)) {
				handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText})`), 'HTTP_NOT_OK');

				events.emit('requestError', {
					...feedbackBase,
					status,
					statusText,
				});

				return {
					ok: false,
					status,
					statusText,
					headers,
					response: res,
					res,
				};
			}

			events.emit('requestSuccess', {
				...feedbackBase,
				status,
				statusText,
			});

			await page.waitForLoadState();

			if (customOptions.control) {
				await customOptions.control(page, { context, browser });
			}

			events.emit('controlSuccess', feedbackBase);

			const data = await page.content();

			return curateResponse({
				data,
				status,
				statusText,
				headers,
			}, options, { url, customOptions });
		} finally {
			// Runs on every exit path so failed requests don't accumulate open pages.
			await page.close();
		}
	});
}
1045
1182
 
1046
1183
  async function request(url, body, customOptions = {}, method = 'GET') {
1047
1184
  const options = merge.all([{
@@ -1099,45 +1236,13 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1099
1236
  };
1100
1237
  }
1101
1238
 
1102
- const base = {
1103
- ok: true,
1104
- status: res.status,
1105
- statusText: res.statusText,
1106
- headers: res.headers,
1107
- response: res,
1108
- res,
1109
- };
1110
-
1111
1239
  events.emit('requestSuccess', {
1112
1240
  ...feedbackBase,
1113
1241
  status: res.status,
1114
1242
  statusText: res.statusText,
1115
1243
  });
1116
1244
 
1117
- if (['application/json', 'application/javascript'].some((type) => res.headers['content-type'].includes(type)) && typeof res.data === 'object') {
1118
- return {
1119
- ...base,
1120
- data: res.data,
1121
- };
1122
- }
1123
-
1124
- if (!options.extract) {
1125
- return base;
1126
- }
1127
-
1128
- const contextOptions = {
1129
- ...customOptions,
1130
- origin: url,
1131
- };
1132
-
1133
- const context = options.selectAll
1134
- ? initAll(res.data, options.selectAll, contextOptions)
1135
- : init(res.data, options.select, contextOptions);
1136
-
1137
- return {
1138
- ...base,
1139
- context,
1140
- };
1245
+ return curateResponse(res, options, { url, customOptions });
1141
1246
  }
1142
1247
 
1143
1248
  async function get(url, options) {
@@ -1164,6 +1269,9 @@ module.exports = {
1164
1269
  get,
1165
1270
  post,
1166
1271
  request,
1272
+ browserRequest,
1273
+ browser: browserRequest,
1274
+ closeAllBrowsers,
1167
1275
  initialize: init,
1168
1276
  initializeAll: initAll,
1169
1277
  init,
@@ -0,0 +1,42 @@
1
+ 'use strict';
2
+
3
+ const unprint = require('../src/app');
4
+
5
/**
 * Resolve after the given number of milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function delay(ms) {
	return new Promise((resolve) => {
		setTimeout(resolve, ms);
	});
}

/**
 * Manual smoke test for the browser-backed requests.
 *
 * Starts two overlapping requests against a slow endpoint (the second begins
 * one second after the first), then loads a page and prints the text of its
 * `h2` elements. All browsers are closed afterwards, even when a step fails.
 *
 * @returns {Promise<void>}
 */
async function initTest() {
	try {
		await Promise.all([
			unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=5000', {
				headless: false,
				async control(_page) {
					// no page interaction needed
				},
			}),
			delay(1000).then(() => unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=2000', {
				headless: false,
				async control(_page) {
					// no page interaction needed
				},
			})),
		]);

		const res = await unprint.browser('https://www.scrapingcourse.com/', {
			headless: false,
			async control(_page) {
				// no page interaction needed
			},
		});

		const cards = res.context.query.contents('h2');

		console.log('CARD TITLES', cards);
	} finally {
		// Always release the browsers, whether or not the steps above succeeded.
		await unprint.closeAllBrowsers();
	}
}

initTest().catch((error) => {
	console.error(error);
	process.exitCode = 1;
});