unprint 0.0.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.editorconfig +14 -0
- package/.eslintrc +20 -0
- package/README.md +75 -0
- package/package.json +11 -4
- package/src/app.js +561 -1
- package/tests/data.json +5 -0
- package/tests/index.html +32 -0
- package/tests/init.js +66 -0
package/.editorconfig
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# top-most EditorConfig file
|
|
2
|
+
root = true
|
|
3
|
+
|
|
4
|
+
# Unix-style newlines with a newline ending every file
|
|
5
|
+
[*]
|
|
6
|
+
end_of_line = lf
|
|
7
|
+
insert_final_newline = true
|
|
8
|
+
indent_style = tab
|
|
9
|
+
indent_size = 4
|
|
10
|
+
|
|
11
|
+
# Matches multiple files with brace expansion notation
|
|
12
|
+
# Set default charset
|
|
13
|
+
[*.js]
|
|
14
|
+
charset = utf-8
|
package/.eslintrc
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"extends": "airbnb-base",
|
|
3
|
+
"parserOptions": {
|
|
4
|
+
"sourceType": "script",
|
|
5
|
+
"ecmaVersion": 2020
|
|
6
|
+
},
|
|
7
|
+
"rules": {
|
|
8
|
+
"strict": 0,
|
|
9
|
+
"indent": "off",
|
|
10
|
+
"no-tabs": "off",
|
|
11
|
+
"no-unused-vars": ["error", {"argsIgnorePattern": "^_"}],
|
|
12
|
+
"no-console": 0,
|
|
13
|
+
"no-underscore-dangle": 0,
|
|
14
|
+
"prefer-destructuring": "off",
|
|
15
|
+
"template-curly-spacing": "off",
|
|
16
|
+
"object-curly-newline": "off",
|
|
17
|
+
"default-param-last": "off",
|
|
18
|
+
"max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}]
|
|
19
|
+
}
|
|
20
|
+
}
|
package/README.md
CHANGED
|
@@ -1 +1,76 @@
|
|
|
1
1
|
# unprint
|
|
2
|
+
unprint is a web scraping utility built around JSDOM, providing convenience methods for quickly extracting common data types.
|
|
3
|
+
|
|
4
|
+
## Install
|
|
5
|
+
`npm install unprint`
|
|
6
|
+
|
|
7
|
+
## Usage
|
|
8
|
+
`const unprint = require('unprint');`
|
|
9
|
+
|
|
10
|
+
### Querying
|
|
11
|
+
For optimal flexibility, unprint query methods can be used with or without initialization. If you already have access to DOM elements using another library or unprint instance, you can query it by using the uninitialized `query` methods provided directly from the library, and passing the element as the first argument, as such:
|
|
12
|
+
|
|
13
|
+
`unprint.query.element(element, 'h1#title')` // HTMLHeadingElement
|
|
14
|
+
|
|
15
|
+
Both `unprint.get()` and `unprint.init()` return its `query` methods pre-initialized, removing the element argument in favor of the element retrieved or received. Initialized query methods therefore will *not* accept a custom element, usually expecting the selector as the first argument instead.
|
|
16
|
+
|
|
17
|
+
```javascript
|
|
18
|
+
const result = await unprint.get('http://localhost:3101/html');
|
|
19
|
+
const { query } = result.context;
|
|
20
|
+
|
|
21
|
+
query.element('h1#title'); // HTMLHeadingElement
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
```javascript
|
|
25
|
+
const res = await fetch('http://localhost:3101/html');
|
|
26
|
+
const body = await res.text();
|
|
27
|
+
const { query } = await unprint.init(body);
|
|
28
|
+
|
|
29
|
+
query.element('h1#title'); // HTMLHeadingElement
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
**From here on, the query methods will be described in their initialized form.** The API for the *uninitialized* methods is identical, except for the element passed as the first argument.
|
|
33
|
+
|
|
34
|
+
#### Selector
|
|
35
|
+
The selector can be a CSS selector, an XPath selector starting with `//`, or an array of either or both acting as fallbacks. If the selector is falsy, the input element will be used.
|
|
36
|
+
|
|
37
|
+
#### Query an element
|
|
38
|
+
* `query.element([selector], [options])`
|
|
39
|
+
|
|
40
|
+
Returns the element node directly.
|
|
41
|
+
|
|
42
|
+
#### Query a date
|
|
43
|
+
`query.date(selector, format, [options])`
|
|
44
|
+
|
|
45
|
+
Arguments
|
|
46
|
+
* `format` (string, array): The input format as a string or array of strings, as described by the [Moment.js parsing docs](https://momentjs.com/docs/#/parsing/string-format/).
|
|
47
|
+
|
|
48
|
+
Options
|
|
49
|
+
* `match` (RegExp): The text to extract before attempting to parse it as a date. The default expression will attempt to extract any of 01-01-1970, 1970-01-01, 01/01/1970 or January 1, 1970 with optional 00:00[:00] time.
|
|
50
|
+
* `timezone` (string): The name of the input timezone, defaults to 'UTC'.
|
|
51
|
+
|
|
52
|
+
Returns a Date object.
|
|
53
|
+
|
|
54
|
+
#### Querying multiple elements
|
|
55
|
+
Most methods can be used in plural, returning an array of results, i.e. `query.elements()`, `query.dates()`.
|
|
56
|
+
|
|
57
|
+
### HTTP request
|
|
58
|
+
* `unprint.get(url, [options])`
|
|
59
|
+
* `unprint.post(url, body, [options])`
|
|
60
|
+
|
|
61
|
+
Options
|
|
62
|
+
* `select`: Pre-query and initialize a specific element on the page
|
|
63
|
+
* `selectAll`: Pre-query and initialize multiple specific elements on the page
|
|
64
|
+
|
|
65
|
+
Returns
|
|
66
|
+
```javascript
|
|
67
|
+
{
|
|
68
|
+
query, // (object) unprint querying methods
|
|
69
|
+
html, // (string) HTML body
|
|
70
|
+
data, // (object) parsed JSON response
|
|
71
|
+
status, // (number) HTTP status code
|
|
72
|
+
ok, // (boolean) status code >= 200 and < 300
|
|
73
|
+
response, // (object) the original axios response object, alias 'res'
|
|
74
|
+
res, // (object) alias for 'response'
|
|
75
|
+
}
|
|
76
|
+
```
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unprint",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Simplify common web scraping tasks while staying in control of the data.",
|
|
5
5
|
"main": "src/app.js",
|
|
6
6
|
"scripts": {
|
|
@@ -22,8 +22,15 @@
|
|
|
22
22
|
},
|
|
23
23
|
"homepage": "https://github.com/ThePendulum/unprint#readme",
|
|
24
24
|
"dependencies": {
|
|
25
|
-
"
|
|
26
|
-
"
|
|
27
|
-
"
|
|
25
|
+
"axios": "^0.27.2",
|
|
26
|
+
"bottleneck": "^2.19.5",
|
|
27
|
+
"eslint": "^8.17.0",
|
|
28
|
+
"eslint-config-airbnb": "^19.0.4",
|
|
29
|
+
"eslint-config-airbnb-base": "^15.0.0",
|
|
30
|
+
"jsdom": "^17.0.0",
|
|
31
|
+
"moment-timezone": "^0.5.34"
|
|
32
|
+
},
|
|
33
|
+
"devDependencies": {
|
|
34
|
+
"express": "^4.18.1"
|
|
28
35
|
}
|
|
29
36
|
}
|
package/src/app.js
CHANGED
|
@@ -1,3 +1,563 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
const { JSDOM, VirtualConsole } = require('jsdom');
|
|
4
|
+
const axios = require('axios').default;
|
|
5
|
+
const moment = require('moment-timezone');
|
|
6
|
+
|
|
7
|
+
const settings = {
|
|
8
|
+
throwErrors: false,
|
|
9
|
+
logErrors: true,
|
|
10
|
+
requestTimeout: 30000,
|
|
11
|
+
};
|
|
12
|
+
|
|
13
|
+
const virtualConsole = new VirtualConsole();
|
|
14
|
+
const { window: globalWindow } = new JSDOM('', { virtualConsole });
|
|
15
|
+
|
|
16
|
+
/**
 * Central error sink: logs and/or throws depending on the module-level `settings`.
 *
 * @param {Error|*} error - The error to report. Non-object values (e.g. a plain
 *   message string) are wrapped in an Error so that `.message` is defined and the
 *   thrown value is a real Error.
 * @param {string} code - Short machine-readable code, attached to the error as `code` when thrown.
 * @returns {null} Always null when `settings.throwErrors` is disabled.
 * @throws {Error} The (possibly wrapped) error, extended with `code`, when `settings.throwErrors` is enabled.
 */
function handleError(error, code) {
	// NOTE(review): the virtual console listeners in this file name their argument `message`,
	// so presumably plain strings can arrive here as well as Error objects.
	const normalizedError = (error !== null && typeof error === 'object')
		? error
		: new Error(String(error));

	if (settings.logErrors) {
		console.error(`unprint encountered an error (${code}): ${normalizedError.message}`);
	}

	if (settings.throwErrors) {
		throw Object.assign(normalizedError, { code });
	}

	return null;
}
|
|
27
|
+
|
|
28
|
+
virtualConsole.on('error', (message) => handleError(message, 'JSDOM'));
|
|
29
|
+
virtualConsole.on('jsdomError', (message) => handleError(message, 'JSDOM'));
|
|
30
|
+
|
|
31
|
+
const defaultOptions = {
|
|
32
|
+
trim: true,
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
/**
 * Strip leading/trailing whitespace and collapse every internal whitespace run to a single space.
 * Non-string values are returned untouched.
 *
 * @param {*} string - Value to normalize.
 * @returns {*} The normalized string, or the input itself when it is not a string.
 */
function trim(string) {
	return typeof string === 'string'
		? string.trim().replace(/\s+/g, ' ')
		: string;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Drain an XPath result iterator into an array.
 *
 * Implemented as a loop rather than recursion so that large result sets neither
 * copy the accumulator on every step nor grow the call stack per node.
 *
 * @param {{ iterateNext: function(): * }} iterator - Object whose `iterateNext()` yields the next node, or a falsy value when exhausted.
 * @param {Array} [results=[]] - Nodes to place in front of the iterated ones.
 * @returns {Array} `results` followed by every node the iterator produced, in order.
 */
function iterateXpathResult(iterator, results = []) {
	const collected = [...results];
	let node = iterator.iterateNext();

	while (node) {
		collected.push(node);
		node = iterator.iterateNext();
	}

	return collected;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Resolve a single selector against the context element.
 *
 * A selector starting with `//` is evaluated as XPath through the shared global
 * window; anything else is treated as a CSS selector. A falsy selector yields the
 * context element itself.
 *
 * @param {{ element: * }} context - unprint context holding the element to search in.
 * @param {string} [selector] - CSS selector or XPath expression starting with `//`.
 * @param {boolean} [firstOnly=false] - Return only the first match instead of all matches.
 * @returns {*} The context element (no selector), the first match (`firstOnly`), or an array of matches.
 */
function getElements(context, selector, firstOnly = false) {
	const { element } = context;

	if (!selector) {
		return element;
	}

	const isXpath = /^\/\//.test(selector);

	if (isXpath) {
		const iterator = globalWindow.document.evaluate(selector, element, null, globalWindow.XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);

		return firstOnly
			? iterator.iterateNext()
			: iterateXpathResult(iterator);
	}

	return firstOnly
		? element.querySelector(selector)
		: Array.from(element.querySelectorAll(selector));
}
|
|
75
|
+
|
|
76
|
+
/**
 * Find the first element matching any of the given selectors, tried in order.
 *
 * @param {{ element: * }} context - unprint context to search in.
 * @param {string|string[]} [selectors] - One selector or an ordered list of fallbacks.
 * @param {object} [_customOptions] - Accepted for signature parity with the other query methods; unused.
 * @returns {*} The first match, or null. A document context without selectors also yields null.
 */
function queryElement(context, selectors, _customOptions) {
	const isBareDocument = !selectors && context.element.nodeName === '#document';

	if (isBareDocument) {
		return null;
	}

	let target = null;

	// stop at the first selector that produces a match
	[].concat(selectors).some((selector) => {
		target = getElements(context, selector, true);

		return Boolean(target);
	});

	return target || null;
}
|
|
85
|
+
|
|
86
|
+
/**
 * Find all elements matching the first selector that yields any result.
 *
 * Selectors are tried in order; a selector producing an empty list falls through
 * to the next one (an empty array is truthy, so it must be checked by length).
 *
 * @param {{ element: * }} context - unprint context to search in.
 * @param {string|string[]} [selectors] - One selector or an ordered list of fallbacks.
 * @param {object} [_customOptions] - Accepted for signature parity with the other query methods; unused.
 * @returns {Array|*} Matches of the first productive selector, or an empty array.
 *   Without selectors the bare context element is returned (not wrapped in an array).
 */
function queryElements(context, selectors, _customOptions) {
	if (!selectors) {
		return context.element;
	}

	let targets = [];

	[].concat(selectors).some((selector) => {
		const found = getElements(context, selector, false);
		// a falsy selector inside the list makes getElements return the element itself rather than an array
		const hasResults = Array.isArray(found) ? found.length > 0 : Boolean(found);

		if (hasResults) {
			targets = found;
		}

		return hasResults;
	});

	return targets;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Tell whether the selector matches anything in the context.
 *
 * @param {object} context - unprint context to search in.
 * @param {string|string[]} [selector] - Selector(s) forwarded to queryElement.
 * @param {object} [customOptions] - Options forwarded to queryElement.
 * @returns {boolean} True when queryElement returns a truthy result.
 */
function queryExistence(context, selector, customOptions) {
	const match = queryElement(context, selector, customOptions);

	return Boolean(match);
}
|
|
99
|
+
|
|
100
|
+
/**
 * Count the elements matched by the selector.
 *
 * @param {object} context - unprint context to search in.
 * @param {string|string[]} [selector] - Selector(s) forwarded to queryElements.
 * @param {object} [customOptions] - Options forwarded to queryElements.
 * @returns {number} The `length` of the queryElements result, or 0 when it has none.
 */
function queryCount(context, selector, customOptions) {
	const targets = queryElements(context, selector, customOptions);

	return targets?.length || 0;
}
|
|
103
|
+
|
|
104
|
+
/**
 * Read the attribute name requested through the options.
 *
 * `attr` takes precedence over `attribute`; a key counts as soon as it is an own
 * property of the options, whatever its value.
 *
 * @param {object} [options] - Query options.
 * @returns {*} The value stored under `attr` or `attribute`, or null when neither is set.
 */
function getAttributeKey(options) {
	if (!options) {
		return null;
	}

	const ownKey = ['attr', 'attribute'].find((key) => Object.hasOwn(options, key));

	return ownKey ? options[ownKey] : null;
}
|
|
119
|
+
|
|
120
|
+
/**
 * Extract text or an attribute value from an element.
 *
 * When the options name an attribute (see getAttributeKey), the element property
 * of that name is read first, falling back to `getAttribute()`. Otherwise the
 * element's `textContent` is used. Values are whitespace-normalized only when
 * `options.trim` is truthy.
 *
 * @param {*} element - Element to read from; a falsy value yields null.
 * @param {object} [options] - Query options (`attr`/`attribute`, `trim`). May be omitted.
 * @returns {*} The extracted value, or null when there is no element.
 */
function extractContent(element, options) {
	if (!element) {
		return null;
	}

	// options may be omitted entirely; getAttributeKey tolerates that, so the trim lookup must too
	const shouldTrim = Boolean(options?.trim);
	const attributeKey = getAttributeKey(options);

	if (attributeKey) {
		// handle attribute extraction in content method so all methods can easily optionally query a specific attribute
		const attribute = element[attributeKey] || element.getAttribute(attributeKey);

		return attribute && shouldTrim
			? trim(attribute)
			: attribute;
	}

	return shouldTrim
		? trim(element.textContent)
		: element.textContent;
}
|
|
144
|
+
|
|
145
|
+
/**
 * Query one element and extract its content.
 *
 * @param {object} context - unprint context; its `options` act as defaults.
 * @param {string|string[]} [selector] - Selector(s) forwarded to queryElement.
 * @param {object} [customOptions] - Per-call options, overriding the context options.
 * @returns {*} Whatever extractContent returns for the matched element.
 */
function queryContent(context, selector, customOptions) {
	const mergedOptions = { ...context.options, ...customOptions };

	return extractContent(queryElement(context, selector, mergedOptions), mergedOptions);
}
|
|
151
|
+
|
|
152
|
+
/**
 * Query multiple elements and extract the content of each, dropping empty results.
 *
 * @param {object} context - unprint context; its `options` act as defaults.
 * @param {string|string[]} [selector] - Selector(s) forwarded to queryElements.
 * @param {object} [customOptions] - Per-call options, overriding the context options.
 * @returns {Array} Truthy extractContent results, in document order.
 */
function queryContents(context, selector, customOptions) {
	const options = { ...context.options, ...customOptions };
	// queryElements hands back the bare context element for a falsy selector; normalize to an array so map() is safe
	const targets = [].concat(queryElements(context, selector, options));

	return targets.map((target) => extractContent(target, options)).filter(Boolean);
}
|
|
158
|
+
|
|
159
|
+
/**
 * Query one element and read the named attribute from it.
 *
 * @param {object} context - unprint context to search in.
 * @param {string|string[]} [selector] - Selector(s) forwarded to queryContent.
 * @param {string} attribute - Attribute name, passed on as `options.attribute`.
 * @param {object} [customOptions] - Additional per-call options.
 * @returns {*} Whatever queryContent returns.
 */
function queryAttribute(context, selector, attribute, customOptions) {
	const attributeOptions = { ...customOptions, attribute };

	return queryContent(context, selector, attributeOptions);
}
|
|
165
|
+
|
|
166
|
+
/**
 * Query multiple elements and read the named attribute from each.
 *
 * @param {object} context - unprint context to search in.
 * @param {string|string[]} [selector] - Selector(s) forwarded to queryContents.
 * @param {string} attribute - Attribute name, passed on as `options.attribute`.
 * @param {object} [customOptions] - Additional per-call options.
 * @returns {*} Whatever queryContents returns.
 */
function queryAttributes(context, selector, attribute, customOptions) {
	const attributeOptions = { ...customOptions, attribute };

	return queryContents(context, selector, attributeOptions);
}
|
|
172
|
+
|
|
173
|
+
/**
 * Query one element and return its whitespace-normalized inner HTML.
 *
 * @param {object} context - unprint context to search in.
 * @param {string|string[]} [selector] - Selector(s) forwarded to queryElement.
 * @param {object} [customOptions] - Options forwarded to queryElement.
 * @returns {string|null} The trimmed `innerHTML`, or null when nothing matched.
 */
function queryHtml(context, selector, customOptions) {
	const target = queryElement(context, selector, customOptions);

	return target
		? trim(target.innerHTML)
		: null;
}
|
|
182
|
+
|
|
183
|
+
/**
 * Query multiple elements and return the whitespace-normalized inner HTML of each.
 *
 * @param {object} context - unprint context to search in.
 * @param {string|string[]} [selector] - Selector(s) forwarded to queryElements.
 * @param {object} [customOptions] - Options forwarded to queryElements.
 * @returns {Array} One trimmed `innerHTML` value per matched element.
 */
function queryHtmls(context, selector, customOptions) {
	// queryElements hands back the bare context element for a falsy selector; normalize to an array so map() is safe
	const targets = [].concat(queryElements(context, selector, customOptions));

	return targets.map((target) => trim(target.innerHTML));
}
|
|
188
|
+
|
|
189
|
+
/**
 * Turn a possibly relative URL into an absolute one, based on the page it came from.
 *
 * Resolution rules, in order:
 *  - anything starting with a URI scheme (`http:`, `https:`, `data:`, `mailto:`, ...) is returned untouched;
 *  - `//host/path` gets `options.protocol`, or the origin URL's protocol when that option is falsy;
 *  - `/path` is appended to the origin (scheme + host + port) of `originUrl`;
 *  - `./path` is appended to the full `originUrl`, minus trailing slashes;
 *  - anything else is appended to the origin after a slash.
 *
 * @param {string} urlPath - URL or path as found in the document.
 * @param {string} [originUrl] - URL of the page the path was found on.
 * @param {object} [customOptions] - Options; `protocol` ('https' or 'https:', default 'https') is applied to protocol-relative URLs.
 * @returns {string|null} The absolute URL; `urlPath` unchanged when no origin is known; null when `urlPath` is falsy.
 * @throws {TypeError} When `originUrl` is needed but cannot be parsed by `URL`.
 */
function prefixUrl(urlPath, originUrl, customOptions) {
	if (!urlPath) {
		return null;
	}

	if (!originUrl) {
		return urlPath;
	}

	// match a real scheme rather than a bare 'http' prefix, so 'httpdocs/x' is still
	// resolved and data:/mailto: URLs are not glued onto the origin
	if (/^[a-z][a-z\d+.-]*:/i.test(urlPath)) {
		return urlPath;
	}

	const options = {
		protocol: 'https',
		...customOptions,
	};

	const { origin, protocol } = new URL(originUrl);

	if (/^\/\//.test(urlPath)) {
		if (options.protocol) {
			return `${options.protocol.replace(/:$/, '')}:${urlPath}`; // allow protocol to be defined either as 'https' or 'https:'
		}

		// URL#protocol already carries the trailing colon
		return `${protocol}${urlPath}`;
	}

	if (/^\//.test(urlPath)) {
		return `${origin}${urlPath}`;
	}

	if (/^\.\//.test(urlPath)) {
		return `${originUrl.replace(/\/+$/, '')}${urlPath.slice(1)}`;
	}

	return `${origin}/${urlPath}`;
}
|
|
228
|
+
|
|
229
|
+
/**
 * Query one element and return its URL as an absolute URL.
 *
 * Reads the `href` attribute unless another attribute is requested, and resolves
 * it against `options.origin`. The merged options (context defaults plus per-call
 * options) are handed to prefixUrl, matching what queryImage does, so a `protocol`
 * set on the context is honoured here too.
 *
 * @param {object} context - unprint context; its `options` act as defaults.
 * @param {string|string[]} [selector='a'] - Selector(s) forwarded to queryContent.
 * @param {object} [customOptions] - Per-call options, overriding the context options.
 * @returns {string|null} Whatever prefixUrl returns for the extracted value.
 */
function queryUrl(context, selector = 'a', customOptions) {
	const options = {
		...context.options,
		attribute: 'href',
		...customOptions,
	};

	const url = queryContent(context, selector, options);

	return prefixUrl(url, options.origin, options);
}
|
|
241
|
+
|
|
242
|
+
/**
 * Read the image URL of one element.
 *
 * An attribute named in the options wins; otherwise `data-src` is tried first and
 * `src` second.
 *
 * @param {object} context - unprint context to search in.
 * @param {string|string[]} selector - Selector(s) forwarded to queryAttribute.
 * @param {object} [options] - Query options, forwarded to queryAttribute.
 * @returns {*} The first truthy attribute value found, else the result of the `src` lookup.
 */
function getImageUrl(context, selector, options) {
	const attributeKey = getAttributeKey(options);

	return attributeKey
		? queryAttribute(context, selector, attributeKey, options)
		: (queryAttribute(context, selector, 'data-src', options) || queryAttribute(context, selector, 'src', options));
}
|
|
252
|
+
|
|
253
|
+
/**
 * Read the image URLs of multiple elements.
 *
 * An attribute named in the options wins; otherwise the `data-src` values are
 * used when there are any, falling back to the `src` values.
 *
 * @param {object} context - unprint context to search in.
 * @param {string|string[]} selector - Selector(s) forwarded to queryAttributes.
 * @param {object} [options] - Query options, forwarded to queryAttributes.
 * @returns {Array} The attribute values returned by queryAttributes.
 */
function getImageUrls(context, selector, options) {
	const attributeKey = getAttributeKey(options);

	if (attributeKey) {
		return queryAttributes(context, selector, attributeKey, options);
	}

	const dataLinks = queryAttributes(context, selector, 'data-src', options);

	// fixed: this previously read the misspelled `lenght`, so data-src results were never used
	if (dataLinks.length > 0) {
		return dataLinks;
	}

	return queryAttributes(context, selector, 'src', options);
}
|
|
268
|
+
|
|
269
|
+
/**
 * Query one image element and return its source as an absolute URL.
 *
 * @param {object} context - unprint context; its `options` act as defaults.
 * @param {string|string[]} [selector='img'] - Selector(s) forwarded to getImageUrl.
 * @param {object} [customOptions] - Per-call options, overriding the context options.
 * @returns {string|null} Whatever prefixUrl returns for the image URL, resolved against `options.origin`.
 */
function queryImage(context, selector = 'img', customOptions) {
	const options = { ...context.options, ...customOptions };

	return prefixUrl(getImageUrl(context, selector, options), options.origin, options);
}
|
|
279
|
+
|
|
280
|
+
/**
 * Query multiple image elements and return their sources as absolute URLs.
 *
 * @param {object} context - unprint context; its `options` act as defaults.
 * @param {string|string[]} [selector='img'] - Selector(s) forwarded to getImageUrls.
 * @param {object} [customOptions] - Per-call options, overriding the context options.
 * @returns {Array} One prefixUrl result per image URL, resolved against `options.origin`.
 */
function queryImages(context, selector = 'img', customOptions) {
	const options = { ...context.options, ...customOptions };

	return getImageUrls(context, selector, options)
		.map((imageUrl) => prefixUrl(imageUrl, options.origin, options));
}
|
|
290
|
+
|
|
291
|
+
/**
 * Parse the inner HTML of an element as JSON.
 *
 * Parsing is best-effort: malformed JSON is not reported, it simply yields null.
 *
 * @param {{ innerHTML: string }} [element] - Element holding the JSON text.
 * @returns {*} The parsed value, or null when there is no element or parsing fails.
 */
function extractJson(element) {
	let parsed = null;

	if (element) {
		try {
			parsed = JSON.parse(element.innerHTML);
		} catch (error) {
			parsed = null;
		}
	}

	return parsed;
}
|
|
302
|
+
|
|
303
|
+
/**
 * Query one element and parse its contents as JSON.
 *
 * @param {object} context - unprint context to search in.
 * @param {string|string[]} [selector] - Selector(s) forwarded to queryElement.
 * @param {object} [customOptions] - Options forwarded to queryElement.
 * @returns {*} Whatever extractJson returns for the matched element.
 */
function queryJson(context, selector, customOptions) {
	return extractJson(queryElement(context, selector, customOptions));
}
|
|
308
|
+
|
|
309
|
+
/**
 * Query multiple elements and parse the contents of each as JSON, dropping failures.
 *
 * @param {object} context - unprint context to search in.
 * @param {string|string[]} [selector] - Selector(s) forwarded to queryElements.
 * @param {object} [customOptions] - Options forwarded to queryElements.
 * @returns {Array} The truthy extractJson results.
 */
function queryJsons(context, selector, customOptions) {
	// queryElements hands back the bare context element for a falsy selector; normalize to an array so map() is safe
	const targets = [].concat(queryElements(context, selector, customOptions));

	return targets.map((target) => extractJson(target)).filter(Boolean);
}
|
|
314
|
+
|
|
315
|
+
/**
 * Extract a date from free-form text.
 *
 * The text is whitespace-normalized, optionally narrowed down to the first match
 * of `options.match`, and then parsed with `moment.tz` using the given format and
 * timezone.
 *
 * @param {string} dateString - Text containing the date.
 * @param {string|string[]} format - Input format(s) handed to moment.
 * @param {object} [customOptions] - Options.
 * @param {RegExp|null} [customOptions.match] - Expression isolating the date; a falsy value parses the whole text.
 *   Defaults to an expression matching any of 01-01-1970, 1970-01-01 and January 1, 1970 with optional 00:00[:00] time.
 * @param {string} [customOptions.timezone='UTC'] - Timezone of the input.
 * @returns {Date|null} The parsed date; null when the text is empty, nothing matched or the date is invalid.
 *   When `format` is missing, the result of handleError is returned.
 */
function extractDate(dateString, format, customOptions) {
	if (!dateString) {
		return null;
	}

	if (!format) {
		return handleError(new Error('Missing required date format parameter'), 'NO_DATE_FORMAT');
	}

	const { match, timezone } = {
		match: /((\d{1,4}[/-]\d{1,2}[/-]\d{1,4})|(\w+\s+\d{1,2},?\s+\d{4}))(\s+\d{1,2}:\d{2}(:\d{2})?)?/g, // matches any of 01-01-1970, 1970-01-01 and January 1, 1970 with optional 00:00[:00] time
		timezone: 'UTC',
		...customOptions,
	};

	const cleaned = trim(dateString);
	const candidate = match ? cleaned.match(match) : cleaned;

	if (!candidate) {
		return null;
	}

	// with an expression the candidate is a match array; its first entry is the full matched text
	const dateValue = moment.tz(match ? candidate[0] : candidate, format, timezone);

	return dateValue.isValid()
		? dateValue.toDate()
		: null;
}
|
|
344
|
+
|
|
345
|
+
/**
 * Query one element and parse its content as a date.
 *
 * @param {object} context - unprint context; its `options` act as defaults for the date extraction.
 * @param {string|string[]} [selector] - Selector(s) forwarded to queryContent.
 * @param {string|string[]} format - Input format(s) forwarded to extractDate.
 * @param {object} [customOptions] - Per-call options, overriding the context options.
 * @returns {*} Whatever extractDate returns.
 */
function queryDate(context, selector, format, customOptions) {
	const dateOptions = { ...context.options, ...customOptions };
	const dateString = queryContent(context, selector, customOptions);

	return extractDate(dateString, format, dateOptions);
}
|
|
353
|
+
|
|
354
|
+
/**
 * Query multiple elements and parse the content of each as a date.
 *
 * @param {object} context - unprint context; its `options` act as defaults for the date extraction.
 * @param {string|string[]} [selector] - Selector(s) forwarded to queryContents.
 * @param {string|string[]} format - Input format(s) forwarded to extractDate.
 * @param {object} [customOptions] - Per-call options (e.g. `match`, `timezone`), overriding the context options.
 * @returns {Array} One extractDate result per extracted string.
 */
function queryDates(context, selector, format, customOptions) {
	const dateStrings = queryContents(context, selector, customOptions);

	// fixed: the per-call options are now spread in, as queryDate does; they used to be nested
	// under a `customOptions` key, so `match` and `timezone` never reached extractDate
	const dateOptions = {
		...context.options,
		...customOptions,
	};

	return dateStrings.map((dateString) => extractDate(dateString, format, dateOptions));
}
|
|
362
|
+
|
|
363
|
+
// Public name -> implementation table for every query method, including the short aliases.
// initQueryFns walks these entries, so the key order here is the key order of the exposed `query` object.
const queryFns = {
	// elements
	element: queryElement,
	elements: queryElements,
	el: queryElement,
	els: queryElements,
	all: queryElements,

	// text content and attributes
	content: queryContent,
	contents: queryContents,
	attribute: queryAttribute,
	attributes: queryAttributes,
	attr: queryAttribute,
	attrs: queryAttributes,

	// presence
	exists: queryExistence,
	count: queryCount,

	// markup
	html: queryHtml,
	htmls: queryHtmls,

	// images
	image: queryImage,
	images: queryImages,
	img: queryImage,
	imgs: queryImages,

	// structured data
	json: queryJson,
	jsons: queryJsons,
	date: queryDate,
	dates: queryDates,

	// links
	url: queryUrl,
};
|
|
389
|
+
|
|
390
|
+
/**
 * Duck-type check for DOM nodes: anything truthy that exposes a `nodeType`.
 *
 * @param {*} element - Candidate value.
 * @returns {boolean} True when the value is truthy and has a defined `nodeType`.
 */
function isDomObject(element) {
	return Boolean(element) && typeof element.nodeType !== 'undefined';
}
|
|
397
|
+
|
|
398
|
+
/**
 * Wrap every query function so callers do not have to build a context themselves.
 *
 * With a context, each wrapper prepends that context to its arguments. Without
 * one, each wrapper inspects its first argument: an unprint context is passed
 * through as-is, a DOM node is wrapped in a minimal context, and anything else is
 * reported through handleError.
 *
 * @param {Object<string, Function>} fns - Query functions taking a context as first argument.
 * @param {object} [context] - Context to bind; omit to get the uninitialized variants.
 * @returns {Object<string, Function>} The wrapped functions under the same keys.
 */
function initQueryFns(fns, context) {
	const wrap = context
		? (fn) => (...args) => fn(context, ...args)
		: (fn) => (...args) => {
			const [subject, ...rest] = args;

			// first argument is already an unprint context. this seems like a convoluted approach, but there is little reason not to allow it
			if (subject?.isUnprint) {
				return fn(...args);
			}

			// most common usage is to pass an element directly, convert to context
			if (isDomObject(subject)) {
				return fn({
					element: subject,
					html: subject.outerHTML || subject.body?.outerHTML,
					isUnprint: true,
				}, ...rest);
			}

			return handleError(new Error('Context is not provided or initialized'), 'INVALID_CONTEXT');
		};

	return Object.fromEntries(Object.entries(fns).map(([key, fn]) => [key, wrap(fn)]));
}
|
|
424
|
+
|
|
425
|
+
/**
 * Create an unprint context for an element or an HTML string.
 *
 * An HTML string is parsed with JSDOM (extra constructor options can be given as
 * `options.parser`) and the resulting document is initialized. For a DOM node the
 * optional selector narrows the context down to the first matching descendant.
 *
 * @param {string|object} elementOrHtml - Raw HTML, or a DOM node (anything exposing `nodeType`).
 * @param {string} [selector] - CSS selector passed to `querySelector()` on the node.
 * @param {object} [options] - Context options, stored on the context as `options`. May be omitted.
 * @returns {object|null} Context with `element`, `html`, `options`, `isUnprint`, `query`, plus
 *   `window`/`document` when a window is known. Null when the input is falsy or the selector
 *   matches nothing. For input that is neither HTML nor a DOM node, the result of handleError.
 */
function init(elementOrHtml, selector, options) {
	if (!elementOrHtml) {
		return null;
	}

	// options is optional (e.g. `init(html)`); without this fallback the property reads below would throw
	const contextOptions = options ?? {};

	if (typeof elementOrHtml === 'string') {
		// the context should be raw HTML
		const { window } = new JSDOM(elementOrHtml, { virtualConsole, ...contextOptions.parser });

		return init(window.document, selector, { ...contextOptions, window });
	}

	if (!isDomObject(elementOrHtml)) {
		// the context is not a valid DOM node
		return handleError(new Error('Init context is not a DOM element, HTML or an array'), 'INVALID_CONTEXT');
	}

	const element = selector
		? elementOrHtml.querySelector(selector)
		: elementOrHtml;

	if (!element) {
		return null;
	}

	const context = {
		element,
		// the fallback to `body.outerHTML` is used when the element itself has no `outerHTML`
		html: element.outerHTML || element.body?.outerHTML,
		...(contextOptions.window && {
			window: contextOptions.window,
			document: contextOptions.window.document,
		}),
		options: contextOptions,
		isUnprint: true,
	};

	context.query = initQueryFns(queryFns, context);

	return context;
}
|
|
465
|
+
|
|
466
|
+
/**
 * Create one unprint context per element.
 *
 *  - Array input: every entry is initialized individually with the given selector.
 *  - HTML string: parsed with JSDOM (extra constructor options via `options.parser`), then handled as a DOM node.
 *  - DOM node: every descendant matching `selector` becomes its own context.
 *    When `options.select` is set, the earlier behaviour is kept: each element
 *    matching `options.select` is initialized with `selector` applied inside it.
 *
 * @param {Array|string|object} context - Elements, raw HTML, or a DOM node (anything exposing `nodeType`).
 * @param {string} [selector] - CSS selector for the elements to initialize.
 * @param {object} [options] - Context options handed to init. May be omitted.
 * @returns {Array|null} The created contexts (entries are null where init found nothing).
 *   For unsupported input, the result of handleError.
 */
function initAll(context, selector, options) {
	// options is optional; without this fallback the property reads below would throw
	const contextOptions = options ?? {};

	if (Array.isArray(context)) {
		return context.map((element) => init(element, selector, contextOptions));
	}

	if (typeof context === 'string') {
		// the context should be raw HTML
		const { window } = new JSDOM(context, { virtualConsole, ...contextOptions.parser });

		return initAll(window.document, selector, { ...contextOptions, window });
	}

	// duck-typed check: an `instanceof HTMLElement` test against the shared global window rejects
	// the Document produced by the string branch above, which belongs to a different window
	if (!isDomObject(context)) {
		// the context is not a valid DOM node
		return handleError(new Error('Init context is not a DOM element, HTML or an array'), 'INVALID_CONTEXT');
	}

	if (contextOptions.select) {
		return Array.from(context.querySelectorAll(contextOptions.select))
			.map((element) => init(element, selector, contextOptions));
	}

	if (!selector) {
		return [];
	}

	// each match is already the target element, so no further selector is applied by init
	return Array.from(context.querySelectorAll(selector))
		.map((element) => init(element, null, contextOptions));
}
|
|
486
|
+
|
|
487
|
+
/**
 * Perform an HTTP request with axios and, for successful non-JSON responses,
 * initialize an unprint context on the response body.
 *
 * @param {string} url - URL to request; also used as `origin` for the created context.
 * @param {*} body - Request body, sent as axios `data`.
 * @param {object} [customOptions={}] - Options. Recognized here: `timeout` (ms, default 1000),
 *   `extract` (default true; false skips context creation), `select`, `selectAll`, `abortSignal`.
 *   All options are also spread into the axios request config.
 * @param {string} [method='GET'] - HTTP method.
 * @returns {Promise<object|number>}
 *   - status outside 200-299: the error is reported through handleError and the numeric status code is returned;
 *   - JSON response (content type includes `application/json` and axios delivered a non-string body): `{ ok, status, statusText, response, res, data }`;
 *   - `extract` disabled: `{ ok, status, statusText, response, res }`;
 *   - otherwise the same object plus `context` (from initAll when `selectAll` is set, else init).
 */
async function request(url, body, customOptions = {}, method = 'GET') {
	// NOTE(review): `settings.requestTimeout` is defined in this file but not consulted here — confirm which default is intended
	const options = {
		timeout: 1000,
		extract: true,
		url,
		...customOptions,
	};

	const res = await axios({
		url,
		method,
		data: body,
		validateStatus: null, // never reject on status; non-OK responses are handled below
		timeout: options.timeout,
		signal: options.abortSignal,
		...options,
	});

	if (!(res.status >= 200 && res.status < 300)) {
		handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${res.data}`), 'HTTP_NOT_OK');

		return res.status;
	}

	const base = {
		ok: true,
		status: res.status,
		statusText: res.statusText,
		response: res,
		res,
	};

	// a response without a content-type header must not crash the lookup
	const contentType = String(res.headers?.['content-type'] ?? '');

	if (contentType.includes('application/json') && typeof res.data === 'object') {
		return {
			...base,
			data: res.data,
		};
	}

	if (!options.extract) {
		return base;
	}

	const contextOptions = {
		...defaultOptions,
		origin: url,
	};

	const context = options.selectAll
		? initAll(res.data, options.selectAll, contextOptions)
		: init(res.data, options.select, contextOptions);

	return {
		...base,
		context,
	};
}
|
|
544
|
+
|
|
545
|
+
/**
 * Perform a GET request; shorthand for `request()` without a body.
 *
 * @param {string} url - URL to request.
 * @param {object} [options] - Request options, see `request()`.
 * @returns {Promise<*>} Whatever `request()` resolves to.
 */
async function get(url, options) {
	const body = null;

	return request(url, body, options, 'GET');
}
|
|
548
|
+
|
|
549
|
+
/**
 * Perform a POST request; shorthand for `request()` with the method preset.
 *
 * @param {string} url - URL to request.
 * @param {*} body - Request body.
 * @param {object} [options] - Request options, see `request()`.
 * @returns {Promise<*>} Whatever `request()` resolves to.
 */
async function post(url, body, options) {
	const method = 'POST';

	return request(url, body, options, method);
}
|
|
552
|
+
|
|
553
|
+
module.exports = {
|
|
554
|
+
get,
|
|
555
|
+
post,
|
|
556
|
+
request,
|
|
557
|
+
initialize: init,
|
|
558
|
+
initializeAll: initAll,
|
|
559
|
+
init,
|
|
560
|
+
initAll,
|
|
561
|
+
extractDate,
|
|
562
|
+
query: initQueryFns(queryFns),
|
|
563
|
+
};
|
package/tests/data.json
ADDED
package/tests/index.html
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<title>Test page</title>
|
|
6
|
+
</head>
|
|
7
|
+
|
|
8
|
+
<body>
|
|
9
|
+
<h1 id="title">Test page</h1>
|
|
10
|
+
|
|
11
|
+
<ul id="items">
|
|
12
|
+
<li class="item">Item 1</li>
|
|
13
|
+
<li class="item">Item 2</li>
|
|
14
|
+
<li class="item">Item 3</li>
|
|
15
|
+
</ul>
|
|
16
|
+
|
|
17
|
+
<a id="link" href="http://localhost:3101/html">Get HTML</a>
|
|
18
|
+
<a id="path" href="/json">Get data</a>
|
|
19
|
+
<a id="relativePath" href="./json">Get data</a>
|
|
20
|
+
|
|
21
|
+
<div id="date">Date: 22-07-2022 02:00</div>
|
|
22
|
+
<div id="date2">Date: 13-05-2022 18:00</div>
|
|
23
|
+
|
|
24
|
+
<img class="image" src="https://i.redd.it/vn9h981hlx281.png">
|
|
25
|
+
<img class="image" src="https://i.redd.it/1s22dsrqy0181.jpg">
|
|
26
|
+
<img class="image" src="https://i.redd.it/e91oo4ueyeb71.jpg">
|
|
27
|
+
|
|
28
|
+
<video id="video"><source src="https://i.imgur.com/eDQmLys.mp4"></video>
|
|
29
|
+
|
|
30
|
+
<script id="json" type="application/js">{"foo": "bar", "lorem": "ipsum", "hello": "world"}</script>
|
|
31
|
+
</body>
|
|
32
|
+
</html>
|
package/tests/init.js
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const path = require('path');
|
|
4
|
+
const express = require('express');
|
|
5
|
+
// const unprint = require('unprint');
|
|
6
|
+
|
|
7
|
+
const unprint = require('../src/app');
|
|
8
|
+
const data = require('./data.json');
|
|
9
|
+
|
|
10
|
+
const port = process.env.PORT || 3101;
|
|
11
|
+
|
|
12
|
+
/**
 * Smoke test: fetch the fixture page from the local test server and log the
 * result of each query method for manual inspection.
 */
async function initTest() {
	const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body' });
	const { query } = res.context;

	console.log('title', query.content('//*[contains(text(), "Test")]'));
	console.log('date', query.date('#date', 'DD-MM-YYYY HH:mm'));
	console.log('data', query.json('#json'));
	console.log('items', query.contents('.item'));
	console.log('link', query.url('#link'));
	console.log('image', query.img('.image'));
	console.log('images', query.imgs('.image'));
	console.log('path', query.url('#path'));
	console.log('relative path', query.url('#relativePath'));
	console.log('exists', query.exists('#title'));
	console.log('count', query.count('.item'), query.count('.foo'));
}
|
|
29
|
+
|
|
30
|
+
/**
 * Start the express fixture server and run the smoke test once it is listening.
 *
 * Routes:
 *  - `GET /html`        serves tests/index.html
 *  - `GET /json`        serves tests/data.json
 *  - `GET /error/:code` responds with the given status code and an empty body
 * Any request can be slowed down with a `?delay=<ms>` query parameter.
 */
async function initServer() {
	const app = express();

	// optional artificial latency
	app.use((req, _res, next) => {
		if (!req.query.delay) {
			next();
			return;
		}

		setTimeout(() => next(), req.query.delay);
	});

	app.get('/html', (_req, res) => {
		res.sendFile(path.resolve(__dirname, 'index.html'));
	});

	app.get('/json', (_req, res) => {
		res.send(data);
	});

	app.get('/error/:code', (req, res) => {
		const statusCode = Number(req.params.code);

		res.status(statusCode).send();
	});

	const server = app.listen(port, async () => {
		const { address } = server.address();

		console.log(`Test server listening on ${address}:${port}`);

		await initTest();
	});
}
|
|
65
|
+
|
|
66
|
+
initServer();
|