unprint 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.editorconfig +0 -0
- package/.eslintrc +0 -0
- package/README.md +9 -0
- package/package.json +2 -1
- package/src/app.js +39 -3
- package/tests/data.json +0 -0
- package/tests/index.html +0 -0
- package/tests/init.js +2 -0
package/.editorconfig
CHANGED
|
File without changes
|
package/.eslintrc
CHANGED
|
File without changes
|
package/README.md
CHANGED
|
@@ -7,6 +7,15 @@ unprint is a web scraping utility built around JSDOM, providing convenience meth
|
|
|
7
7
|
## Usage
|
|
8
8
|
`const unprint = require('unprint');`
|
|
9
9
|
|
|
10
|
+
### Global options
|
|
11
|
+
```
|
|
12
|
+
unprint.options({
|
|
13
|
+
headers: {
|
|
14
|
+
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
|
|
15
|
+
},
|
|
16
|
+
})
|
|
17
|
+
```
|
|
18
|
+
|
|
10
19
|
### Querying
|
|
11
20
|
For optimal flexibility, unprint query methods can be used with or without initialization. If you already have access to DOM elements using another library or unprint instance, you can query it by using the uninitialized `query` methods provided directly from the library, and passing the element as the first argument, as such:
|
|
12
21
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unprint",
|
|
3
|
-
"version": "0.3.0",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Simplify common web scraping tasks while staying in control of the data.",
|
|
5
5
|
"main": "src/app.js",
|
|
6
6
|
"scripts": {
|
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
"dependencies": {
|
|
25
25
|
"axios": "^0.27.2",
|
|
26
26
|
"bottleneck": "^2.19.5",
|
|
27
|
+
"deepmerge": "^4.2.2",
|
|
27
28
|
"eslint": "^8.17.0",
|
|
28
29
|
"eslint-config-airbnb": "^19.0.4",
|
|
29
30
|
"eslint-config-airbnb-base": "^15.0.0",
|
package/src/app.js
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
const { JSDOM, VirtualConsole } = require('jsdom');
|
|
4
4
|
const axios = require('axios').default;
|
|
5
5
|
const moment = require('moment-timezone');
|
|
6
|
+
const merge = require('deepmerge');
|
|
6
7
|
|
|
7
8
|
const settings = {
|
|
8
9
|
throwErrors: false,
|
|
@@ -32,6 +33,12 @@ const defaultOptions = {
|
|
|
32
33
|
trim: true,
|
|
33
34
|
};
|
|
34
35
|
|
|
36
|
+
let globalOptions = {};
|
|
37
|
+
|
|
38
|
+
function configure(newOptions) {
|
|
39
|
+
globalOptions = newOptions;
|
|
40
|
+
}
|
|
41
|
+
|
|
35
42
|
function trim(string) {
|
|
36
43
|
if (typeof string === 'string') {
|
|
37
44
|
return string.trim().replace(/\s+/g, ' ');
|
|
@@ -288,6 +295,30 @@ function queryImages(context, selector = 'img', customOptions) {
|
|
|
288
295
|
return imageUrls.map((imageUrl) => prefixUrl(imageUrl, options.origin, options));
|
|
289
296
|
}
|
|
290
297
|
|
|
298
|
+
function queryVideo(context, selector = 'source', customOptions) {
|
|
299
|
+
const options = {
|
|
300
|
+
...context.options,
|
|
301
|
+
attribute: 'src',
|
|
302
|
+
...customOptions,
|
|
303
|
+
};
|
|
304
|
+
|
|
305
|
+
const videoUrl = queryContent(context, selector, options);
|
|
306
|
+
|
|
307
|
+
return prefixUrl(videoUrl, options.origin, options);
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
function queryVideos(context, selector = 'source', customOptions) {
|
|
311
|
+
const options = {
|
|
312
|
+
...context.options,
|
|
313
|
+
attribute: 'src',
|
|
314
|
+
...customOptions,
|
|
315
|
+
};
|
|
316
|
+
|
|
317
|
+
const videoUrls = queryContents(context, selector, options);
|
|
318
|
+
|
|
319
|
+
return videoUrls.map((videoUrl) => prefixUrl(videoUrl, options.origin, options));
|
|
320
|
+
}
|
|
321
|
+
|
|
291
322
|
function extractJson(element) {
|
|
292
323
|
if (!element) {
|
|
293
324
|
return null;
|
|
@@ -385,6 +416,8 @@ const queryFns = {
|
|
|
385
416
|
date: queryDate,
|
|
386
417
|
dates: queryDates,
|
|
387
418
|
url: queryUrl,
|
|
419
|
+
video: queryVideo,
|
|
420
|
+
videos: queryVideos,
|
|
388
421
|
};
|
|
389
422
|
|
|
390
423
|
function isDomObject(element) {
|
|
@@ -485,12 +518,13 @@ function initAll(context, selector, options) {
|
|
|
485
518
|
}
|
|
486
519
|
|
|
487
520
|
async function request(url, body, customOptions = {}, method = 'GET') {
|
|
488
|
-
const options = {
|
|
521
|
+
const options = merge.all([{
|
|
489
522
|
timeout: 1000,
|
|
490
523
|
extract: true,
|
|
491
524
|
url,
|
|
492
|
-
|
|
493
|
-
|
|
525
|
+
}, globalOptions, customOptions]);
|
|
526
|
+
|
|
527
|
+
console.log('options', options, globalOptions);
|
|
494
528
|
|
|
495
529
|
const res = await axios({
|
|
496
530
|
url,
|
|
@@ -551,6 +585,7 @@ async function post(url, body, options) {
|
|
|
551
585
|
}
|
|
552
586
|
|
|
553
587
|
module.exports = {
|
|
588
|
+
configure,
|
|
554
589
|
get,
|
|
555
590
|
post,
|
|
556
591
|
request,
|
|
@@ -559,5 +594,6 @@ module.exports = {
|
|
|
559
594
|
init,
|
|
560
595
|
initAll,
|
|
561
596
|
extractDate,
|
|
597
|
+
options: configure,
|
|
562
598
|
query: initQueryFns(queryFns),
|
|
563
599
|
};
|
package/tests/data.json
CHANGED
|
File without changes
|
package/tests/index.html
CHANGED
|
File without changes
|
package/tests/init.js
CHANGED
|
@@ -10,6 +10,8 @@ const data = require('./data.json');
|
|
|
10
10
|
const port = process.env.PORT || 3101;
|
|
11
11
|
|
|
12
12
|
async function initTest() {
|
|
13
|
+
unprint.options({ headers: { 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36' } });
|
|
14
|
+
|
|
13
15
|
const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body' });
|
|
14
16
|
// const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);
|
|
15
17
|
// const errorRes = await unprint.get(`http://127.0.0.1:${port}/error/404`);
|