unprint 0.4.3 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -43,11 +43,73 @@ query.element('h1#title'); // HTMLHeadingElement
43
43
  #### Selector
44
44
  The selector can be a CSS selector, an XPath selector starting with `//`, or an array of either or both acting as fallbacks. If the selector is falsy, the input element will be used.
45
45
 
46
+ #### Querying multiple elements
47
+ Most methods can be used in plural, returning an array of results, i.e. `query.elements()`, `query.dates()`.
48
+
46
49
  #### Query an element
47
50
  * `query.element([selector], [options])`
48
51
 
49
52
  Returns the element node directly.
50
53
 
54
+ #### Query an attribute
55
+ `query.attribute(selector, attribute, [options])` or `query.attr()`
56
+
57
+ Return the contents of an attribute. Alias for `query.element([selector], { attribute: [attribute] })`.
58
+
59
+ #### Query existence
60
+ `query.exists(selector, [options])`
61
+
62
+ Return the presence of an element as a boolean.
63
+
64
+ #### Query count
65
+ `query.count(selector, [options])`
66
+
67
+ Return the number of elements that match the selector.
68
+
69
+ #### Query the content
70
+ `query.content([selector], [options])`
71
+
72
+ Return the text contents of an element (`.textContent`).
73
+
74
+ #### Query the HTML
75
+ `query.html([selector], [options])`
76
+
77
+ Return the HTML contents of an element (`.innerHTML`).
78
+
79
+ #### Query a URL
80
+ `query.url([selector], [options])`
81
+
82
+ Options
83
+ * `origin`: The hostname to prefix when it is not included in the URL (`/path`).
84
+ * `protocol`: The protocol to use when it is not included in the URL (`//www.example.com`, default `http`).
85
+
86
+ Returns the `href` from an anchor element (or any other specified target) as a string.
87
+
88
+ #### Query an image
89
+ `query.image([selector], [options])` or `query.img()`
90
+
91
+ Options:
92
+ * All options supported by `query.url()`.
93
+
94
+ Returns the `src` from an image element (or any other specified target) as a string.
95
+
96
+ #### Query a source set
97
+ `query.sourceSet([selector], [options])` or `query.srcSet()`
98
+
99
+ Options:
100
+ * `includeDescriptor`: Produce an array of `{ descriptor, url }` instead of URL strings.
101
+ * All options supported by `query.url()`.
102
+
103
+ Returns an array of media URLs from the `srcset` of a media element, sorted by their descriptor from large to small. The URLs are strings, or `{ descriptor, url }` objects when `includeDescriptor` is set.
104
+
105
+ #### Query a video
106
+ `query.video([selector], [options])`
107
+
108
+ Options:
109
+ * All options supported by `query.url()`.
110
+
111
+ Returns the `src` from a video source element (or any other specified target) as a string.
112
+
51
113
  #### Query a date
52
114
  `query.date(selector, format, [options])`
53
115
 
@@ -55,13 +117,23 @@ Arguments
55
117
  * `format` (string, array): The input format as a string or array of strings described by the [Moment.js docs](https://momentjs.com/docs/#/displaying/format/).
56
118
 
57
119
  Options
58
- * `match (RegExp): The text to extract before attempting to parse it as a date. The default expression will attempt to extract any of 01-01-1970, 1970-01-01, 01/01/1970 or January 1, 1970 with optional 00:00[:00] time.
120
+ * `match` (RegExp): The text to extract before attempting to parse it as a date. The default expression will attempt to extract any of 01-01-1970, 1970-01-01, 01/01/1970 or January 1, 1970 with optional 00:00[:00] time.
59
121
  * `timezone` (string): The name of the input timezone, defaults to 'UTC'.
60
122
 
61
123
  Returns a Date object.
62
124
 
63
- #### Querying multiple elements
64
- Most methods can be used in plural, returning an array of results, i.e. `query.elements()`, `query.dates()`.
125
+ #### Query a duration
126
+ `query.duration(selector, [options])` or `query.dur()`
127
+
128
+ Options
129
+ * `match` (RegExp): The text to extract before attempting to parse it as a duration. The default expression will attempt to extract `(hh:)mm:ss` and `PT##H##M##S`.
130
+
131
+ Returns the duration in seconds as a number.
132
+
133
+ #### Query JSON
134
+ `query.json([selector], [options])`
135
+
136
+ Returns the parsed JSON content of an element as an object.
65
137
 
66
138
  ### HTTP request
67
139
  * `unprint.get(url, [options])`
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.4.3",
3
+ "version": "0.6.0",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {
package/src/app.js CHANGED
@@ -295,6 +295,50 @@ function queryImages(context, selector = 'img', customOptions) {
295
295
  return imageUrls.map((imageUrl) => prefixUrl(imageUrl, options.origin, options));
296
296
  }
297
297
 
298
/**
 * Convert a srcset descriptor (`640w`, `1.5x`, `fallback`) to a sortable number.
 * Descriptors without a leading number, such as `fallback`, rank below every numeric one.
 *
 * @param {string} descriptor - The descriptor part of a srcset candidate.
 * @returns {number} The numeric size, or -Infinity when there is none.
 */
function getSourceSetDescriptorSize(descriptor) {
  const size = Number.parseFloat(descriptor);

  return Number.isNaN(size) ? -Infinity : size;
}

/**
 * Query the candidate URLs listed in a `srcset`-style attribute, sorted by
 * descriptor from large to small. Candidates without a descriptor are labelled
 * `fallback` and sorted last.
 *
 * @param {object} context - Query context; `context.options` is merged under the custom options.
 * @param {string|string[]} selector - Selector handed to `queryAttribute`.
 * @param {string|object} [attr='srcset'] - Attribute to read. An options object is also
 *   accepted here, so the documented `sourceSet(selector, options)` call form works.
 * @param {object} [customOptions={}] - Options; `includeDescriptor` switches the result to
 *   `{ descriptor, url }` objects, the merged options are handed to `prefixUrl`.
 * @returns {Array<string|{descriptor: string, url: string}>|null} Sorted sources, or null
 *   when the attribute is missing or empty.
 */
function querySourceSet(context, selector, attr = 'srcset', customOptions = {}) {
  // An object in the attribute position is the options argument of the two-argument call form.
  const hasOptionsAsAttr = attr !== null && typeof attr === 'object';
  const attribute = hasOptionsAsAttr ? 'srcset' : attr;
  const callerOptions = hasOptionsAsAttr ? attr : customOptions;

  const options = {
    ...context.options,
    ...callerOptions,
  };

  const srcset = queryAttribute(context, selector, attribute, callerOptions);

  if (!srcset) {
    return null;
  }

  const sources = srcset
    .split(/\s*,\s*/)
    .map((source) => {
      // Split on any whitespace run; srcset values are often wrapped over several lines.
      const [link, descriptor] = source.trim().split(/\s+/);

      if (!link) {
        return null;
      }

      return {
        descriptor: descriptor || 'fallback',
        // Same call shape as queryImages: the options object is the third argument.
        url: prefixUrl(link, options.origin, options),
      };
    })
    .filter(Boolean)
    .sort((sourceA, sourceB) => {
      const sizeA = getSourceSetDescriptorSize(sourceA.descriptor);
      const sizeB = getSourceSetDescriptorSize(sourceB.descriptor);

      if (sizeA === sizeB) {
        return 0;
      }

      return sizeA > sizeB ? -1 : 1;
    });

  // URLs were already prefixed above, prefixing them again without options is not needed.
  if (options.includeDescriptor) {
    return sources;
  }

  return sources.map((source) => source.url);
}
341
+
298
342
  function queryVideo(context, selector = 'source', customOptions) {
299
343
  const options = {
300
344
  ...context.options,
@@ -391,6 +435,48 @@ function queryDates(context, selector, format, customOptions) {
391
435
  }));
392
436
  }
393
437
 
438
/**
 * Extract a clock-style duration such as `(hh:)mm:ss` from a string.
 *
 * @param {string} durationString - Text containing the duration.
 * @param {RegExp} [match] - Expression selecting the duration text; defaults to `(hh:)mm:ss`.
 * @returns {number|null} The duration in seconds, or null when the input is not a
 *   string or nothing matches.
 */
function extractDuration(durationString, match) {
  if (typeof durationString !== 'string') {
    return null;
  }

  const durationMatch = durationString.match(match || /(\d+:)?\d+:\d+/);

  if (!durationMatch) {
    return null;
  }

  // Always pad to hh:mm:ss. Moment reads a two-part `00:33` as hours:minutes,
  // so a lone seconds segment from a custom match would otherwise become minutes.
  const segments = ['00', '00', ...durationMatch[0].split(/[:hm]/)].slice(-3);

  return moment.duration(segments.join(':')).asSeconds();
}
449
/**
 * Extract a duration written with unit letters, such as `PT1H34M18S`, `1H 30M` or `45S`.
 * Every component is optional, and a bare trailing number is read as seconds.
 *
 * @param {string} durationString - Text containing the timestamp.
 * @returns {number|null} The duration in seconds, or null when the input is not a
 *   string or contains no digits.
 */
function extractTimestamp(durationString) {
  if (typeof durationString !== 'string') {
    return null;
  }

  // The lookahead anchors the match on a digit so that the all-optional groups cannot
  // match an empty string. Seconds are optional, so `PT2H` and `PT1H34M` keep their units.
  const timestampMatch = durationString.match(/(?=\d)(?:\d+H\s*)?(?:\d+M\s*)?(?:\d+S?)?/i);

  if (!timestampMatch) {
    return null;
  }

  const [timestamp] = timestampMatch;

  const hours = timestamp.match(/(\d+)H/i)?.[1] ?? 0;
  const minutes = timestamp.match(/(\d+)M/i)?.[1] ?? 0;
  const seconds = timestamp.match(/(\d+)(?:S|$)/i)?.[1] ?? 0;

  return (Number(hours) * 3600) + (Number(minutes) * 60) + Number(seconds);
}
462
+
463
/**
 * Query the text content of an element and parse it as a duration.
 *
 * With a custom `match` only the clock-style parser is used. Otherwise the clock
 * format is tried first, then the unit-letter format.
 *
 * @param {object} context - Query context handed to `queryContent`.
 * @param {string|string[]} selector - Selector of the element holding the duration.
 * @param {object} [customOptions] - Options; `match` (RegExp) selects the duration text.
 * @returns {number|null} The duration in seconds, or null when the element is empty
 *   or no duration can be extracted.
 */
function queryDuration(context, selector, customOptions) {
  const durationString = queryContent(context, selector, customOptions);

  if (!durationString) {
    return null;
  }

  const match = customOptions?.match;

  if (match) {
    return extractDuration(durationString, match);
  }

  // Nullish fallbacks: a parsed duration of 0 seconds is a valid result, not a miss.
  return extractDuration(durationString)
    ?? extractTimestamp(durationString)
    ?? null;
}
479
+
394
480
  const queryFns = {
395
481
  element: queryElement,
396
482
  elements: queryElements,
@@ -415,6 +501,10 @@ const queryFns = {
415
501
  jsons: queryJsons,
416
502
  date: queryDate,
417
503
  dates: queryDates,
504
+ duration: queryDuration,
505
+ dur: queryDuration,
506
+ sourceSet: querySourceSet,
507
+ srcSet: querySourceSet,
418
508
  url: queryUrl,
419
509
  video: queryVideo,
420
510
  videos: queryVideos,
@@ -592,6 +682,7 @@ module.exports = {
592
682
  init,
593
683
  initAll,
594
684
  extractDate,
685
+ extractDuration,
595
686
  options: configure,
596
687
  query: initQueryFns(queryFns),
597
688
  };
package/tests/index.html CHANGED
@@ -21,10 +21,15 @@
21
21
  <div id="date">Date: 22-07-2022 02:00</div>
22
22
  <div id="date2">Date: 13-05-2022 18:00</div>
23
23
 
24
+ <div id="duration">01:15:33</div>
25
+ <div id="timestamp">PT1H34M18S</div>
26
+
24
27
  <img class="image" src="https://i.redd.it/vn9h981hlx281.png">
25
28
  <img class="image" src="https://i.redd.it/1s22dsrqy0181.jpg">
26
29
  <img class="image" src="https://i.redd.it/e91oo4ueyeb71.jpg">
27
30
 
31
+ <img class="srcset" srcset="https://i.redd.it/e91oo4ueyeb71.jpg 240w, https://i.redd.it/vn9h981hlx281.png 480w, https://i.redd.it/e91oo4ueyeb71.jpg 640w">
32
+
28
33
  <video id="video"><source src="https://i.imgur.com/eDQmLys.mp4"></video>
29
34
 
30
35
  <script id="json" type="application/js">{"foo": "bar", "lorem": "ipsum", "hello": "world"}</script>
package/tests/init.js CHANGED
@@ -18,11 +18,14 @@ async function initTest() {
18
18
 
19
19
  console.log('title', res.context.query.content('//*[contains(text(), "Test")]'));
20
20
  console.log('date', res.context.query.date('#date', 'DD-MM-YYYY HH:mm'));
21
+ console.log('duration', res.context.query.duration('#duration'));
22
+ console.log('timestamp', res.context.query.duration('#timestamp'));
21
23
  console.log('data', res.context.query.json('#json'));
22
24
  console.log('items', res.context.query.contents('.item'));
23
25
  console.log('link', res.context.query.url('#link'));
24
26
  console.log('image', res.context.query.img('.image'));
25
27
  console.log('images', res.context.query.imgs('.image'));
28
+ console.log('srcset', res.context.query.sourceSet('.srcset'));
26
29
  console.log('path', res.context.query.url('#path'));
27
30
  console.log('relative path', res.context.query.url('#relativePath'));
28
31
  console.log('exists', res.context.query.exists('#title'));