unprint 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -43,11 +43,73 @@ query.element('h1#title'); // HTMLHeadingElement
43
43
  #### Selector
44
44
  The selector can be a CSS selector, an XPath selector starting with `//`, or an array of either or both acting as fallbacks. If the selector is falsy, the input element will be used.
45
45
 
46
+ #### Querying multiple elements
47
+ Most methods also have a plural form that returns an array of results, e.g. `query.elements()`, `query.dates()`.
48
+
46
49
  #### Query an element
47
50
  * `query.element([selector], [options])`
48
51
 
49
52
  Returns the element node directly.
50
53
 
54
+ #### Query an attribute
55
+ `query.attribute(selector, attribute, [options])` or `query.attr()`
56
+
57
+ Return the contents of an attribute. Alias for `query.element([selector], { attribute: [attribute] })`.
58
+
59
+ #### Query existence
60
+ `query.exists(selector, [options])`
61
+
62
+ Return the presence of an element as a boolean.
63
+
64
+ #### Query count
65
+ `query.count(selector, [options])`
66
+
67
+ Return the number of elements that match the selector.
68
+
69
+ #### Query the content
70
+ `query.content([selector], [options])`
71
+
72
+ Return the text contents of an element (`.textContent`).
73
+
74
+ #### Query the HTML
75
+ `query.html([selector], [options])`
76
+
77
+ Return the HTML contents of an element (`.innerHTML`).
78
+
79
+ #### Query a URL
80
+ `query.url([selector], [options])`
81
+
82
+ Options
83
+ * `origin`: The hostname to prefix when it is not included in the URL (`/path`).
84
+ * `protocol`: The protocol to use when it is not included in the URL (`//www.example.com`, default `http`).
85
+
86
+ Returns the `href` from an anchor element (or any other specified target) as a string.
87
+
88
+ #### Query an image
89
+ `query.image([selector], [options])` or `query.img()`
90
+
91
+ Options:
92
+ * All options supported by `query.url()`.
93
+
94
+ Returns the `src` from an image element (or any other specified target) as a string.
95
+
96
+ #### Query a source set
97
+ `query.sourceSet([selector], [options])` or `query.srcSet()`
98
+
99
+ Options:
100
+ * `includeDescriptor`: Produce an array of `{ descriptor, url }` instead of URL strings.
101
+ * All options supported by `query.url()`.
102
+
103
+ Returns an array of media URLs from the `srcset` of a media element as strings, sorted by their descriptor from large to small.
104
+
105
+ #### Query a video
106
+ `query.video([selector], [options])`
107
+
108
+ Options:
109
+ * All options supported by `query.url()`.
110
+
111
+ Returns the `src` from a video source element (or any other specified target) as a string.
112
+
51
113
  #### Query a date
52
114
  `query.date(selector, format, [options])`
53
115
 
@@ -55,13 +117,15 @@ Arguments
55
117
  * `format` (string, array): The input format as a string or array of strings described by the [Moment.js docs](https://momentjs.com/docs/#/displaying/format/).
56
118
 
57
119
  Options
58
- * `match (RegExp): The text to extract before attempting to parse it as a date. The default expression will attempt to extract any of 01-01-1970, 1970-01-01, 01/01/1970 or January 1, 1970 with optional 00:00[:00] time.
120
+ * `match` (RegExp): The text to extract before attempting to parse it as a date. The default expression will attempt to extract any of 01-01-1970, 1970-01-01, 01/01/1970 or January 1, 1970 with optional 00:00[:00] time.
59
121
  * `timezone` (string): The name of the input timezone, defaults to 'UTC'.
60
122
 
61
123
  Returns a Date object.
62
124
 
63
- #### Querying multiple elements
64
- Most methods can be used in plural, returning an array of results, i.e. `query.elements()`, `query.dates()`.
125
+ #### Query JSON
126
+ `query.json([selector], [options])`
127
+
128
+ Returns the parsed JSON content of an element as an object.
65
129
 
66
130
  ### HTTP request
67
131
  * `unprint.get(url, [options])`
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.4.3",
3
+ "version": "0.5.0",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {
package/src/app.js CHANGED
@@ -295,6 +295,50 @@ function queryImages(context, selector = 'img', customOptions) {
295
295
  return imageUrls.map((imageUrl) => prefixUrl(imageUrl, options.origin, options));
296
296
  }
297
297
 
298
/**
 * Query the candidate URLs listed in a `srcset`-style attribute.
 *
 * Both `(selector, attr, options)` and the shorter `(selector, options)`
 * call forms are accepted.
 *
 * @param {object} context - Query context; `context.options` supplies default options.
 * @param {*} selector - Selector forwarded unchanged to `queryAttribute`.
 * @param {string|object} [attr='srcset'] - Attribute holding the source set, or the options object.
 * @param {object} [customOptions={}] - `includeDescriptor` returns `{ descriptor, url }` objects
 *   instead of URL strings; the merged options are also forwarded to `prefixUrl`.
 * @returns {?Array<string|{descriptor: string, url: string}>} Candidates ordered from the largest
 *   to the smallest descriptor, candidates without a descriptor last; `null` when the attribute
 *   is missing or empty.
 */
function querySourceSet(context, selector, attr = 'srcset', customOptions = {}) {
	// An options object in the attribute slot means the caller used (selector, options).
	const optionsInAttrSlot = attr !== null && typeof attr === 'object';
	const attribute = optionsInAttrSlot || !attr ? 'srcset' : attr;
	const givenOptions = (optionsInAttrSlot ? attr : customOptions) || {};

	// Context options act as defaults, in line with the other URL queries.
	const options = {
		...context.options,
		...givenOptions,
	};

	const srcset = queryAttribute(context, selector, attribute, givenOptions);

	if (!srcset) {
		return null;
	}

	const sources = srcset
		.trim()
		.split(/\s*,\s*/)
		.map((source) => {
			// Candidates may be separated from their descriptor by any run of whitespace.
			const [link, descriptor] = source.trim().split(/\s+/);

			if (!link) {
				return null;
			}

			const size = Number.parseFloat(descriptor);

			return {
				descriptor: descriptor || 'fallback',
				url: prefixUrl(link, options.origin, options),
				// Candidates without a usable descriptor weigh the least, so they end up last.
				weight: Number.isNaN(size) ? -Infinity : size,
			};
		})
		.filter(Boolean)
		.sort((sourceA, sourceB) => {
			if (sourceA.weight === sourceB.weight) {
				return 0;
			}

			return sourceA.weight > sourceB.weight ? -1 : 1;
		});

	if (options.includeDescriptor) {
		return sources.map((source) => ({
			descriptor: source.descriptor,
			url: source.url,
		}));
	}

	return sources.map((source) => source.url);
}
341
+
298
342
  function queryVideo(context, selector = 'source', customOptions) {
299
343
  const options = {
300
344
  ...context.options,
@@ -415,6 +459,8 @@ const queryFns = {
415
459
  jsons: queryJsons,
416
460
  date: queryDate,
417
461
  dates: queryDates,
462
+ sourceSet: querySourceSet,
463
+ srcSet: querySourceSet,
418
464
  url: queryUrl,
419
465
  video: queryVideo,
420
466
  videos: queryVideos,
package/tests/index.html CHANGED
@@ -25,6 +25,8 @@
25
25
  <img class="image" src="https://i.redd.it/1s22dsrqy0181.jpg">
26
26
  <img class="image" src="https://i.redd.it/e91oo4ueyeb71.jpg">
27
27
 
28
+ <img class="srcset" srcset="https://i.redd.it/e91oo4ueyeb71.jpg 240w, https://i.redd.it/vn9h981hlx281.png 480w, https://i.redd.it/e91oo4ueyeb71.jpg 640w">
29
+
28
30
  <video id="video"><source src="https://i.imgur.com/eDQmLys.mp4"></video>
29
31
 
30
32
  <script id="json" type="application/js">{"foo": "bar", "lorem": "ipsum", "hello": "world"}</script>
package/tests/init.js CHANGED
@@ -23,6 +23,7 @@ async function initTest() {
23
23
  console.log('link', res.context.query.url('#link'));
24
24
  console.log('image', res.context.query.img('.image'));
25
25
  console.log('images', res.context.query.imgs('.image'));
26
+ console.log('srcset', res.context.query.sourceSet('.srcset'));
26
27
  console.log('path', res.context.query.url('#path'));
27
28
  console.log('relative path', res.context.query.url('#relativePath'));
28
29
  console.log('exists', res.context.query.exists('#title'));