unprint 0.9.6 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -85,6 +85,16 @@ Return the contents of the element or attribute as a Number primitive.
85
85
 
86
86
  Return the HTML contents of an element (`.innerHTML`).
87
87
 
88
+ #### Query the text
89
+ `query.text([selector], [options])`
90
+
91
+ Return the text of the element's direct text nodes, skipping child elements, as opposed to querying content.
92
+
93
+ Options
94
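+ * `join`: Join the text nodes into one string, separated by a space or by the given string if `join` is a string (default `true`); when `false`, an array of text nodes is returned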
95
+ * `trim`: Remove excess whitespace from each text node and from the joined result (default `true`)
96
+ * `filter`: Remove empty text nodes (default `true`)
97
+
88
98
  #### Query a URL
89
99
  `query.url([selector], [options])`
90
100
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.9.6",
3
+ "version": "0.10.1",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {
package/src/app.js CHANGED
@@ -249,6 +249,51 @@ function queryHtmls(context, selector, customOptions) {
249
249
  return targets.map((target) => trim(target.innerHTML));
250
250
  }
251
251
 
252
+ function extractText(target, customOptions) {
253
+ const options = {
254
+ filter: true,
255
+ trim: true,
256
+ join: true,
257
+ ...customOptions,
258
+ };
259
+
260
+ const nodes = Array.from(target.childNodes)
261
+ .filter((node) => node.nodeName === '#text')
262
+ .map((node) => (options.trim ? trim(node.textContent) : node.textContent));
263
+
264
+ const filteredNodes = options.filter
265
+ ? nodes.filter(Boolean)
266
+ : nodes;
267
+
268
+ if (options.join) {
269
+ const text = filteredNodes.join(typeof options.join === 'string' ? options.join : ' ');
270
+
271
+ if (options.trim) {
272
+ return text.trim();
273
+ }
274
+
275
+ return text;
276
+ }
277
+
278
+ return filteredNodes;
279
+ }
280
+
281
+ function queryText(context, selector, customOptions) {
282
+ const target = queryElement(context, selector, customOptions);
283
+
284
+ if (!target) {
285
+ return null;
286
+ }
287
+
288
+ return extractText(target, customOptions);
289
+ }
290
+
291
+ function queryTexts(context, selector, customOptions) {
292
+ const targets = queryElements(context, selector, customOptions);
293
+
294
+ return targets.map((target) => extractText(target, customOptions));
295
+ }
296
+
252
297
  function prefixUrl(urlPath, originUrl, customOptions) {
253
298
  if (!urlPath) {
254
299
  return null;
@@ -626,6 +671,8 @@ const queryFns = {
626
671
  sourceSets: querySourceSets,
627
672
  srcSet: querySourceSet,
628
673
  srcSets: querySourceSets,
674
+ text: queryText,
675
+ texts: queryTexts,
629
676
  url: queryUrl,
630
677
  urls: queryUrls,
631
678
  video: queryVideo,
@@ -685,7 +732,7 @@ function init(elementOrHtml, selector, options = {}) {
685
732
  }
686
733
 
687
734
  const element = selector
688
- ? elementOrHtml.querySelector(selector)
735
+ ? queryElement({ element: elementOrHtml }, selector)
689
736
  : elementOrHtml;
690
737
 
691
738
  if (!element) {
@@ -725,7 +772,7 @@ function initAll(context, selector, options = {}) {
725
772
  return handleError(new Error('Init context is not a DOM element, HTML or an array'), 'INVALID_CONTEXT');
726
773
  }
727
774
 
728
- return Array.from(context.querySelectorAll(selector))
775
+ return queryElements({ element: context }, selector)
729
776
  .map((element) => init(element, null, options));
730
777
  }
731
778
 
package/tests/index.html CHANGED
@@ -46,5 +46,18 @@
46
46
 
47
47
  <div class="dataset" data-hello="world" data-foo="bar">
48
48
  <div class="dataset" data-hello="world" data-foo="bar">
49
+
50
+ <div class="text">
51
+ <span>this should not be extracted</span>
52
+ this is our juice
53
+ <span>this should not be extracted</span>
54
+ more juice
55
+ </div>
56
+
57
+ <div class="text">
58
+ <span>this should not be extracted</span>
59
+ this some more text
60
+ this is the final text
61
+ </div>
49
62
  </body>
50
63
  </html>
package/tests/init.js CHANGED
@@ -28,6 +28,9 @@ async function initTest() {
28
28
  console.log('items', res.context.query.contents('.item'));
29
29
  console.log('link', res.context.query.url('#link'));
30
30
  console.log('links', res.context.query.urls('.link'));
31
+ console.log('text', res.context.query.text('.text'));
32
+ console.log('texts', res.context.query.text('.text', { join: false }));
33
+ console.log('all texts', res.context.query.texts('.text'));
31
34
  console.log('image', res.context.query.img('.image'));
32
35
  console.log('images', res.context.query.imgs('.image'));
33
36
  console.log('srcset', res.context.query.sourceSet('.srcset'));