unprint 0.9.5 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -85,6 +85,16 @@ Return the contents of the element or attribute as a Number primitive.
85
85
 
86
86
  Return the HTML contents of an element (`.innerHTML`).
87
87
 
88
+ #### Query the text
89
+ `query.text([selector], [options])`
90
+
91
+ Return the text held directly in an element's own text nodes, skipping its child elements (as opposed to querying the element's content).
92
+
93
+ Options
94
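+ * `join`: Join text nodes into one string (default `true`, joined with a space); pass a string to use it as the separator, or `false` to get an array of strings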
95
+ * `trim`: Remove excess whitespace
96
+ * `filter`: Remove empty text nodes
97
+
88
98
  #### Query a URL
89
99
  `query.url([selector], [options])`
90
100
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.9.5",
3
+ "version": "0.10.0",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {
package/src/app.js CHANGED
@@ -193,11 +193,11 @@ const defaultNumberRegexp = /\d+(\.\d+)?/;
193
193
 
194
194
  function matchNumberString(numberString, options) {
195
195
  if (numberString && options.match) {
196
- return Number(numberString.match(options.match)?.[options.matchIndex]);
196
+ return Number(numberString.match(options.match)?.[options.matchIndex]) || null;
197
197
  }
198
198
 
199
199
  if (numberString) {
200
- return Number(numberString);
200
+ return Number(numberString) || null;
201
201
  }
202
202
 
203
203
  return null;
@@ -249,6 +249,51 @@ function queryHtmls(context, selector, customOptions) {
249
249
  return targets.map((target) => trim(target.innerHTML));
250
250
  }
251
251
 
252
/**
 * Collect the text held directly by an element's own text nodes, ignoring
 * child elements and any other non-text children.
 *
 * @param {Node} target - Node whose `childNodes` are inspected.
 * @param {object} [customOptions] - Overrides for the defaults below.
 * @param {boolean} [customOptions.filter=true] - Drop empty strings from the result.
 * @param {boolean} [customOptions.trim=true] - Run each text node through the
 *   file's `trim` helper and trim the joined string.
 * @param {boolean|string} [customOptions.join=true] - A string is used as the
 *   separator (the empty string included); any other truthy value joins with a
 *   single space; any other falsy value returns the individual strings.
 * @returns {string|string[]} The joined text, or the list of text node strings
 *   when joining is disabled.
 */
function extractText(target, customOptions) {
  const options = {
    filter: true,
    trim: true,
    join: true,
    ...customOptions,
  };

  // Only direct text nodes count; text nested inside child elements is skipped.
  const texts = Array.from(target.childNodes)
    .filter((node) => node.nodeName === '#text')
    .map((node) => (options.trim ? trim(node.textContent) : node.textContent));

  const keptTexts = options.filter ? texts.filter(Boolean) : texts;

  // An explicit string separator always means "join", even when it is '' —
  // a plain truthiness check would wrongly treat '' as "joining disabled".
  const hasSeparator = typeof options.join === 'string';

  if (!hasSeparator && !options.join) {
    return keptTexts;
  }

  const joined = keptTexts.join(hasSeparator ? options.join : ' ');

  return options.trim ? joined.trim() : joined;
}
280
+
281
/**
 * Look up a single element and return the text of its direct text nodes.
 *
 * @param {*} context - Query context, passed through to `queryElement`.
 * @param {*} selector - Selector, passed through to `queryElement`.
 * @param {object} [customOptions] - Passed to both `queryElement` and `extractText`.
 * @returns {string|string[]|null} Result of `extractText`, or `null` when
 *   `queryElement` yields no element.
 */
function queryText(context, selector, customOptions) {
  const element = queryElement(context, selector, customOptions);

  return element ? extractText(element, customOptions) : null;
}
290
+
291
/**
 * Look up every matching element and return the direct text of each.
 *
 * @param {*} context - Query context, passed through to `queryElements`.
 * @param {*} selector - Selector, passed through to `queryElements`.
 * @param {object} [customOptions] - Passed to both `queryElements` and `extractText`.
 * @returns {Array<string|string[]>} One `extractText` result per matched element.
 */
function queryTexts(context, selector, customOptions) {
  return queryElements(context, selector, customOptions)
    .map((element) => extractText(element, customOptions));
}
296
+
252
297
  function prefixUrl(urlPath, originUrl, customOptions) {
253
298
  if (!urlPath) {
254
299
  return null;
@@ -626,6 +671,8 @@ const queryFns = {
626
671
  sourceSets: querySourceSets,
627
672
  srcSet: querySourceSet,
628
673
  srcSets: querySourceSets,
674
+ text: queryText,
675
+ texts: queryTexts,
629
676
  url: queryUrl,
630
677
  urls: queryUrls,
631
678
  video: queryVideo,
package/tests/index.html CHANGED
@@ -46,5 +46,18 @@
46
46
 
47
47
  <div class="dataset" data-hello="world" data-foo="bar">
48
48
  <div class="dataset" data-hello="world" data-foo="bar">
49
+
50
+ <div class="text">
51
+ <span>this should not be extracted</span>
52
+ this is our juice
53
+ <span>this should not be extracted</span>
54
+ more juice
55
+ </div>
56
+
57
+ <div class="text">
58
+ <span>this should not be extracted</span>
59
+ this some more text
60
+ this is the final text
61
+ </div>
49
62
  </body>
50
63
  </html>
package/tests/init.js CHANGED
@@ -28,6 +28,9 @@ async function initTest() {
28
28
  console.log('items', res.context.query.contents('.item'));
29
29
  console.log('link', res.context.query.url('#link'));
30
30
  console.log('links', res.context.query.urls('.link'));
31
+ console.log('text', res.context.query.text('.text'));
32
+ console.log('texts', res.context.query.text('.text', { join: false }));
33
+ console.log('all texts', res.context.query.texts('.text'));
31
34
  console.log('image', res.context.query.img('.image'));
32
35
  console.log('images', res.context.query.imgs('.image'));
33
36
  console.log('srcset', res.context.query.sourceSet('.srcset'));