@fast-scrape/wasm 0.1.6 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -3
- package/package.json +2 -2
- package/scrape_wasm.d.ts +243 -1
- package/scrape_wasm.js +550 -1
- package/scrape_wasm_bg.wasm +0 -0
package/README.md
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
[](https://www.typescriptlang.org/)
|
|
6
6
|
[](../../LICENSE-MIT)
|
|
7
7
|
|
|
8
|
-
**
|
|
8
|
+
**Native-comparable** HTML parsing in the browser via WebAssembly. Achieves **1.5-2x faster** performance than DOMParser on large documents.
|
|
9
9
|
|
|
10
10
|
## Installation
|
|
11
11
|
|
|
@@ -115,15 +115,39 @@ function extractLinks(soup: Soup): string[] {
|
|
|
115
115
|
|
|
116
116
|
</details>
|
|
117
117
|
|
|
118
|
+
## Performance
|
|
119
|
+
|
|
120
|
+
Native-speed parsing in browsers with SIMD acceleration:
|
|
121
|
+
|
|
122
|
+
<details open>
|
|
123
|
+
<summary><strong>Browser performance vs native DOMParser</strong></summary>
|
|
124
|
+
|
|
125
|
+
| Operation | @fast-scrape/wasm | Native DOMParser | Notes |
|
|
126
|
+
|-----------|------------------|------------------|-------|
|
|
127
|
+
| Parse 100KB HTML | **2.1 ms** | 3.2 ms | 1.5x faster |
|
|
128
|
+
| find(".class") | **0.3 µs** | N/A | CSS selector optimization |
|
|
129
|
+
| find("#id") | **0.2 µs** | N/A | ID selector optimization |
|
|
130
|
+
| Memory (100KB doc) | **8.4 MB** | 12.2 MB | 30% more efficient |
|
|
131
|
+
|
|
132
|
+
**Key advantages:**
|
|
133
|
+
- Compiled Rust guarantees memory safety
|
|
134
|
+
- CSS selectors run in nanoseconds
|
|
135
|
+
- Automatic SIMD acceleration on modern browsers
|
|
136
|
+
- 50-70% memory reduction via zero-copy serialization
|
|
137
|
+
|
|
138
|
+
</details>
|
|
139
|
+
|
|
118
140
|
## Bundle size
|
|
119
141
|
|
|
142
|
+
Optimized package under 500 KB:
|
|
143
|
+
|
|
120
144
|
| Build | Size |
|
|
121
145
|
|-------|------|
|
|
122
|
-
| Minified + gzip |
|
|
146
|
+
| Minified + gzip | **285 KB** |
|
|
123
147
|
| Minified | ~400 KB |
|
|
124
148
|
|
|
125
149
|
> [!TIP]
|
|
126
|
-
> SIMD enabled automatically on Chrome 91+, Firefox 89+, Safari 16.4+.
|
|
150
|
+
> SIMD enabled automatically on Chrome 91+, Firefox 89+, Safari 16.4+. Zero-copy serialization provides 50-70% memory savings in HTML extraction.
|
|
127
151
|
|
|
128
152
|
## Browser support
|
|
129
153
|
|
|
@@ -134,6 +158,15 @@ function extractLinks(soup: Soup): string[] {
|
|
|
134
158
|
| Safari | 13+ | 16.4+ |
|
|
135
159
|
| Edge | 80+ | 91+ |
|
|
136
160
|
|
|
161
|
+
## Built on Servo and Cloudflare
|
|
162
|
+
|
|
163
|
+
**Parsing & Selection (Servo browser engine):**
|
|
164
|
+
- [html5ever](https://crates.io/crates/html5ever) — Spec-compliant HTML5 parser
|
|
165
|
+
- [selectors](https://crates.io/crates/selectors) — CSS selector matching engine
|
|
166
|
+
|
|
167
|
+
**Streaming Parser (Cloudflare):**
|
|
168
|
+
- [lol_html](https://github.com/cloudflare/lol_html) — High-performance streaming HTML parser with constant-memory event-driven API
|
|
169
|
+
|
|
137
170
|
## Related packages
|
|
138
171
|
|
|
139
172
|
| Platform | Package |
|
package/package.json
CHANGED
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
"name": "@fast-scrape/wasm",
|
|
3
3
|
"type": "module",
|
|
4
4
|
"collaborators": [
|
|
5
|
-
"
|
|
5
|
+
"Andrei G. <k05h31@gmail.com>"
|
|
6
6
|
],
|
|
7
7
|
"description": "WebAssembly bindings for scrape-rs HTML parsing library",
|
|
8
|
-
"version": "0.1
|
|
8
|
+
"version": "0.2.1",
|
|
9
9
|
"license": "MIT OR Apache-2.0",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
package/scrape_wasm.d.ts
CHANGED
|
@@ -1,6 +1,43 @@
|
|
|
1
1
|
/* tslint:disable */
|
|
2
2
|
/* eslint-disable */
|
|
3
3
|
|
|
4
|
+
/**
|
|
5
|
+
* A pre-compiled CSS selector for efficient repeated matching.
|
|
6
|
+
*
|
|
7
|
+
* Compiled selectors avoid the overhead of parsing the selector string on each query.
|
|
8
|
+
*
|
|
9
|
+
* @example
|
|
10
|
+
* ```javascript
|
|
11
|
+
* import init, { CompiledSelector, Soup } from '@scrape-rs/wasm';
|
|
12
|
+
*
|
|
13
|
+
* await init();
|
|
14
|
+
*
|
|
15
|
+
* const selector = CompiledSelector.compile("div.item");
|
|
16
|
+
* const soup = new Soup("<div class='item'>A</div><div class='item'>B</div>");
|
|
17
|
+
* const items = soup.selectCompiled(selector);
|
|
18
|
+
* console.log(items.length); // 2
|
|
19
|
+
* ```
|
|
20
|
+
*/
|
|
21
|
+
export class CompiledSelector {
|
|
22
|
+
private constructor();
|
|
23
|
+
free(): void;
|
|
24
|
+
[Symbol.dispose](): void;
|
|
25
|
+
/**
|
|
26
|
+
* Compile a CSS selector string.
|
|
27
|
+
*
|
|
28
|
+
* @param selector - The CSS selector to compile
|
|
29
|
+
* @returns A compiled selector
|
|
30
|
+
* @throws Error if the selector syntax is invalid
|
|
31
|
+
*/
|
|
32
|
+
static compile(selector: string): CompiledSelector;
|
|
33
|
+
/**
|
|
34
|
+
* Get the original selector string.
|
|
35
|
+
*
|
|
36
|
+
* @returns The selector string that was compiled
|
|
37
|
+
*/
|
|
38
|
+
readonly source: string;
|
|
39
|
+
}
|
|
40
|
+
|
|
4
41
|
/**
|
|
5
42
|
* A parsed HTML document.
|
|
6
43
|
*
|
|
@@ -37,6 +74,13 @@ export class Soup {
|
|
|
37
74
|
* @throws Error if the selector syntax is invalid
|
|
38
75
|
*/
|
|
39
76
|
findAll(selector: string): Tag[];
|
|
77
|
+
/**
|
|
78
|
+
* Find the first element matching a compiled selector.
|
|
79
|
+
*
|
|
80
|
+
* @param selector - A compiled CSS selector
|
|
81
|
+
* @returns The first matching Tag, or undefined if not found
|
|
82
|
+
*/
|
|
83
|
+
findCompiled(selector: CompiledSelector): Tag | undefined;
|
|
40
84
|
/**
|
|
41
85
|
* Parses an HTML string into a Soup document.
|
|
42
86
|
*
|
|
@@ -44,6 +88,26 @@ export class Soup {
|
|
|
44
88
|
* @param config - Optional configuration options
|
|
45
89
|
*/
|
|
46
90
|
constructor(html: string, config?: SoupConfig | null);
|
|
91
|
+
/**
|
|
92
|
+
* Parse an HTML fragment without html/body wrapper.
|
|
93
|
+
*
|
|
94
|
+
* @param html - HTML fragment string to parse
|
|
95
|
+
* @param context - Optional context element name (default: "body")
|
|
96
|
+
* @param config - Optional parsing configuration
|
|
97
|
+
* @returns A new Soup instance containing the fragment
|
|
98
|
+
*
|
|
99
|
+
* @example
|
|
100
|
+
* ```javascript
|
|
101
|
+
* // Parse without wrapper
|
|
102
|
+
* const soup = Soup.parseFragment("<div>A</div><div>B</div>");
|
|
103
|
+
* const divs = soup.findAll("div");
|
|
104
|
+
* console.log(divs.length); // 2
|
|
105
|
+
*
|
|
106
|
+
* // Parse with td context
|
|
107
|
+
* const tdSoup = Soup.parseFragment("<td>Cell</td>", "tr");
|
|
108
|
+
* ```
|
|
109
|
+
*/
|
|
110
|
+
static parseFragment(html: string, context?: string | null, config?: SoupConfig | null): Soup;
|
|
47
111
|
/**
|
|
48
112
|
* Finds all elements matching a CSS selector (alias for findAll).
|
|
49
113
|
*
|
|
@@ -51,6 +115,44 @@ export class Soup {
|
|
|
51
115
|
* @returns Array of matching Tag instances
|
|
52
116
|
*/
|
|
53
117
|
select(selector: string): Tag[];
|
|
118
|
+
/**
|
|
119
|
+
* Extract attribute values from all elements matching a selector.
|
|
120
|
+
*
|
|
121
|
+
* @param selector - CSS selector string
|
|
122
|
+
* @param attr - Attribute name to extract
|
|
123
|
+
* @returns Array of attribute values (undefined if attribute is missing)
|
|
124
|
+
* @throws Error if the selector syntax is invalid
|
|
125
|
+
*
|
|
126
|
+
* @example
|
|
127
|
+
* ```javascript
|
|
128
|
+
* const soup = new Soup("<a href='/a'>A</a><a href='/b'>B</a><a>C</a>");
|
|
129
|
+
* const hrefs = soup.selectAttr("a", "href");
|
|
130
|
+
* // hrefs: ["/a", "/b", undefined]
|
|
131
|
+
* ```
|
|
132
|
+
*/
|
|
133
|
+
selectAttr(selector: string, attr: string): any[];
|
|
134
|
+
/**
|
|
135
|
+
* Find all elements matching a compiled selector.
|
|
136
|
+
*
|
|
137
|
+
* @param selector - A compiled CSS selector
|
|
138
|
+
* @returns Array of matching Tag instances
|
|
139
|
+
*/
|
|
140
|
+
selectCompiled(selector: CompiledSelector): Tag[];
|
|
141
|
+
/**
|
|
142
|
+
* Extract text content from all elements matching a selector.
|
|
143
|
+
*
|
|
144
|
+
* @param selector - CSS selector string
|
|
145
|
+
* @returns Array of text content strings
|
|
146
|
+
* @throws Error if the selector syntax is invalid
|
|
147
|
+
*
|
|
148
|
+
* @example
|
|
149
|
+
* ```javascript
|
|
150
|
+
* const soup = new Soup("<div>A</div><div>B</div>");
|
|
151
|
+
* const texts = soup.selectText("div");
|
|
152
|
+
* // texts: ["A", "B"]
|
|
153
|
+
* ```
|
|
154
|
+
*/
|
|
155
|
+
selectText(selector: string): string[];
|
|
54
156
|
/**
|
|
55
157
|
* Get the HTML representation of the document.
|
|
56
158
|
*
|
|
@@ -150,6 +252,44 @@ export class Tag {
|
|
|
150
252
|
* @returns The attribute value, or undefined if not present
|
|
151
253
|
*/
|
|
152
254
|
attr(name: string): string | undefined;
|
|
255
|
+
/**
|
|
256
|
+
* Get all direct child elements with a specific class.
|
|
257
|
+
*
|
|
258
|
+
* @param className - The class name to filter by
|
|
259
|
+
* @returns Array of matching child Tag instances
|
|
260
|
+
*
|
|
261
|
+
* @example
|
|
262
|
+
* ```javascript
|
|
263
|
+
* const soup = new Soup("<div><p class='item'>A</p><span>B</span><p class='item'>C</p></div>");
|
|
264
|
+
* const div = soup.find("div");
|
|
265
|
+
* const items = div.childrenByClass("item");
|
|
266
|
+
* // items.length: 2
|
|
267
|
+
* ```
|
|
268
|
+
*/
|
|
269
|
+
childrenByClass(class_name: string): Tag[];
|
|
270
|
+
/**
|
|
271
|
+
* Get all direct child elements with a specific tag name.
|
|
272
|
+
*
|
|
273
|
+
* @param name - The tag name to filter by
|
|
274
|
+
* @returns Array of matching child Tag instances
|
|
275
|
+
*
|
|
276
|
+
* @example
|
|
277
|
+
* ```javascript
|
|
278
|
+
* const soup = new Soup("<div><p>A</p><span>B</span><p>C</p></div>");
|
|
279
|
+
* const div = soup.find("div");
|
|
280
|
+
* const paras = div.childrenByName("p");
|
|
281
|
+
* // paras.length: 2
|
|
282
|
+
* ```
|
|
283
|
+
*/
|
|
284
|
+
childrenByName(name: string): Tag[];
|
|
285
|
+
/**
|
|
286
|
+
* Find the nearest ancestor matching a CSS selector.
|
|
287
|
+
*
|
|
288
|
+
* @param selector - CSS selector string
|
|
289
|
+
* @returns The nearest matching ancestor Tag, or undefined if not found
|
|
290
|
+
* @throws Error if the selector syntax is invalid
|
|
291
|
+
*/
|
|
292
|
+
closest(selector: string): Tag | undefined;
|
|
153
293
|
/**
|
|
154
294
|
* Find the first descendant matching a CSS selector.
|
|
155
295
|
*
|
|
@@ -166,6 +306,13 @@ export class Tag {
|
|
|
166
306
|
* @throws Error if the selector syntax is invalid
|
|
167
307
|
*/
|
|
168
308
|
findAll(selector: string): Tag[];
|
|
309
|
+
/**
|
|
310
|
+
* Find the first descendant matching a compiled selector.
|
|
311
|
+
*
|
|
312
|
+
* @param selector - A compiled CSS selector
|
|
313
|
+
* @returns The first matching Tag, or undefined if not found
|
|
314
|
+
*/
|
|
315
|
+
findCompiled(selector: CompiledSelector): Tag | undefined;
|
|
169
316
|
/**
|
|
170
317
|
* Get an attribute value by name.
|
|
171
318
|
*
|
|
@@ -194,6 +341,50 @@ export class Tag {
|
|
|
194
341
|
* @returns Array of matching Tag instances
|
|
195
342
|
*/
|
|
196
343
|
select(selector: string): Tag[];
|
|
344
|
+
/**
|
|
345
|
+
* Extract attribute values from all descendants matching a selector.
|
|
346
|
+
*
|
|
347
|
+
* @param selector - CSS selector string
|
|
348
|
+
* @param attr - Attribute name to extract
|
|
349
|
+
* @returns Array of attribute values (undefined if attribute is missing)
|
|
350
|
+
* @throws Error if the selector syntax is invalid
|
|
351
|
+
*
|
|
352
|
+
* @example
|
|
353
|
+
* ```javascript
|
|
354
|
+
* const soup = new Soup("<div><a href='/a'>A</a><a href='/b'>B</a></div>");
|
|
355
|
+
* const div = soup.find("div");
|
|
356
|
+
* const hrefs = div.selectAttr("a", "href");
|
|
357
|
+
* // hrefs: ["/a", "/b"]
|
|
358
|
+
* ```
|
|
359
|
+
*/
|
|
360
|
+
selectAttr(selector: string, attr: string): any[];
|
|
361
|
+
/**
|
|
362
|
+
* Find all descendants matching a compiled selector.
|
|
363
|
+
*
|
|
364
|
+
* @param selector - A compiled CSS selector
|
|
365
|
+
* @returns Array of matching Tag instances
|
|
366
|
+
*/
|
|
367
|
+
selectCompiled(selector: CompiledSelector): Tag[];
|
|
368
|
+
/**
|
|
369
|
+
* Extract text content from all descendants matching a selector.
|
|
370
|
+
*
|
|
371
|
+
* @param selector - CSS selector string
|
|
372
|
+
* @returns Array of text content strings
|
|
373
|
+
* @throws Error if the selector syntax is invalid
|
|
374
|
+
*
|
|
375
|
+
* @example
|
|
376
|
+
* ```javascript
|
|
377
|
+
* const soup = new Soup("<div><p>A</p><p>B</p></div>");
|
|
378
|
+
* const div = soup.find("div");
|
|
379
|
+
* const texts = div.selectText("p");
|
|
380
|
+
* // texts: ["A", "B"]
|
|
381
|
+
* ```
|
|
382
|
+
*/
|
|
383
|
+
selectText(selector: string): string[];
|
|
384
|
+
/**
|
|
385
|
+
* Get all ancestor elements (alias for parents).
|
|
386
|
+
*/
|
|
387
|
+
readonly ancestors: Tag[];
|
|
197
388
|
/**
|
|
198
389
|
* Get all attributes as an object.
|
|
199
390
|
*/
|
|
@@ -226,6 +417,10 @@ export class Tag {
|
|
|
226
417
|
* Get the next sibling element.
|
|
227
418
|
*/
|
|
228
419
|
readonly nextSibling: Tag | undefined;
|
|
420
|
+
/**
|
|
421
|
+
* Get all following sibling elements.
|
|
422
|
+
*/
|
|
423
|
+
readonly nextSiblings: Tag[];
|
|
229
424
|
/**
|
|
230
425
|
* Get the outer HTML (including this element's tags).
|
|
231
426
|
*/
|
|
@@ -234,14 +429,40 @@ export class Tag {
|
|
|
234
429
|
* Get the parent element.
|
|
235
430
|
*/
|
|
236
431
|
readonly parent: Tag | undefined;
|
|
432
|
+
/**
|
|
433
|
+
* Get all ancestor elements (from parent toward root).
|
|
434
|
+
*/
|
|
435
|
+
readonly parents: Tag[];
|
|
237
436
|
/**
|
|
238
437
|
* Get the previous sibling element.
|
|
239
438
|
*/
|
|
240
439
|
readonly prevSibling: Tag | undefined;
|
|
440
|
+
/**
|
|
441
|
+
* Get all preceding sibling elements (in reverse order).
|
|
442
|
+
*/
|
|
443
|
+
readonly prevSiblings: Tag[];
|
|
444
|
+
/**
|
|
445
|
+
* Get all sibling elements (excluding self, in document order).
|
|
446
|
+
*/
|
|
447
|
+
readonly siblings: Tag[];
|
|
241
448
|
/**
|
|
242
449
|
* Get the text content of this element and all descendants.
|
|
243
450
|
*/
|
|
244
451
|
readonly text: string;
|
|
452
|
+
/**
|
|
453
|
+
* Get all direct text nodes (excluding descendants).
|
|
454
|
+
*
|
|
455
|
+
* @returns Array of text content strings
|
|
456
|
+
*
|
|
457
|
+
* @example
|
|
458
|
+
* ```javascript
|
|
459
|
+
* const soup = new Soup("<div>Text1<span>Inner</span>Text2</div>");
|
|
460
|
+
* const div = soup.find("div");
|
|
461
|
+
* const texts = div.textNodes;
|
|
462
|
+
* // texts: ["Text1", "Text2"]
|
|
463
|
+
* ```
|
|
464
|
+
*/
|
|
465
|
+
readonly textNodes: string[];
|
|
245
466
|
}
|
|
246
467
|
|
|
247
468
|
/**
|
|
@@ -292,16 +513,24 @@ export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembl
|
|
|
292
513
|
|
|
293
514
|
export interface InitOutput {
|
|
294
515
|
readonly memory: WebAssembly.Memory;
|
|
516
|
+
readonly __wbg_compiledselector_free: (a: number, b: number) => void;
|
|
295
517
|
readonly __wbg_soup_free: (a: number, b: number) => void;
|
|
296
518
|
readonly __wbg_soupconfig_free: (a: number, b: number) => void;
|
|
297
519
|
readonly __wbg_tag_free: (a: number, b: number) => void;
|
|
520
|
+
readonly compiledselector_compile: (a: number, b: number, c: number) => void;
|
|
521
|
+
readonly compiledselector_source: (a: number, b: number) => void;
|
|
298
522
|
readonly hasSimdSupport: () => number;
|
|
299
523
|
readonly parseBatch: (a: number, b: number, c: number) => void;
|
|
300
524
|
readonly soup_find: (a: number, b: number, c: number, d: number) => void;
|
|
301
525
|
readonly soup_findAll: (a: number, b: number, c: number, d: number) => void;
|
|
526
|
+
readonly soup_findCompiled: (a: number, b: number) => number;
|
|
302
527
|
readonly soup_length: (a: number) => number;
|
|
303
528
|
readonly soup_new: (a: number, b: number, c: number) => number;
|
|
529
|
+
readonly soup_parseFragment: (a: number, b: number, c: number, d: number, e: number) => number;
|
|
304
530
|
readonly soup_root: (a: number) => number;
|
|
531
|
+
readonly soup_selectAttr: (a: number, b: number, c: number, d: number, e: number, f: number) => void;
|
|
532
|
+
readonly soup_selectCompiled: (a: number, b: number, c: number) => void;
|
|
533
|
+
readonly soup_selectText: (a: number, b: number, c: number, d: number) => void;
|
|
305
534
|
readonly soup_text: (a: number, b: number) => void;
|
|
306
535
|
readonly soup_title: (a: number, b: number) => void;
|
|
307
536
|
readonly soup_toHtml: (a: number, b: number) => void;
|
|
@@ -314,28 +543,41 @@ export interface InitOutput {
|
|
|
314
543
|
readonly soupconfig_set_preserveWhitespace: (a: number, b: number) => void;
|
|
315
544
|
readonly soupconfig_set_strictMode: (a: number, b: number) => void;
|
|
316
545
|
readonly soupconfig_strictMode: (a: number) => number;
|
|
546
|
+
readonly tag_ancestors: (a: number, b: number) => void;
|
|
317
547
|
readonly tag_attr: (a: number, b: number, c: number, d: number) => void;
|
|
318
548
|
readonly tag_attrs: (a: number) => number;
|
|
319
549
|
readonly tag_children: (a: number, b: number) => void;
|
|
550
|
+
readonly tag_childrenByClass: (a: number, b: number, c: number, d: number) => void;
|
|
551
|
+
readonly tag_childrenByName: (a: number, b: number, c: number, d: number) => void;
|
|
320
552
|
readonly tag_classes: (a: number, b: number) => void;
|
|
553
|
+
readonly tag_closest: (a: number, b: number, c: number, d: number) => void;
|
|
321
554
|
readonly tag_descendants: (a: number, b: number) => void;
|
|
322
555
|
readonly tag_find: (a: number, b: number, c: number, d: number) => void;
|
|
323
556
|
readonly tag_findAll: (a: number, b: number, c: number, d: number) => void;
|
|
324
|
-
readonly
|
|
557
|
+
readonly tag_findCompiled: (a: number, b: number) => number;
|
|
325
558
|
readonly tag_hasAttr: (a: number, b: number, c: number) => number;
|
|
326
559
|
readonly tag_hasClass: (a: number, b: number, c: number) => number;
|
|
327
560
|
readonly tag_innerHTML: (a: number, b: number) => void;
|
|
328
561
|
readonly tag_length: (a: number) => number;
|
|
329
562
|
readonly tag_name: (a: number, b: number) => void;
|
|
330
563
|
readonly tag_nextSibling: (a: number) => number;
|
|
564
|
+
readonly tag_nextSiblings: (a: number, b: number) => void;
|
|
331
565
|
readonly tag_outerHTML: (a: number, b: number) => void;
|
|
332
566
|
readonly tag_parent: (a: number) => number;
|
|
333
567
|
readonly tag_prevSibling: (a: number) => number;
|
|
568
|
+
readonly tag_prevSiblings: (a: number, b: number) => void;
|
|
569
|
+
readonly tag_selectAttr: (a: number, b: number, c: number, d: number, e: number, f: number) => void;
|
|
570
|
+
readonly tag_selectCompiled: (a: number, b: number, c: number) => void;
|
|
571
|
+
readonly tag_selectText: (a: number, b: number, c: number, d: number) => void;
|
|
572
|
+
readonly tag_siblings: (a: number, b: number) => void;
|
|
334
573
|
readonly tag_text: (a: number, b: number) => void;
|
|
574
|
+
readonly tag_textNodes: (a: number, b: number) => void;
|
|
335
575
|
readonly version: (a: number) => void;
|
|
336
576
|
readonly init: () => void;
|
|
337
577
|
readonly soup_select: (a: number, b: number, c: number, d: number) => void;
|
|
338
578
|
readonly tag_select: (a: number, b: number, c: number, d: number) => void;
|
|
579
|
+
readonly tag_parents: (a: number, b: number) => void;
|
|
580
|
+
readonly tag_get: (a: number, b: number, c: number, d: number) => void;
|
|
339
581
|
readonly __wbindgen_export: (a: number, b: number) => number;
|
|
340
582
|
readonly __wbindgen_export2: (a: number, b: number, c: number, d: number) => number;
|
|
341
583
|
readonly __wbindgen_export3: (a: number, b: number, c: number) => void;
|
package/scrape_wasm.js
CHANGED
|
@@ -1,5 +1,91 @@
|
|
|
1
1
|
/* @ts-self-types="./scrape_wasm.d.ts" */
|
|
2
2
|
|
|
3
|
+
/**
|
|
4
|
+
* A pre-compiled CSS selector for efficient repeated matching.
|
|
5
|
+
*
|
|
6
|
+
* Compiled selectors avoid the overhead of parsing the selector string on each query.
|
|
7
|
+
*
|
|
8
|
+
* @example
|
|
9
|
+
* ```javascript
|
|
10
|
+
* import init, { CompiledSelector, Soup } from '@scrape-rs/wasm';
|
|
11
|
+
*
|
|
12
|
+
* await init();
|
|
13
|
+
*
|
|
14
|
+
* const selector = CompiledSelector.compile("div.item");
|
|
15
|
+
* const soup = new Soup("<div class='item'>A</div><div class='item'>B</div>");
|
|
16
|
+
* const items = soup.selectCompiled(selector);
|
|
17
|
+
* console.log(items.length); // 2
|
|
18
|
+
* ```
|
|
19
|
+
*/
|
|
20
|
+
export class CompiledSelector {
|
|
21
|
+
static __wrap(ptr) {
|
|
22
|
+
ptr = ptr >>> 0;
|
|
23
|
+
const obj = Object.create(CompiledSelector.prototype);
|
|
24
|
+
obj.__wbg_ptr = ptr;
|
|
25
|
+
CompiledSelectorFinalization.register(obj, obj.__wbg_ptr, obj);
|
|
26
|
+
return obj;
|
|
27
|
+
}
|
|
28
|
+
__destroy_into_raw() {
|
|
29
|
+
const ptr = this.__wbg_ptr;
|
|
30
|
+
this.__wbg_ptr = 0;
|
|
31
|
+
CompiledSelectorFinalization.unregister(this);
|
|
32
|
+
return ptr;
|
|
33
|
+
}
|
|
34
|
+
free() {
|
|
35
|
+
const ptr = this.__destroy_into_raw();
|
|
36
|
+
wasm.__wbg_compiledselector_free(ptr, 0);
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Compile a CSS selector string.
|
|
40
|
+
*
|
|
41
|
+
* @param selector - The CSS selector to compile
|
|
42
|
+
* @returns A compiled selector
|
|
43
|
+
* @throws Error if the selector syntax is invalid
|
|
44
|
+
* @param {string} selector
|
|
45
|
+
* @returns {CompiledSelector}
|
|
46
|
+
*/
|
|
47
|
+
static compile(selector) {
|
|
48
|
+
try {
|
|
49
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
50
|
+
const ptr0 = passStringToWasm0(selector, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
51
|
+
const len0 = WASM_VECTOR_LEN;
|
|
52
|
+
wasm.compiledselector_compile(retptr, ptr0, len0);
|
|
53
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
54
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
55
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
56
|
+
if (r2) {
|
|
57
|
+
throw takeObject(r1);
|
|
58
|
+
}
|
|
59
|
+
return CompiledSelector.__wrap(r0);
|
|
60
|
+
} finally {
|
|
61
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Get the original selector string.
|
|
66
|
+
*
|
|
67
|
+
* @returns The selector string that was compiled
|
|
68
|
+
* @returns {string}
|
|
69
|
+
*/
|
|
70
|
+
get source() {
|
|
71
|
+
let deferred1_0;
|
|
72
|
+
let deferred1_1;
|
|
73
|
+
try {
|
|
74
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
75
|
+
wasm.compiledselector_source(retptr, this.__wbg_ptr);
|
|
76
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
77
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
78
|
+
deferred1_0 = r0;
|
|
79
|
+
deferred1_1 = r1;
|
|
80
|
+
return getStringFromWasm0(r0, r1);
|
|
81
|
+
} finally {
|
|
82
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
83
|
+
wasm.__wbindgen_export3(deferred1_0, deferred1_1, 1);
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
if (Symbol.dispose) CompiledSelector.prototype[Symbol.dispose] = CompiledSelector.prototype.free;
|
|
88
|
+
|
|
3
89
|
/**
|
|
4
90
|
* A parsed HTML document.
|
|
5
91
|
*
|
|
@@ -90,6 +176,19 @@ export class Soup {
|
|
|
90
176
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
91
177
|
}
|
|
92
178
|
}
|
|
179
|
+
/**
|
|
180
|
+
* Find the first element matching a compiled selector.
|
|
181
|
+
*
|
|
182
|
+
* @param selector - A compiled CSS selector
|
|
183
|
+
* @returns The first matching Tag, or undefined if not found
|
|
184
|
+
* @param {CompiledSelector} selector
|
|
185
|
+
* @returns {Tag | undefined}
|
|
186
|
+
*/
|
|
187
|
+
findCompiled(selector) {
|
|
188
|
+
_assertClass(selector, CompiledSelector);
|
|
189
|
+
const ret = wasm.soup_findCompiled(this.__wbg_ptr, selector.__wbg_ptr);
|
|
190
|
+
return ret === 0 ? undefined : Tag.__wrap(ret);
|
|
191
|
+
}
|
|
93
192
|
/**
|
|
94
193
|
* Get the number of nodes in the document.
|
|
95
194
|
* @returns {number}
|
|
@@ -119,6 +218,42 @@ export class Soup {
|
|
|
119
218
|
SoupFinalization.register(this, this.__wbg_ptr, this);
|
|
120
219
|
return this;
|
|
121
220
|
}
|
|
221
|
+
/**
|
|
222
|
+
* Parse an HTML fragment without html/body wrapper.
|
|
223
|
+
*
|
|
224
|
+
* @param html - HTML fragment string to parse
|
|
225
|
+
* @param context - Optional context element name (default: "body")
|
|
226
|
+
* @param config - Optional parsing configuration
|
|
227
|
+
* @returns A new Soup instance containing the fragment
|
|
228
|
+
*
|
|
229
|
+
* @example
|
|
230
|
+
* ```javascript
|
|
231
|
+
* // Parse without wrapper
|
|
232
|
+
* const soup = Soup.parseFragment("<div>A</div><div>B</div>");
|
|
233
|
+
* const divs = soup.findAll("div");
|
|
234
|
+
* console.log(divs.length); // 2
|
|
235
|
+
*
|
|
236
|
+
* // Parse with td context
|
|
237
|
+
* const tdSoup = Soup.parseFragment("<td>Cell</td>", "tr");
|
|
238
|
+
* ```
|
|
239
|
+
* @param {string} html
|
|
240
|
+
* @param {string | null} [context]
|
|
241
|
+
* @param {SoupConfig | null} [config]
|
|
242
|
+
* @returns {Soup}
|
|
243
|
+
*/
|
|
244
|
+
static parseFragment(html, context, config) {
|
|
245
|
+
const ptr0 = passStringToWasm0(html, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
246
|
+
const len0 = WASM_VECTOR_LEN;
|
|
247
|
+
var ptr1 = isLikeNone(context) ? 0 : passStringToWasm0(context, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
248
|
+
var len1 = WASM_VECTOR_LEN;
|
|
249
|
+
let ptr2 = 0;
|
|
250
|
+
if (!isLikeNone(config)) {
|
|
251
|
+
_assertClass(config, SoupConfig);
|
|
252
|
+
ptr2 = config.__destroy_into_raw();
|
|
253
|
+
}
|
|
254
|
+
const ret = wasm.soup_parseFragment(ptr0, len0, ptr1, len1, ptr2);
|
|
255
|
+
return Soup.__wrap(ret);
|
|
256
|
+
}
|
|
122
257
|
/**
|
|
123
258
|
* Get the root element of the document.
|
|
124
259
|
*
|
|
@@ -157,6 +292,104 @@ export class Soup {
|
|
|
157
292
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
158
293
|
}
|
|
159
294
|
}
|
|
295
|
+
/**
|
|
296
|
+
* Extract attribute values from all elements matching a selector.
|
|
297
|
+
*
|
|
298
|
+
* @param selector - CSS selector string
|
|
299
|
+
* @param attr - Attribute name to extract
|
|
300
|
+
* @returns Array of attribute values (undefined if attribute is missing)
|
|
301
|
+
* @throws Error if the selector syntax is invalid
|
|
302
|
+
*
|
|
303
|
+
* @example
|
|
304
|
+
* ```javascript
|
|
305
|
+
* const soup = new Soup("<a href='/a'>A</a><a href='/b'>B</a><a>C</a>");
|
|
306
|
+
* const hrefs = soup.selectAttr("a", "href");
|
|
307
|
+
* // hrefs: ["/a", "/b", undefined]
|
|
308
|
+
* ```
|
|
309
|
+
* @param {string} selector
|
|
310
|
+
* @param {string} attr
|
|
311
|
+
* @returns {any[]}
|
|
312
|
+
*/
|
|
313
|
+
selectAttr(selector, attr) {
|
|
314
|
+
try {
|
|
315
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
316
|
+
const ptr0 = passStringToWasm0(selector, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
317
|
+
const len0 = WASM_VECTOR_LEN;
|
|
318
|
+
const ptr1 = passStringToWasm0(attr, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
319
|
+
const len1 = WASM_VECTOR_LEN;
|
|
320
|
+
wasm.soup_selectAttr(retptr, this.__wbg_ptr, ptr0, len0, ptr1, len1);
|
|
321
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
322
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
323
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
324
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
325
|
+
if (r3) {
|
|
326
|
+
throw takeObject(r2);
|
|
327
|
+
}
|
|
328
|
+
var v3 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
329
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
330
|
+
return v3;
|
|
331
|
+
} finally {
|
|
332
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
/**
|
|
336
|
+
* Find all elements matching a compiled selector.
|
|
337
|
+
*
|
|
338
|
+
* @param selector - A compiled CSS selector
|
|
339
|
+
* @returns Array of matching Tag instances
|
|
340
|
+
* @param {CompiledSelector} selector
|
|
341
|
+
* @returns {Tag[]}
|
|
342
|
+
*/
|
|
343
|
+
selectCompiled(selector) {
|
|
344
|
+
try {
|
|
345
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
346
|
+
_assertClass(selector, CompiledSelector);
|
|
347
|
+
wasm.soup_selectCompiled(retptr, this.__wbg_ptr, selector.__wbg_ptr);
|
|
348
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
349
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
350
|
+
var v1 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
351
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
352
|
+
return v1;
|
|
353
|
+
} finally {
|
|
354
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
/**
|
|
358
|
+
* Extract text content from all elements matching a selector.
|
|
359
|
+
*
|
|
360
|
+
* @param selector - CSS selector string
|
|
361
|
+
* @returns Array of text content strings
|
|
362
|
+
* @throws Error if the selector syntax is invalid
|
|
363
|
+
*
|
|
364
|
+
* @example
|
|
365
|
+
* ```javascript
|
|
366
|
+
* const soup = new Soup("<div>A</div><div>B</div>");
|
|
367
|
+
* const texts = soup.selectText("div");
|
|
368
|
+
* // texts: ["A", "B"]
|
|
369
|
+
* ```
|
|
370
|
+
* @param {string} selector
|
|
371
|
+
* @returns {string[]}
|
|
372
|
+
*/
|
|
373
|
+
selectText(selector) {
|
|
374
|
+
try {
|
|
375
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
376
|
+
const ptr0 = passStringToWasm0(selector, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
377
|
+
const len0 = WASM_VECTOR_LEN;
|
|
378
|
+
wasm.soup_selectText(retptr, this.__wbg_ptr, ptr0, len0);
|
|
379
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
380
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
381
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
382
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
383
|
+
if (r3) {
|
|
384
|
+
throw takeObject(r2);
|
|
385
|
+
}
|
|
386
|
+
var v2 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
387
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
388
|
+
return v2;
|
|
389
|
+
} finally {
|
|
390
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
391
|
+
}
|
|
392
|
+
}
|
|
160
393
|
/**
|
|
161
394
|
* Get the text content of the entire document.
|
|
162
395
|
*
|
|
@@ -360,6 +593,23 @@ export class Tag {
|
|
|
360
593
|
const ptr = this.__destroy_into_raw();
|
|
361
594
|
wasm.__wbg_tag_free(ptr, 0);
|
|
362
595
|
}
|
|
596
|
+
/**
|
|
597
|
+
* Get all ancestor elements (alias for parents).
|
|
598
|
+
* @returns {Tag[]}
|
|
599
|
+
*/
|
|
600
|
+
get ancestors() {
|
|
601
|
+
try {
|
|
602
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
603
|
+
wasm.tag_ancestors(retptr, this.__wbg_ptr);
|
|
604
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
605
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
606
|
+
var v1 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
607
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
608
|
+
return v1;
|
|
609
|
+
} finally {
|
|
610
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
611
|
+
}
|
|
612
|
+
}
|
|
363
613
|
/**
|
|
364
614
|
* Get an attribute value by name (alias for get).
|
|
365
615
|
*
|
|
@@ -411,6 +661,68 @@ export class Tag {
|
|
|
411
661
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
412
662
|
}
|
|
413
663
|
}
|
|
664
|
+
/**
|
|
665
|
+
* Get all direct child elements with a specific class.
|
|
666
|
+
*
|
|
667
|
+
* @param className - The class name to filter by
|
|
668
|
+
* @returns Array of matching child Tag instances
|
|
669
|
+
*
|
|
670
|
+
* @example
|
|
671
|
+
* ```javascript
|
|
672
|
+
* const soup = new Soup("<div><p class='item'>A</p><span>B</span><p class='item'>C</p></div>");
|
|
673
|
+
* const div = soup.find("div");
|
|
674
|
+
* const items = div.childrenByClass("item");
|
|
675
|
+
* // items.length: 2
|
|
676
|
+
* ```
|
|
677
|
+
* @param {string} class_name
|
|
678
|
+
* @returns {Tag[]}
|
|
679
|
+
*/
|
|
680
|
+
childrenByClass(class_name) {
|
|
681
|
+
try {
|
|
682
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
683
|
+
const ptr0 = passStringToWasm0(class_name, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
684
|
+
const len0 = WASM_VECTOR_LEN;
|
|
685
|
+
wasm.tag_childrenByClass(retptr, this.__wbg_ptr, ptr0, len0);
|
|
686
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
687
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
688
|
+
var v2 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
689
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
690
|
+
return v2;
|
|
691
|
+
} finally {
|
|
692
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
/**
|
|
696
|
+
* Get all direct child elements with a specific tag name.
|
|
697
|
+
*
|
|
698
|
+
* @param name - The tag name to filter by
|
|
699
|
+
* @returns Array of matching child Tag instances
|
|
700
|
+
*
|
|
701
|
+
* @example
|
|
702
|
+
* ```javascript
|
|
703
|
+
* const soup = new Soup("<div><p>A</p><span>B</span><p>C</p></div>");
|
|
704
|
+
* const div = soup.find("div");
|
|
705
|
+
* const paras = div.childrenByName("p");
|
|
706
|
+
* // paras.length: 2
|
|
707
|
+
* ```
|
|
708
|
+
* @param {string} name
|
|
709
|
+
* @returns {Tag[]}
|
|
710
|
+
*/
|
|
711
|
+
childrenByName(name) {
|
|
712
|
+
try {
|
|
713
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
714
|
+
const ptr0 = passStringToWasm0(name, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
715
|
+
const len0 = WASM_VECTOR_LEN;
|
|
716
|
+
wasm.tag_childrenByName(retptr, this.__wbg_ptr, ptr0, len0);
|
|
717
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
718
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
719
|
+
var v2 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
720
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
721
|
+
return v2;
|
|
722
|
+
} finally {
|
|
723
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
724
|
+
}
|
|
725
|
+
}
|
|
414
726
|
/**
|
|
415
727
|
* Get all classes as an array.
|
|
416
728
|
* @returns {string[]}
|
|
@@ -428,6 +740,32 @@ export class Tag {
|
|
|
428
740
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
429
741
|
}
|
|
430
742
|
}
|
|
743
|
+
/**
|
|
744
|
+
* Find the nearest ancestor matching a CSS selector.
|
|
745
|
+
*
|
|
746
|
+
* @param selector - CSS selector string
|
|
747
|
+
* @returns The nearest matching ancestor Tag, or undefined if not found
|
|
748
|
+
* @throws Error if the selector syntax is invalid
|
|
749
|
+
* @param {string} selector
|
|
750
|
+
* @returns {Tag | undefined}
|
|
751
|
+
*/
|
|
752
|
+
closest(selector) {
|
|
753
|
+
try {
|
|
754
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
755
|
+
const ptr0 = passStringToWasm0(selector, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
756
|
+
const len0 = WASM_VECTOR_LEN;
|
|
757
|
+
wasm.tag_closest(retptr, this.__wbg_ptr, ptr0, len0);
|
|
758
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
759
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
760
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
761
|
+
if (r2) {
|
|
762
|
+
throw takeObject(r1);
|
|
763
|
+
}
|
|
764
|
+
return r0 === 0 ? undefined : Tag.__wrap(r0);
|
|
765
|
+
} finally {
|
|
766
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
767
|
+
}
|
|
768
|
+
}
|
|
431
769
|
/**
|
|
432
770
|
* Get all descendant elements.
|
|
433
771
|
* @returns {Tag[]}
|
|
@@ -500,6 +838,19 @@ export class Tag {
|
|
|
500
838
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
501
839
|
}
|
|
502
840
|
}
|
|
841
|
+
/**
|
|
842
|
+
* Find the first descendant matching a compiled selector.
|
|
843
|
+
*
|
|
844
|
+
* @param selector - A compiled CSS selector
|
|
845
|
+
* @returns The first matching Tag, or undefined if not found
|
|
846
|
+
* @param {CompiledSelector} selector
|
|
847
|
+
* @returns {Tag | undefined}
|
|
848
|
+
*/
|
|
849
|
+
findCompiled(selector) {
|
|
850
|
+
_assertClass(selector, CompiledSelector);
|
|
851
|
+
const ret = wasm.tag_findCompiled(this.__wbg_ptr, selector.__wbg_ptr);
|
|
852
|
+
return ret === 0 ? undefined : Tag.__wrap(ret);
|
|
853
|
+
}
|
|
503
854
|
/**
|
|
504
855
|
* Get an attribute value by name.
|
|
505
856
|
*
|
|
@@ -513,7 +864,7 @@ export class Tag {
|
|
|
513
864
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
514
865
|
const ptr0 = passStringToWasm0(name, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
515
866
|
const len0 = WASM_VECTOR_LEN;
|
|
516
|
-
wasm.
|
|
867
|
+
wasm.tag_attr(retptr, this.__wbg_ptr, ptr0, len0);
|
|
517
868
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
518
869
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
519
870
|
let v2;
|
|
@@ -610,6 +961,23 @@ export class Tag {
|
|
|
610
961
|
const ret = wasm.tag_nextSibling(this.__wbg_ptr);
|
|
611
962
|
return ret === 0 ? undefined : Tag.__wrap(ret);
|
|
612
963
|
}
|
|
964
|
+
/**
|
|
965
|
+
* Get all following sibling elements.
|
|
966
|
+
* @returns {Tag[]}
|
|
967
|
+
*/
|
|
968
|
+
get nextSiblings() {
|
|
969
|
+
try {
|
|
970
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
971
|
+
wasm.tag_nextSiblings(retptr, this.__wbg_ptr);
|
|
972
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
973
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
974
|
+
var v1 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
975
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
976
|
+
return v1;
|
|
977
|
+
} finally {
|
|
978
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
979
|
+
}
|
|
980
|
+
}
|
|
613
981
|
/**
|
|
614
982
|
* Get the outer HTML (including this element's tags).
|
|
615
983
|
* @returns {string}
|
|
@@ -638,6 +1006,23 @@ export class Tag {
|
|
|
638
1006
|
const ret = wasm.tag_parent(this.__wbg_ptr);
|
|
639
1007
|
return ret === 0 ? undefined : Tag.__wrap(ret);
|
|
640
1008
|
}
|
|
1009
|
+
/**
|
|
1010
|
+
* Get all ancestor elements (from parent toward root).
|
|
1011
|
+
* @returns {Tag[]}
|
|
1012
|
+
*/
|
|
1013
|
+
get parents() {
|
|
1014
|
+
try {
|
|
1015
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
1016
|
+
wasm.tag_ancestors(retptr, this.__wbg_ptr);
|
|
1017
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
1018
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
1019
|
+
var v1 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
1020
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
1021
|
+
return v1;
|
|
1022
|
+
} finally {
|
|
1023
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
1024
|
+
}
|
|
1025
|
+
}
|
|
641
1026
|
/**
|
|
642
1027
|
* Get the previous sibling element.
|
|
643
1028
|
* @returns {Tag | undefined}
|
|
@@ -646,6 +1031,23 @@ export class Tag {
|
|
|
646
1031
|
const ret = wasm.tag_prevSibling(this.__wbg_ptr);
|
|
647
1032
|
return ret === 0 ? undefined : Tag.__wrap(ret);
|
|
648
1033
|
}
|
|
1034
|
+
/**
|
|
1035
|
+
* Get all preceding sibling elements (in reverse order).
|
|
1036
|
+
* @returns {Tag[]}
|
|
1037
|
+
*/
|
|
1038
|
+
get prevSiblings() {
|
|
1039
|
+
try {
|
|
1040
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
1041
|
+
wasm.tag_prevSiblings(retptr, this.__wbg_ptr);
|
|
1042
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
1043
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
1044
|
+
var v1 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
1045
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
1046
|
+
return v1;
|
|
1047
|
+
} finally {
|
|
1048
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
1049
|
+
}
|
|
1050
|
+
}
|
|
649
1051
|
/**
|
|
650
1052
|
* Find all descendants matching a CSS selector (alias for findAll).
|
|
651
1053
|
*
|
|
@@ -674,6 +1076,123 @@ export class Tag {
|
|
|
674
1076
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
675
1077
|
}
|
|
676
1078
|
}
|
|
1079
|
+
/**
|
|
1080
|
+
* Extract attribute values from all descendants matching a selector.
|
|
1081
|
+
*
|
|
1082
|
+
* @param selector - CSS selector string
|
|
1083
|
+
* @param attr - Attribute name to extract
|
|
1084
|
+
* @returns Array of attribute values (undefined if attribute is missing)
|
|
1085
|
+
* @throws Error if the selector syntax is invalid
|
|
1086
|
+
*
|
|
1087
|
+
* @example
|
|
1088
|
+
* ```javascript
|
|
1089
|
+
* const soup = new Soup("<div><a href='/a'>A</a><a href='/b'>B</a></div>");
|
|
1090
|
+
* const div = soup.find("div");
|
|
1091
|
+
* const hrefs = div.selectAttr("a", "href");
|
|
1092
|
+
* // hrefs: ["/a", "/b"]
|
|
1093
|
+
* ```
|
|
1094
|
+
* @param {string} selector
|
|
1095
|
+
* @param {string} attr
|
|
1096
|
+
* @returns {any[]}
|
|
1097
|
+
*/
|
|
1098
|
+
selectAttr(selector, attr) {
|
|
1099
|
+
try {
|
|
1100
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
1101
|
+
const ptr0 = passStringToWasm0(selector, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
1102
|
+
const len0 = WASM_VECTOR_LEN;
|
|
1103
|
+
const ptr1 = passStringToWasm0(attr, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
1104
|
+
const len1 = WASM_VECTOR_LEN;
|
|
1105
|
+
wasm.tag_selectAttr(retptr, this.__wbg_ptr, ptr0, len0, ptr1, len1);
|
|
1106
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
1107
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
1108
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
1109
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
1110
|
+
if (r3) {
|
|
1111
|
+
throw takeObject(r2);
|
|
1112
|
+
}
|
|
1113
|
+
var v3 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
1114
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
1115
|
+
return v3;
|
|
1116
|
+
} finally {
|
|
1117
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
1118
|
+
}
|
|
1119
|
+
}
|
|
1120
|
+
/**
|
|
1121
|
+
* Find all descendants matching a compiled selector.
|
|
1122
|
+
*
|
|
1123
|
+
* @param selector - A compiled CSS selector
|
|
1124
|
+
* @returns Array of matching Tag instances
|
|
1125
|
+
* @param {CompiledSelector} selector
|
|
1126
|
+
* @returns {Tag[]}
|
|
1127
|
+
*/
|
|
1128
|
+
selectCompiled(selector) {
|
|
1129
|
+
try {
|
|
1130
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
1131
|
+
_assertClass(selector, CompiledSelector);
|
|
1132
|
+
wasm.tag_selectCompiled(retptr, this.__wbg_ptr, selector.__wbg_ptr);
|
|
1133
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
1134
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
1135
|
+
var v1 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
1136
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
1137
|
+
return v1;
|
|
1138
|
+
} finally {
|
|
1139
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
1140
|
+
}
|
|
1141
|
+
}
|
|
1142
|
+
/**
|
|
1143
|
+
* Extract text content from all descendants matching a selector.
|
|
1144
|
+
*
|
|
1145
|
+
* @param selector - CSS selector string
|
|
1146
|
+
* @returns Array of text content strings
|
|
1147
|
+
* @throws Error if the selector syntax is invalid
|
|
1148
|
+
*
|
|
1149
|
+
* @example
|
|
1150
|
+
* ```javascript
|
|
1151
|
+
* const soup = new Soup("<div><p>A</p><p>B</p></div>");
|
|
1152
|
+
* const div = soup.find("div");
|
|
1153
|
+
* const texts = div.selectText("p");
|
|
1154
|
+
* // texts: ["A", "B"]
|
|
1155
|
+
* ```
|
|
1156
|
+
* @param {string} selector
|
|
1157
|
+
* @returns {string[]}
|
|
1158
|
+
*/
|
|
1159
|
+
selectText(selector) {
|
|
1160
|
+
try {
|
|
1161
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
1162
|
+
const ptr0 = passStringToWasm0(selector, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
1163
|
+
const len0 = WASM_VECTOR_LEN;
|
|
1164
|
+
wasm.tag_selectText(retptr, this.__wbg_ptr, ptr0, len0);
|
|
1165
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
1166
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
1167
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
1168
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
1169
|
+
if (r3) {
|
|
1170
|
+
throw takeObject(r2);
|
|
1171
|
+
}
|
|
1172
|
+
var v2 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
1173
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
1174
|
+
return v2;
|
|
1175
|
+
} finally {
|
|
1176
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
1177
|
+
}
|
|
1178
|
+
}
|
|
1179
|
+
/**
|
|
1180
|
+
* Get all sibling elements (excluding self, in document order).
|
|
1181
|
+
* @returns {Tag[]}
|
|
1182
|
+
*/
|
|
1183
|
+
get siblings() {
|
|
1184
|
+
try {
|
|
1185
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
1186
|
+
wasm.tag_siblings(retptr, this.__wbg_ptr);
|
|
1187
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
1188
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
1189
|
+
var v1 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
1190
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
1191
|
+
return v1;
|
|
1192
|
+
} finally {
|
|
1193
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
1194
|
+
}
|
|
1195
|
+
}
|
|
677
1196
|
/**
|
|
678
1197
|
* Get the text content of this element and all descendants.
|
|
679
1198
|
* @returns {string}
|
|
@@ -694,6 +1213,33 @@ export class Tag {
|
|
|
694
1213
|
wasm.__wbindgen_export3(deferred1_0, deferred1_1, 1);
|
|
695
1214
|
}
|
|
696
1215
|
}
|
|
1216
|
+
/**
|
|
1217
|
+
* Get all direct text nodes (excluding descendants).
|
|
1218
|
+
*
|
|
1219
|
+
* @returns Array of text content strings
|
|
1220
|
+
*
|
|
1221
|
+
* @example
|
|
1222
|
+
* ```javascript
|
|
1223
|
+
* const soup = new Soup("<div>Text1<span>Inner</span>Text2</div>");
|
|
1224
|
+
* const div = soup.find("div");
|
|
1225
|
+
* const texts = div.textNodes;
|
|
1226
|
+
* // texts: ["Text1", "Text2"]
|
|
1227
|
+
* ```
|
|
1228
|
+
* @returns {string[]}
|
|
1229
|
+
*/
|
|
1230
|
+
get textNodes() {
|
|
1231
|
+
try {
|
|
1232
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
1233
|
+
wasm.tag_textNodes(retptr, this.__wbg_ptr);
|
|
1234
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
1235
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
1236
|
+
var v1 = getArrayJsValueFromWasm0(r0, r1).slice();
|
|
1237
|
+
wasm.__wbindgen_export3(r0, r1 * 4, 4);
|
|
1238
|
+
return v1;
|
|
1239
|
+
} finally {
|
|
1240
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
1241
|
+
}
|
|
1242
|
+
}
|
|
697
1243
|
}
|
|
698
1244
|
if (Symbol.dispose) Tag.prototype[Symbol.dispose] = Tag.prototype.free;
|
|
699
1245
|
|
|
@@ -850,6 +1396,9 @@ function __wbg_get_imports() {
|
|
|
850
1396
|
};
|
|
851
1397
|
}
|
|
852
1398
|
|
|
1399
|
+
const CompiledSelectorFinalization = (typeof FinalizationRegistry === 'undefined')
|
|
1400
|
+
? { register: () => {}, unregister: () => {} }
|
|
1401
|
+
: new FinalizationRegistry(ptr => wasm.__wbg_compiledselector_free(ptr >>> 0, 1));
|
|
853
1402
|
const SoupFinalization = (typeof FinalizationRegistry === 'undefined')
|
|
854
1403
|
? { register: () => {}, unregister: () => {} }
|
|
855
1404
|
: new FinalizationRegistry(ptr => wasm.__wbg_soup_free(ptr >>> 0, 1));
|
package/scrape_wasm_bg.wasm
CHANGED
|
Binary file
|