@fast-scrape/wasm 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -63
- package/package.json +1 -1
- package/scrape_wasm.d.ts +23 -23
- package/scrape_wasm.js +2 -2
- package/scrape_wasm_bg.wasm +0 -0
package/README.md
CHANGED
|
@@ -1,54 +1,50 @@
|
|
|
1
|
-
# @scrape-rs/wasm
|
|
1
|
+
# @fast-scrape/wasm
|
|
2
2
|
|
|
3
|
-
[npm](https://www.npmjs.com/package/@fast-scrape/wasm)
|
|
4
|
+
[Bundle size](https://bundlephobia.com/package/@fast-scrape/wasm)
|
|
5
5
|
[TypeScript](https://www.typescriptlang.org/)
|
|
6
|
-
[License](../../LICENSE-MIT)
|
|
6
|
+
[License](../../LICENSE-MIT)
|
|
8
7
|
|
|
9
|
-
|
|
8
|
+
**10-50x faster** HTML parsing in the browser. Native-speed parsing via WebAssembly.
|
|
10
9
|
|
|
11
10
|
## Installation
|
|
12
11
|
|
|
13
12
|
```bash
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
# yarn
|
|
18
|
-
yarn add @scrape-rs/wasm
|
|
13
|
+
npm install @fast-scrape/wasm
|
|
14
|
+
```
|
|
19
15
|
|
|
20
|
-
|
|
21
|
-
|
|
16
|
+
<details>
|
|
17
|
+
<summary>Other package managers</summary>
|
|
22
18
|
|
|
23
|
-
|
|
24
|
-
|
|
19
|
+
```bash
|
|
20
|
+
yarn add @fast-scrape/wasm
|
|
21
|
+
pnpm add @fast-scrape/wasm
|
|
22
|
+
bun add @fast-scrape/wasm
|
|
25
23
|
```
|
|
26
24
|
|
|
25
|
+
</details>
|
|
26
|
+
|
|
27
27
|
## Quick start
|
|
28
28
|
|
|
29
29
|
```typescript
|
|
30
|
-
import init, { Soup } from '@scrape-rs/wasm';
|
|
30
|
+
import init, { Soup } from '@fast-scrape/wasm';
|
|
31
31
|
|
|
32
|
-
// Initialize WASM module (
|
|
33
|
-
await init();
|
|
32
|
+
await init(); // Initialize WASM module (once)
|
|
34
33
|
|
|
35
|
-
const
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
const div = soup.find("div");
|
|
39
|
-
console.log(div.text);
|
|
40
|
-
// Hello, World!
|
|
34
|
+
const soup = new Soup("<html><body><div class='content'>Hello, World!</div></body></html>");
|
|
35
|
+
console.log(soup.find("div").text); // Hello, World!
|
|
41
36
|
```
|
|
42
37
|
|
|
43
38
|
> [!IMPORTANT]
|
|
44
|
-
> Call `init()` once before using any other functions.
|
|
39
|
+
> Call `init()` once before using any other functions.
|
|
45
40
|
|
|
46
41
|
## Usage
|
|
47
42
|
|
|
48
|
-
|
|
43
|
+
<details open>
|
|
44
|
+
<summary><strong>Find elements</strong></summary>
|
|
49
45
|
|
|
50
46
|
```typescript
|
|
51
|
-
import init, { Soup } from '@scrape-rs/wasm';
|
|
47
|
+
import init, { Soup } from '@fast-scrape/wasm';
|
|
52
48
|
|
|
53
49
|
await init();
|
|
54
50
|
|
|
@@ -66,15 +62,16 @@ for (const el of soup.select("div.content > p")) {
|
|
|
66
62
|
}
|
|
67
63
|
```
|
|
68
64
|
|
|
69
|
-
|
|
65
|
+
</details>
|
|
66
|
+
|
|
67
|
+
<details>
|
|
68
|
+
<summary><strong>Bundlers</strong></summary>
|
|
70
69
|
|
|
71
70
|
**Vite:**
|
|
72
71
|
|
|
73
72
|
```typescript
|
|
74
|
-
import init, { Soup } from '@scrape-rs/wasm';
|
|
75
|
-
|
|
76
|
-
// Vite handles WASM automatically
|
|
77
|
-
await init();
|
|
73
|
+
import init, { Soup } from '@fast-scrape/wasm';
|
|
74
|
+
await init(); // Vite handles WASM automatically
|
|
78
75
|
```
|
|
79
76
|
|
|
80
77
|
**Webpack 5:**
|
|
@@ -82,41 +79,32 @@ await init();
|
|
|
82
79
|
```javascript
|
|
83
80
|
// webpack.config.js
|
|
84
81
|
module.exports = {
|
|
85
|
-
experiments: {
|
|
86
|
-
asyncWebAssembly: true,
|
|
87
|
-
},
|
|
82
|
+
experiments: { asyncWebAssembly: true },
|
|
88
83
|
};
|
|
89
84
|
```
|
|
90
85
|
|
|
91
|
-
|
|
86
|
+
</details>
|
|
87
|
+
|
|
88
|
+
<details>
|
|
89
|
+
<summary><strong>CDN usage</strong></summary>
|
|
92
90
|
|
|
93
91
|
```html
|
|
94
92
|
<script type="module">
|
|
95
|
-
import init, { Soup } from 'https://esm.sh/@scrape-rs/wasm';
|
|
93
|
+
import init, { Soup } from 'https://esm.sh/@fast-scrape/wasm';
|
|
96
94
|
|
|
97
95
|
await init();
|
|
98
|
-
|
|
99
96
|
const soup = new Soup('<div>Hello</div>');
|
|
100
97
|
console.log(soup.find('div').text);
|
|
101
98
|
</script>
|
|
102
99
|
```
|
|
103
100
|
|
|
104
|
-
|
|
101
|
+
</details>
|
|
105
102
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
| Minified + gzip | ~150 KB |
|
|
109
|
-
| Minified | ~400 KB |
|
|
110
|
-
|
|
111
|
-
> [!TIP]
|
|
112
|
-
> The WASM module includes SIMD optimizations. Modern browsers (Chrome 91+, Firefox 89+, Safari 16.4+) run SIMD automatically.
|
|
113
|
-
|
|
114
|
-
## TypeScript
|
|
115
|
-
|
|
116
|
-
Full TypeScript support with exported types:
|
|
103
|
+
<details>
|
|
104
|
+
<summary><strong>TypeScript</strong></summary>
|
|
117
105
|
|
|
118
106
|
```typescript
|
|
119
|
-
import init, { Soup, Tag } from '@scrape-rs/wasm';
|
|
107
|
+
import init, { Soup, Tag } from '@fast-scrape/wasm';
|
|
120
108
|
|
|
121
109
|
await init();
|
|
122
110
|
|
|
@@ -125,6 +113,18 @@ function extractLinks(soup: Soup): string[] {
|
|
|
125
113
|
}
|
|
126
114
|
```
|
|
127
115
|
|
|
116
|
+
</details>
|
|
117
|
+
|
|
118
|
+
## Bundle size
|
|
119
|
+
|
|
120
|
+
| Build | Size |
|
|
121
|
+
|-------|------|
|
|
122
|
+
| Minified + gzip | ~150 KB |
|
|
123
|
+
| Minified | ~400 KB |
|
|
124
|
+
|
|
125
|
+
> [!TIP]
|
|
126
|
+
> SIMD enabled automatically on Chrome 91+, Firefox 89+, Safari 16.4+.
|
|
127
|
+
|
|
128
128
|
## Browser support
|
|
129
129
|
|
|
130
130
|
| Browser | Version | SIMD |
|
|
@@ -134,20 +134,14 @@ function extractLinks(soup: Soup): string[] {
|
|
|
134
134
|
| Safari | 13+ | 16.4+ |
|
|
135
135
|
| Edge | 80+ | 91+ |
|
|
136
136
|
|
|
137
|
-
## Limitations
|
|
138
|
-
|
|
139
|
-
- No parallel processing (WASM threads have limited browser support)
|
|
140
|
-
- Must call `init()` before using the API
|
|
141
|
-
- Slightly higher memory usage than native bindings
|
|
142
|
-
|
|
143
137
|
## Related packages
|
|
144
138
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
-
|
|
139
|
+
| Platform | Package |
|
|
140
|
+
|----------|---------|
|
|
141
|
+
| Rust | [`scrape-core`](https://crates.io/crates/scrape-core) |
|
|
142
|
+
| Python | [`fast-scrape`](https://pypi.org/project/fast-scrape) |
|
|
143
|
+
| Node.js | [`@fast-scrape/node`](https://www.npmjs.com/package/@fast-scrape/node) |
|
|
150
144
|
|
|
151
145
|
## License
|
|
152
146
|
|
|
153
|
-
|
|
147
|
+
MIT OR Apache-2.0
|
package/package.json
CHANGED
package/scrape_wasm.d.ts
CHANGED
|
@@ -292,7 +292,28 @@ export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembl
|
|
|
292
292
|
|
|
293
293
|
export interface InitOutput {
|
|
294
294
|
readonly memory: WebAssembly.Memory;
|
|
295
|
+
readonly __wbg_soup_free: (a: number, b: number) => void;
|
|
296
|
+
readonly __wbg_soupconfig_free: (a: number, b: number) => void;
|
|
295
297
|
readonly __wbg_tag_free: (a: number, b: number) => void;
|
|
298
|
+
readonly hasSimdSupport: () => number;
|
|
299
|
+
readonly parseBatch: (a: number, b: number, c: number) => void;
|
|
300
|
+
readonly soup_find: (a: number, b: number, c: number, d: number) => void;
|
|
301
|
+
readonly soup_findAll: (a: number, b: number, c: number, d: number) => void;
|
|
302
|
+
readonly soup_length: (a: number) => number;
|
|
303
|
+
readonly soup_new: (a: number, b: number, c: number) => number;
|
|
304
|
+
readonly soup_root: (a: number) => number;
|
|
305
|
+
readonly soup_text: (a: number, b: number) => void;
|
|
306
|
+
readonly soup_title: (a: number, b: number) => void;
|
|
307
|
+
readonly soup_toHtml: (a: number, b: number) => void;
|
|
308
|
+
readonly soupconfig_includeComments: (a: number) => number;
|
|
309
|
+
readonly soupconfig_maxDepth: (a: number) => number;
|
|
310
|
+
readonly soupconfig_new: () => number;
|
|
311
|
+
readonly soupconfig_preserveWhitespace: (a: number) => number;
|
|
312
|
+
readonly soupconfig_set_includeComments: (a: number, b: number) => void;
|
|
313
|
+
readonly soupconfig_set_maxDepth: (a: number, b: number) => void;
|
|
314
|
+
readonly soupconfig_set_preserveWhitespace: (a: number, b: number) => void;
|
|
315
|
+
readonly soupconfig_set_strictMode: (a: number, b: number) => void;
|
|
316
|
+
readonly soupconfig_strictMode: (a: number) => number;
|
|
296
317
|
readonly tag_attr: (a: number, b: number, c: number, d: number) => void;
|
|
297
318
|
readonly tag_attrs: (a: number) => number;
|
|
298
319
|
readonly tag_children: (a: number, b: number) => void;
|
|
@@ -310,32 +331,11 @@ export interface InitOutput {
|
|
|
310
331
|
readonly tag_outerHTML: (a: number, b: number) => void;
|
|
311
332
|
readonly tag_parent: (a: number) => number;
|
|
312
333
|
readonly tag_prevSibling: (a: number) => number;
|
|
313
|
-
readonly tag_select: (a: number, b: number, c: number, d: number) => void;
|
|
314
334
|
readonly tag_text: (a: number, b: number) => void;
|
|
315
|
-
readonly __wbg_soupconfig_free: (a: number, b: number) => void;
|
|
316
|
-
readonly soupconfig_includeComments: (a: number) => number;
|
|
317
|
-
readonly soupconfig_maxDepth: (a: number) => number;
|
|
318
|
-
readonly soupconfig_new: () => number;
|
|
319
|
-
readonly soupconfig_preserveWhitespace: (a: number) => number;
|
|
320
|
-
readonly soupconfig_set_includeComments: (a: number, b: number) => void;
|
|
321
|
-
readonly soupconfig_set_maxDepth: (a: number, b: number) => void;
|
|
322
|
-
readonly soupconfig_set_preserveWhitespace: (a: number, b: number) => void;
|
|
323
|
-
readonly soupconfig_set_strictMode: (a: number, b: number) => void;
|
|
324
|
-
readonly soupconfig_strictMode: (a: number) => number;
|
|
325
|
-
readonly __wbg_soup_free: (a: number, b: number) => void;
|
|
326
|
-
readonly soup_find: (a: number, b: number, c: number, d: number) => void;
|
|
327
|
-
readonly soup_findAll: (a: number, b: number, c: number, d: number) => void;
|
|
328
|
-
readonly soup_length: (a: number) => number;
|
|
329
|
-
readonly soup_new: (a: number, b: number, c: number) => number;
|
|
330
|
-
readonly soup_root: (a: number) => number;
|
|
331
|
-
readonly soup_select: (a: number, b: number, c: number, d: number) => void;
|
|
332
|
-
readonly soup_text: (a: number, b: number) => void;
|
|
333
|
-
readonly soup_title: (a: number, b: number) => void;
|
|
334
|
-
readonly soup_toHtml: (a: number, b: number) => void;
|
|
335
|
-
readonly hasSimdSupport: () => number;
|
|
336
|
-
readonly parseBatch: (a: number, b: number, c: number) => void;
|
|
337
335
|
readonly version: (a: number) => void;
|
|
338
336
|
readonly init: () => void;
|
|
337
|
+
readonly soup_select: (a: number, b: number, c: number, d: number) => void;
|
|
338
|
+
readonly tag_select: (a: number, b: number, c: number, d: number) => void;
|
|
339
339
|
readonly __wbindgen_export: (a: number, b: number) => number;
|
|
340
340
|
readonly __wbindgen_export2: (a: number, b: number, c: number, d: number) => number;
|
|
341
341
|
readonly __wbindgen_export3: (a: number, b: number, c: number) => void;
|
package/scrape_wasm.js
CHANGED
|
@@ -142,7 +142,7 @@ export class Soup {
|
|
|
142
142
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
143
143
|
const ptr0 = passStringToWasm0(selector, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
144
144
|
const len0 = WASM_VECTOR_LEN;
|
|
145
|
-
wasm.
|
|
145
|
+
wasm.soup_findAll(retptr, this.__wbg_ptr, ptr0, len0);
|
|
146
146
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
147
147
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
148
148
|
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
@@ -659,7 +659,7 @@ export class Tag {
|
|
|
659
659
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
660
660
|
const ptr0 = passStringToWasm0(selector, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
661
661
|
const len0 = WASM_VECTOR_LEN;
|
|
662
|
-
wasm.
|
|
662
|
+
wasm.tag_findAll(retptr, this.__wbg_ptr, ptr0, len0);
|
|
663
663
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
664
664
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
665
665
|
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
package/scrape_wasm_bg.wasm
CHANGED
|
Binary file
|