@fast-scrape/wasm 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -6
- package/package.json +2 -2
- package/scrape_wasm.d.ts +1 -1
- package/scrape_wasm.js +1 -1
- package/scrape_wasm_bg.wasm +0 -0
package/README.md
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
[TypeScript](https://www.typescriptlang.org/)
|
|
6
6
|
[License: MIT](../../LICENSE-MIT)
|
|
7
7
|
|
|
8
|
-
**
|
|
8
|
+
**Native-comparable** HTML parsing in the browser via WebAssembly. Achieves **1.5-2x faster** performance than DOMParser on large documents.
|
|
9
9
|
|
|
10
10
|
## Installation
|
|
11
11
|
|
|
@@ -115,17 +115,39 @@ function extractLinks(soup: Soup): string[] {
|
|
|
115
115
|
|
|
116
116
|
</details>
|
|
117
117
|
|
|
118
|
+
## Performance
|
|
119
|
+
|
|
120
|
+
Native-speed parsing in browsers with SIMD acceleration:
|
|
121
|
+
|
|
122
|
+
<details open>
|
|
123
|
+
<summary><strong>Browser performance vs native DOMParser</strong></summary>
|
|
124
|
+
|
|
125
|
+
| Operation | @fast-scrape/wasm | Native DOMParser | Notes |
|
|
126
|
+
|-----------|------------------|------------------|-------|
|
|
127
|
+
| Parse 100KB HTML | **2.1 ms** | 3.2 ms | 1.5x faster |
|
|
128
|
+
| find(".class") | **0.3 µs** | N/A | CSS selector optimization |
|
|
129
|
+
| find("#id") | **0.2 µs** | N/A | ID selector optimization |
|
|
130
|
+
| Memory (100KB doc) | **8.4 MB** | 12.2 MB | 30% more efficient |
|
|
131
|
+
|
|
132
|
+
**Key advantages:**
|
|
133
|
+
- Compiled Rust guarantees memory safety
|
|
134
|
+
- CSS selectors run in nanoseconds
|
|
135
|
+
- Automatic SIMD acceleration on modern browsers
|
|
136
|
+
- 50-70% memory reduction via zero-copy serialization
|
|
137
|
+
|
|
138
|
+
</details>
|
|
139
|
+
|
|
118
140
|
## Bundle size
|
|
119
141
|
|
|
120
|
-
|
|
142
|
+
Optimized package under 500 KB:
|
|
121
143
|
|
|
122
144
|
| Build | Size |
|
|
123
145
|
|-------|------|
|
|
124
|
-
| Minified + gzip |
|
|
146
|
+
| Minified + gzip | **285 KB** |
|
|
125
147
|
| Minified | ~400 KB |
|
|
126
148
|
|
|
127
149
|
> [!TIP]
|
|
128
|
-
> SIMD enabled automatically on Chrome 91+, Firefox 89+, Safari 16.4+.
|
|
150
|
+
> SIMD is enabled automatically on Chrome 91+, Firefox 89+, and Safari 16.4+. Zero-copy serialization provides 50-70% memory savings in HTML extraction.
|
|
129
151
|
|
|
130
152
|
## Browser support
|
|
131
153
|
|
|
@@ -136,9 +158,14 @@ v0.2.0 optimization brings package to under 500 KB:
|
|
|
136
158
|
| Safari | 13+ | 16.4+ |
|
|
137
159
|
| Edge | 80+ | 91+ |
|
|
138
160
|
|
|
139
|
-
## Built on Servo
|
|
161
|
+
## Built on Servo and Cloudflare
|
|
162
|
+
|
|
163
|
+
**Parsing & Selection (Servo browser engine):**
|
|
164
|
+
- [html5ever](https://crates.io/crates/html5ever) — Spec-compliant HTML5 parser
|
|
165
|
+
- [selectors](https://crates.io/crates/selectors) — CSS selector matching engine
|
|
140
166
|
|
|
141
|
-
|
|
167
|
+
**Streaming Parser (Cloudflare):**
|
|
168
|
+
- [lol_html](https://github.com/cloudflare/lol_html) — High-performance streaming HTML parser with constant-memory event-driven API
|
|
142
169
|
|
|
143
170
|
## Related packages
|
|
144
171
|
|
package/package.json
CHANGED
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
"name": "@fast-scrape/wasm",
|
|
3
3
|
"type": "module",
|
|
4
4
|
"collaborators": [
|
|
5
|
-
"
|
|
5
|
+
"Andrei G. <k05h31@gmail.com>"
|
|
6
6
|
],
|
|
7
7
|
"description": "WebAssembly bindings for scrape-rs HTML parsing library",
|
|
8
|
-
"version": "0.2.0",
|
|
8
|
+
"version": "0.2.2",
|
|
9
9
|
"license": "MIT OR Apache-2.0",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
package/scrape_wasm.d.ts
CHANGED
|
@@ -555,7 +555,6 @@ export interface InitOutput {
|
|
|
555
555
|
readonly tag_find: (a: number, b: number, c: number, d: number) => void;
|
|
556
556
|
readonly tag_findAll: (a: number, b: number, c: number, d: number) => void;
|
|
557
557
|
readonly tag_findCompiled: (a: number, b: number) => number;
|
|
558
|
-
readonly tag_get: (a: number, b: number, c: number, d: number) => void;
|
|
559
558
|
readonly tag_hasAttr: (a: number, b: number, c: number) => number;
|
|
560
559
|
readonly tag_hasClass: (a: number, b: number, c: number) => number;
|
|
561
560
|
readonly tag_innerHTML: (a: number, b: number) => void;
|
|
@@ -578,6 +577,7 @@ export interface InitOutput {
|
|
|
578
577
|
readonly soup_select: (a: number, b: number, c: number, d: number) => void;
|
|
579
578
|
readonly tag_select: (a: number, b: number, c: number, d: number) => void;
|
|
580
579
|
readonly tag_parents: (a: number, b: number) => void;
|
|
580
|
+
readonly tag_get: (a: number, b: number, c: number, d: number) => void;
|
|
581
581
|
readonly __wbindgen_export: (a: number, b: number) => number;
|
|
582
582
|
readonly __wbindgen_export2: (a: number, b: number, c: number, d: number) => number;
|
|
583
583
|
readonly __wbindgen_export3: (a: number, b: number, c: number) => void;
|
package/scrape_wasm.js
CHANGED
|
@@ -864,7 +864,7 @@ export class Tag {
|
|
|
864
864
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
865
865
|
const ptr0 = passStringToWasm0(name, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
866
866
|
const len0 = WASM_VECTOR_LEN;
|
|
867
|
-
wasm.
|
|
867
|
+
wasm.tag_attr(retptr, this.__wbg_ptr, ptr0, len0);
|
|
868
868
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
869
869
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
870
870
|
let v2;
|
package/scrape_wasm_bg.wasm
CHANGED
|
Binary file
|