@fast-scrape/wasm 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,153 @@
1
+ # @fast-scrape/wasm
2
+
3
+ [![npm](https://img.shields.io/npm/v/@fast-scrape/wasm)](https://www.npmjs.com/package/@fast-scrape/wasm)
4
+ [![Bundle Size](https://img.shields.io/bundlephobia/minzip/@fast-scrape/wasm)](https://bundlephobia.com/package/@fast-scrape/wasm)
5
+ [![TypeScript](https://img.shields.io/badge/TypeScript-Ready-blue)](https://www.typescriptlang.org/)
6
+ [![codecov](https://codecov.io/gh/bug-ops/scrape-rs/graph/badge.svg?token=6MQTONGT95&flag=wasm)](https://codecov.io/gh/bug-ops/scrape-rs)
7
+ [![License](https://img.shields.io/npm/l/@fast-scrape/wasm)](../../LICENSE-MIT)
8
+
9
+ WebAssembly bindings for scrape-rs, a high-performance HTML parsing library. Run native-speed parsing in the browser.
10
+
11
+ ## Installation
12
+
13
+ ```bash
14
+ # npm
15
+ npm install @fast-scrape/wasm
16
+
17
+ # yarn
18
+ yarn add @fast-scrape/wasm
19
+
20
+ # pnpm
21
+ pnpm add @fast-scrape/wasm
22
+
23
+ # bun
24
+ bun add @fast-scrape/wasm
25
+ ```
26
+
27
+ ## Quick start
28
+
29
+ ```typescript
30
+ import init, { Soup } from '@fast-scrape/wasm';
31
+
32
+ // Initialize WASM module (required once)
33
+ await init();
34
+
35
+ const html = "<html><body><div class='content'>Hello, World!</div></body></html>";
36
+ const soup = new Soup(html);
37
+
38
+ const div = soup.find("div");
39
+ console.log(div?.text);
40
+ // Hello, World!
41
+ ```
42
+
43
+ > [!IMPORTANT]
44
+ > Call `init()` once before using any other functions. It loads and compiles the WASM module.
45
+
46
+ ## Usage
47
+
48
+ ### Find elements
49
+
50
+ ```typescript
51
+ import init, { Soup } from '@fast-scrape/wasm';
52
+
53
+ await init();
54
+
55
+ const soup = new Soup(html);
56
+
57
+ // Find first element by tag
58
+ const div = soup.find("div");
59
+
60
+ // Find all elements
61
+ const divs = soup.findAll("div");
62
+
63
+ // CSS selectors
64
+ for (const el of soup.select("div.content > p")) {
65
+ console.log(el.text);
66
+ }
67
+ ```
68
+
69
+ ### With bundlers
70
+
71
+ **Vite:**
72
+
73
+ ```typescript
74
+ import init, { Soup } from '@fast-scrape/wasm';
75
+
76
+ // Vite handles WASM automatically
77
+ await init();
78
+ ```
79
+
80
+ **Webpack 5:**
81
+
82
+ ```javascript
83
+ // webpack.config.js
84
+ module.exports = {
85
+ experiments: {
86
+ asyncWebAssembly: true,
87
+ },
88
+ };
89
+ ```
90
+
91
+ ### CDN usage
92
+
93
+ ```html
94
+ <script type="module">
95
+ import init, { Soup } from 'https://esm.sh/@fast-scrape/wasm';
96
+
97
+ await init();
98
+
99
+ const soup = new Soup('<div>Hello</div>');
100
+ console.log(soup.find('div').text);
101
+ </script>
102
+ ```
103
+
104
+ ## Bundle size
105
+
106
+ | Build | Size |
107
+ |-------|------|
108
+ | Minified + gzip | ~150 KB |
109
+ | Minified | ~400 KB |
110
+
111
+ > [!TIP]
112
+ > The WASM module includes SIMD optimizations. Modern browsers (Chrome 91+, Firefox 89+, Safari 16.4+) run SIMD automatically.
113
+
114
+ ## TypeScript
115
+
116
+ Full TypeScript support with exported types:
117
+
118
+ ```typescript
119
+ import init, { Soup, Tag } from '@fast-scrape/wasm';
120
+
121
+ await init();
122
+
123
+ function extractLinks(soup: Soup): string[] {
124
+ return soup.select("a[href]").map(a => a.attr("href") ?? "");
125
+ }
126
+ ```
127
+
128
+ ## Browser support
129
+
130
+ | Browser | Version | SIMD |
131
+ |---------|---------|------|
132
+ | Chrome | 80+ | 91+ |
133
+ | Firefox | 75+ | 89+ |
134
+ | Safari | 13+ | 16.4+ |
135
+ | Edge | 80+ | 91+ |
136
+
137
+ ## Limitations
138
+
139
+ - No parallel processing (WASM threads have limited browser support)
140
+ - Must call `init()` before using the API
141
+ - Slightly higher memory usage than native bindings
142
+
143
+ ## Related packages
144
+
145
+ Part of the [scrape-rs](https://github.com/bug-ops/scrape-rs) project:
146
+
147
+ - `scrape-core` — Rust core library
148
+ - `scrape-rs` (PyPI) — Python bindings
149
+ - `scrape-rs` (npm) — Node.js bindings
150
+
151
+ ## License
152
+
153
+ Licensed under either of [Apache License, Version 2.0](../../LICENSE-APACHE) or [MIT License](../../LICENSE-MIT) at your option.
package/package.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "name": "@fast-scrape/wasm",
3
+ "type": "module",
4
+ "collaborators": [
5
+ "scrape-rs contributors"
6
+ ],
7
+ "description": "WebAssembly bindings for scrape-rs HTML parsing library",
8
+ "version": "0.1.0",
9
+ "license": "MIT OR Apache-2.0",
10
+ "repository": {
11
+ "type": "git",
12
+ "url": "https://github.com/bug-ops/scrape-rs"
13
+ },
14
+ "files": [
15
+ "scrape_wasm_bg.wasm",
16
+ "scrape_wasm.js",
17
+ "scrape_wasm.d.ts"
18
+ ],
19
+ "main": "scrape_wasm.js",
20
+ "types": "scrape_wasm.d.ts",
21
+ "sideEffects": [
22
+ "./snippets/*"
23
+ ]
24
+ }
package/scrape_wasm.d.ts ADDED
@@ -0,0 +1,367 @@
1
+ /* tslint:disable */
2
+ /* eslint-disable */
3
+
4
+ /**
5
+ * A parsed HTML document.
6
+ *
7
+ * `Soup` is the main entry point for parsing and querying HTML documents.
8
+ * It provides CSS-selector queries (`find`, `findAll`, `select`) plus document-level accessors (`root`, `text`, `title`, `length`).
9
+ *
10
+ * @example
11
+ * ```javascript
12
+ * import init, { Soup } from '@fast-scrape/wasm';
13
+ *
14
+ * await init();
15
+ *
16
+ * const soup = new Soup("<div class='item'>Hello</div>");
17
+ * const div = soup.find("div.item");
18
+ * console.log(div?.text); // "Hello" (find() returns undefined when nothing matches)
19
+ * ```
20
+ */
21
+ export class Soup {
22
+ free(): void; // NOTE(review): presumably releases the WASM-side memory held by this document (wasm-bindgen convention) — confirm
23
+ [Symbol.dispose](): void; // NOTE(review): presumably the explicit-resource-management counterpart of free() — confirm
24
+ /**
25
+ * Finds the first element matching a CSS selector.
26
+ *
27
+ * @param selector - CSS selector string
28
+ * @returns The first matching Tag, or undefined if not found
29
+ * @throws Error if the selector syntax is invalid
30
+ */
31
+ find(selector: string): Tag | undefined;
32
+ /**
33
+ * Finds all elements matching a CSS selector.
34
+ *
35
+ * @param selector - CSS selector string
36
+ * @returns Array of matching Tag instances
37
+ * @throws Error if the selector syntax is invalid
38
+ */
39
+ findAll(selector: string): Tag[];
40
+ /**
41
+ * Parses an HTML string into a Soup document.
42
+ *
43
+ * @param html - The HTML string to parse
44
+ * @param config - Optional parsing options (a `SoupConfig`); may be omitted or null
45
+ */
46
+ constructor(html: string, config?: SoupConfig | null);
47
+ /**
48
+ * Finds all elements matching a CSS selector (alias for findAll).
49
+ *
50
+ * @param selector - CSS selector string
51
+ * @returns Array of matching Tag instances
52
+ */
53
+ select(selector: string): Tag[];
54
+ /**
55
+ * Get the HTML representation of the document.
56
+ *
57
+ * @returns The document as an HTML string
58
+ */
59
+ toHtml(): string;
60
+ /**
61
+ * Get the number of nodes in the document.
62
+ */
63
+ readonly length: number;
64
+ /**
65
+ * Get the root element of the document.
66
+ *
67
+ * @returns The root Tag (usually <html>), or undefined for empty documents
68
+ */
69
+ readonly root: Tag | undefined;
70
+ /**
71
+ * Get the text content of the entire document.
72
+ *
73
+ * @returns All text content with HTML tags stripped
74
+ */
75
+ readonly text: string;
76
+ /**
77
+ * Get the document title.
78
+ *
79
+ * @returns The title text, or undefined if no <title> element exists
80
+ */
81
+ readonly title: string | undefined;
82
+ }
83
+
84
+ /**
85
+ * Configuration options for HTML parsing.
86
+ *
87
+ * All options have defaults (listed on the constructor). Assign the properties below to customize behavior.
88
+ *
89
+ * @example
90
+ * ```javascript
91
+ * const config = new SoupConfig();
92
+ * config.maxDepth = 256;
93
+ * config.strictMode = true;
94
+ * const soup = new Soup("<div>Hello</div>", config);
95
+ * ```
96
+ */
97
+ export class SoupConfig {
98
+ free(): void; // NOTE(review): presumably releases the WASM-side memory held by this config (wasm-bindgen convention) — confirm
99
+ [Symbol.dispose](): void; // NOTE(review): presumably the explicit-resource-management counterpart of free() — confirm
100
+ /**
101
+ * Creates a new configuration with default values.
102
+ *
103
+ * Default values:
104
+ * - maxDepth: 512
105
+ * - strictMode: false
106
+ * - preserveWhitespace: false
107
+ * - includeComments: false
108
+ */
109
+ constructor();
110
+ /**
111
+ * Include comment nodes in DOM. Default: false.
112
+ */
113
+ includeComments: boolean;
114
+ /**
115
+ * Maximum nesting depth for DOM tree. Default: 512.
116
+ */
117
+ maxDepth: number;
118
+ /**
119
+ * Preserve whitespace-only text nodes. Default: false.
120
+ */
121
+ preserveWhitespace: boolean;
122
+ /**
123
+ * Enable strict parsing mode (fail on malformed HTML). Default: false.
124
+ */
125
+ strictMode: boolean;
126
+ }
127
+
128
+ /**
129
+ * An HTML element in the DOM tree.
130
+ *
131
+ * Provides access to element content, attributes, and tree navigation.
132
+ *
133
+ * @example
134
+ * ```javascript
135
+ * const soup = new Soup('<div class="test">Hello</div>');
136
+ * const div = soup.find("div");
137
+ * console.log(div.name); // "div"
138
+ * console.log(div.text); // "Hello"
139
+ * console.log(div.attr("class")); // "test"
140
+ * ```
141
+ */
142
+ export class Tag {
143
+ private constructor(); // not constructible by callers; instances come from the query/navigation members of Soup and Tag
144
+ free(): void; // NOTE(review): presumably releases the WASM-side handle for this element (wasm-bindgen convention) — confirm
145
+ [Symbol.dispose](): void; // NOTE(review): presumably the explicit-resource-management counterpart of free() — confirm
146
+ /**
147
+ * Get an attribute value by name (alias for get).
148
+ *
149
+ * @param name - The attribute name
150
+ * @returns The attribute value, or undefined if not present
151
+ */
152
+ attr(name: string): string | undefined;
153
+ /**
154
+ * Find the first descendant matching a CSS selector.
155
+ *
156
+ * @param selector - CSS selector string
157
+ * @returns The first matching Tag, or undefined if not found
158
+ * @throws Error if the selector syntax is invalid
159
+ */
160
+ find(selector: string): Tag | undefined;
161
+ /**
162
+ * Find all descendants matching a CSS selector.
163
+ *
164
+ * @param selector - CSS selector string
165
+ * @returns Array of matching Tag instances
166
+ * @throws Error if the selector syntax is invalid
167
+ */
168
+ findAll(selector: string): Tag[];
169
+ /**
170
+ * Get an attribute value by name.
171
+ *
172
+ * @param name - The attribute name
173
+ * @returns The attribute value, or undefined if not present
174
+ */
175
+ get(name: string): string | undefined;
176
+ /**
177
+ * Check if the element has an attribute.
178
+ *
179
+ * @param name - The attribute name
180
+ * @returns True if the attribute exists
181
+ */
182
+ hasAttr(name: string): boolean;
183
+ /**
184
+ * Check if the element has a specific class.
185
+ *
186
+ * @param class_name - The class name to check
187
+ * @returns True if the element has the class
188
+ */
189
+ hasClass(class_name: string): boolean;
190
+ /**
191
+ * Find all descendants matching a CSS selector (alias for findAll).
192
+ *
193
+ * @param selector - CSS selector string
194
+ * @returns Array of matching Tag instances
195
+ */
196
+ select(selector: string): Tag[];
197
+ /**
198
+ * Get all attributes as an object.
199
+ */
200
+ readonly attrs: object;
201
+ /**
202
+ * Get all direct child elements.
203
+ */
204
+ readonly children: Tag[];
205
+ /**
206
+ * Get all classes as an array.
207
+ */
208
+ readonly classes: string[];
209
+ /**
210
+ * Get all descendant elements.
211
+ */
212
+ readonly descendants: Tag[];
213
+ /**
214
+ * Get the inner HTML content (excluding this element's tags).
215
+ */
216
+ readonly innerHTML: string;
217
+ /**
218
+ * Get the number of direct child elements.
219
+ */
220
+ readonly length: number;
221
+ /**
222
+ * Get the tag name (e.g., "div", "span"). Typed as possibly undefined.
223
+ */
224
+ readonly name: string | undefined;
225
+ /**
226
+ * Get the next sibling element. Typed as possibly undefined.
227
+ */
228
+ readonly nextSibling: Tag | undefined;
229
+ /**
230
+ * Get the outer HTML (including this element's tags).
231
+ */
232
+ readonly outerHTML: string;
233
+ /**
234
+ * Get the parent element. Typed as possibly undefined.
235
+ */
236
+ readonly parent: Tag | undefined;
237
+ /**
238
+ * Get the previous sibling element. Typed as possibly undefined.
239
+ */
240
+ readonly prevSibling: Tag | undefined;
241
+ /**
242
+ * Get the text content of this element and all descendants.
243
+ */
244
+ readonly text: string;
245
+ }
246
+
247
+ /**
248
+ * Check if WASM SIMD is supported in the current environment.
249
+ *
250
+ * Returns true if the module was compiled with SIMD support and
251
+ * is running on a platform that supports SIMD128 instructions.
252
+ *
253
+ * SIMD support requires both of:
254
+ * - Chrome 91+ / Firefox 89+ / Safari 16.4+
255
+ * - Module built with RUSTFLAGS='-C target-feature=+simd128'
256
+ */
257
+ export function hasSimdSupport(): boolean;
258
+
259
+ /**
260
+ * Initialize the WASM module (named export; separate from the default-export async loader declared at the end of this file).
261
+ *
262
+ * Sets up panic hook for better error messages in browser console.
263
+ * This is called automatically when the module is loaded, so callers normally do not need to invoke it themselves.
264
+ */
265
+ export function init(): void;
266
+
267
+ /**
268
+ * Parse multiple HTML documents.
269
+ *
270
+ * Note: WASM does not support threads, so this processes documents sequentially.
271
+ * For parallel processing in browsers, use Web Workers with separate WASM instances.
272
+ *
273
+ * @param documents - Array of HTML strings to parse
274
+ * @returns Array of Soup documents
275
+ *
276
+ * @example
277
+ * ```javascript
278
+ * const soups = parseBatch(['<div>A</div>', '<div>B</div>']);
279
+ * console.log(soups.length); // 2 (array length, i.e. number of parsed documents)
280
+ * ```
281
+ */
282
+ export function parseBatch(documents: string[]): Soup[];
283
+
284
+ /**
285
+ * Get the library version as a string.
286
+ *
287
+ * @returns Version string (e.g., "0.1.0")
288
+ */
289
+ export function version(): string;
290
+
291
+ export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module; // module sources accepted by the default-export async initializer
292
+
293
+ export interface InitOutput { // value returned by initSync and resolved by the default-export initializer; NOTE(review): members look like raw wasm-bindgen exports operating on numeric pointers, presumably not meant to be called directly — confirm
294
+ readonly memory: WebAssembly.Memory;
295
+ readonly __wbg_tag_free: (a: number, b: number) => void;
296
+ readonly tag_attr: (a: number, b: number, c: number, d: number) => void;
297
+ readonly tag_attrs: (a: number) => number;
298
+ readonly tag_children: (a: number, b: number) => void;
299
+ readonly tag_classes: (a: number, b: number) => void;
300
+ readonly tag_descendants: (a: number, b: number) => void;
301
+ readonly tag_find: (a: number, b: number, c: number, d: number) => void;
302
+ readonly tag_findAll: (a: number, b: number, c: number, d: number) => void;
303
+ readonly tag_get: (a: number, b: number, c: number, d: number) => void;
304
+ readonly tag_hasAttr: (a: number, b: number, c: number) => number;
305
+ readonly tag_hasClass: (a: number, b: number, c: number) => number;
306
+ readonly tag_innerHTML: (a: number, b: number) => void;
307
+ readonly tag_length: (a: number) => number;
308
+ readonly tag_name: (a: number, b: number) => void;
309
+ readonly tag_nextSibling: (a: number) => number;
310
+ readonly tag_outerHTML: (a: number, b: number) => void;
311
+ readonly tag_parent: (a: number) => number;
312
+ readonly tag_prevSibling: (a: number) => number;
313
+ readonly tag_select: (a: number, b: number, c: number, d: number) => void;
314
+ readonly tag_text: (a: number, b: number) => void;
315
+ readonly __wbg_soupconfig_free: (a: number, b: number) => void;
316
+ readonly soupconfig_includeComments: (a: number) => number;
317
+ readonly soupconfig_maxDepth: (a: number) => number;
318
+ readonly soupconfig_new: () => number;
319
+ readonly soupconfig_preserveWhitespace: (a: number) => number;
320
+ readonly soupconfig_set_includeComments: (a: number, b: number) => void;
321
+ readonly soupconfig_set_maxDepth: (a: number, b: number) => void;
322
+ readonly soupconfig_set_preserveWhitespace: (a: number, b: number) => void;
323
+ readonly soupconfig_set_strictMode: (a: number, b: number) => void;
324
+ readonly soupconfig_strictMode: (a: number) => number;
325
+ readonly __wbg_soup_free: (a: number, b: number) => void;
326
+ readonly soup_find: (a: number, b: number, c: number, d: number) => void;
327
+ readonly soup_findAll: (a: number, b: number, c: number, d: number) => void;
328
+ readonly soup_length: (a: number) => number;
329
+ readonly soup_new: (a: number, b: number, c: number) => number;
330
+ readonly soup_root: (a: number) => number;
331
+ readonly soup_select: (a: number, b: number, c: number, d: number) => void;
332
+ readonly soup_text: (a: number, b: number) => void;
333
+ readonly soup_title: (a: number, b: number) => void;
334
+ readonly soup_toHtml: (a: number, b: number) => void;
335
+ readonly hasSimdSupport: () => number;
336
+ readonly parseBatch: (a: number, b: number, c: number) => void;
337
+ readonly version: (a: number) => void;
338
+ readonly init: () => void;
339
+ readonly __wbindgen_export: (a: number, b: number) => number;
340
+ readonly __wbindgen_export2: (a: number, b: number, c: number, d: number) => number;
341
+ readonly __wbindgen_export3: (a: number, b: number, c: number) => void;
342
+ readonly __wbindgen_export4: (a: number) => void;
343
+ readonly __wbindgen_add_to_stack_pointer: (a: number) => number;
344
+ readonly __wbindgen_start: () => void;
345
+ }
346
+
347
+ export type SyncInitInput = BufferSource | WebAssembly.Module; // module sources accepted by initSync (raw bytes or a precompiled module)
348
+
349
+ /**
350
+ * Synchronously instantiates the given `module`, which can either be bytes or
351
+ * a precompiled `WebAssembly.Module`.
352
+ *
353
+ * @param {{ module: SyncInitInput }} module - Prefer the `{ module }` object form. Passing `SyncInitInput` directly is deprecated.
354
+ *
355
+ * @returns {InitOutput} The exports of the instantiated module.
356
+ */
357
+ export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput;
358
+
359
+ /**
360
+ * If `module_or_path` is {RequestInfo} or {URL}, makes a request and
361
+ * for everything else, calls `WebAssembly.instantiate` directly.
362
+ * This is the module's default export; the argument is optional per the signature.
363
+ * @param {{ module_or_path: InitInput | Promise<InitInput> }} module_or_path - Prefer the `{ module_or_path }` object form. Passing `InitInput` directly is deprecated.
364
+ *
365
+ * @returns {Promise<InitOutput>} Resolves to the exports of the instantiated module.
366
+ */
367
+ export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>;