@qretaio/html2json 0.5.4 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -44,7 +44,7 @@ just install
44
44
  ### JavaScript / TypeScript
45
45
 
46
46
  ```javascript
47
- import { extract } from '@qretaio/html2json';
47
+ import { extract } from "@qretaio/html2json";
48
48
 
49
49
  const html = `
50
50
  <article class="post">
@@ -57,16 +57,18 @@ const html = `
57
57
  </article>
58
58
  `;
59
59
 
60
- const spec = JSON.stringify({
60
+ const spec = {
61
61
  title: "h2",
62
62
  author: ".author",
63
- tags: [{
64
- "$": ".tags span",
65
- "name": "$"
66
- }]
67
- });
68
-
69
- const result = extract(html, spec);
63
+ tags: [
64
+ {
65
+ $: ".tags span",
66
+ name: "$",
67
+ },
68
+ ],
69
+ };
70
+
71
+ const result = await extract(html, JSON.stringify(spec));
70
72
  console.log(result);
71
73
  // {
72
74
  // "title": "My Article",
@@ -129,6 +131,7 @@ The spec is a JSON object where each key defines an output field and each value
129
131
  ```
130
132
 
131
133
  Available pipes:
134
+
132
135
  - `trim` - Trim whitespace
133
136
  - `lower` - Convert to lowercase
134
137
  - `upper` - Convert to uppercase
@@ -143,11 +146,13 @@ Available pipes:
143
146
 
144
147
  ```json
145
148
  {
146
- "items": [{
147
- "$": ".item",
148
- "title": "h2",
149
- "description": "p"
150
- }]
149
+ "items": [
150
+ {
151
+ "$": ".item",
152
+ "title": "h2",
153
+ "description": "p"
154
+ }
155
+ ]
151
156
  }
152
157
  ```
153
158
 
package/html2json.d.ts CHANGED
@@ -1,11 +1,45 @@
1
1
  /* tslint:disable */
2
2
  /* eslint-disable */
3
3
 
4
+ /**
5
+ * Extract JSON from HTML using a spec
6
+ *
7
+ * # Arguments
8
+ *
9
+ * * `html` - The HTML source to parse
10
+ * * `spec_json` - The extraction specification as a JSON string
11
+ *
12
+ * # Returns
13
+ *
14
+ * A JSON string with the extracted data
15
+ *
16
+ * # Errors
17
+ *
18
+ * Throws a JsValue error if the HTML parsing or extraction fails
19
+ *
20
+ * # Example
21
+ *
22
+ * ```javascript
23
+ * import { extract } from 'html2json';
24
+ *
25
+ * const html = '<div class="item"><span>Price: $25.00</span></div>';
26
+ * const spec = '{"price": ".item span | regex:\\\\$(\\\\d+\\\\.\\\\d+)"}';
27
+ * const result = extract(html, spec);
28
+ * console.log(result); // {"price":"25.00"}
29
+ * ```
30
+ */
31
+ export function extract(html: string, spec_json: string): string;
32
+
4
33
  export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
5
34
 
6
35
  export interface InitOutput {
7
36
  readonly memory: WebAssembly.Memory;
37
+ readonly extract: (a: number, b: number, c: number, d: number) => [number, number, number, number];
8
38
  readonly __wbindgen_externrefs: WebAssembly.Table;
39
+ readonly __wbindgen_malloc: (a: number, b: number) => number;
40
+ readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
41
+ readonly __externref_table_dealloc: (a: number) => void;
42
+ readonly __wbindgen_free: (a: number, b: number, c: number) => void;
9
43
  readonly __wbindgen_start: () => void;
10
44
  }
11
45
 
package/html2json.js CHANGED
@@ -1,8 +1,66 @@
1
1
  /* @ts-self-types="./html2json.d.ts" */
2
2
 
3
+ /**
4
+ * Extract JSON from HTML using a spec
5
+ *
6
+ * # Arguments
7
+ *
8
+ * * `html` - The HTML source to parse
9
+ * * `spec_json` - The extraction specification as a JSON string
10
+ *
11
+ * # Returns
12
+ *
13
+ * A JSON string with the extracted data
14
+ *
15
+ * # Errors
16
+ *
17
+ * Throws a JsValue error if the HTML parsing or extraction fails
18
+ *
19
+ * # Example
20
+ *
21
+ * ```javascript
22
+ * import { extract } from 'html2json';
23
+ *
24
+ * const html = '<div class="item"><span>Price: $25.00</span></div>';
25
+ * const spec = '{"price": ".item span | regex:\\\\$(\\\\d+\\\\.\\\\d+)"}';
26
+ * const result = extract(html, spec);
27
+ * console.log(result); // {"price":"25.00"}
28
+ * ```
29
+ * @param {string} html
30
+ * @param {string} spec_json
31
+ * @returns {string}
32
+ */
33
+ export function extract(html, spec_json) {
34
+ let deferred4_0;
35
+ let deferred4_1;
36
+ try {
37
+ const ptr0 = passStringToWasm0(html, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
38
+ const len0 = WASM_VECTOR_LEN;
39
+ const ptr1 = passStringToWasm0(spec_json, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
40
+ const len1 = WASM_VECTOR_LEN;
41
+ const ret = wasm.extract(ptr0, len0, ptr1, len1);
42
+ var ptr3 = ret[0];
43
+ var len3 = ret[1];
44
+ if (ret[3]) {
45
+ ptr3 = 0; len3 = 0;
46
+ throw takeFromExternrefTable0(ret[2]);
47
+ }
48
+ deferred4_0 = ptr3;
49
+ deferred4_1 = len3;
50
+ return getStringFromWasm0(ptr3, len3);
51
+ } finally {
52
+ wasm.__wbindgen_free(deferred4_0, deferred4_1, 1);
53
+ }
54
+ }
55
+
3
56
  function __wbg_get_imports() {
4
57
  const import0 = {
5
58
  __proto__: null,
59
+ __wbindgen_cast_0000000000000001: function(arg0, arg1) {
60
+ // Cast intrinsic for `Ref(String) -> Externref`.
61
+ const ret = getStringFromWasm0(arg0, arg1);
62
+ return ret;
63
+ },
6
64
  __wbindgen_init_externref_table: function() {
7
65
  const table = wasm.__wbindgen_externrefs;
8
66
  const offset = table.grow(4);
@@ -19,10 +77,96 @@ function __wbg_get_imports() {
19
77
  };
20
78
  }
21
79
 
80
+ function getStringFromWasm0(ptr, len) {
81
+ ptr = ptr >>> 0;
82
+ return decodeText(ptr, len);
83
+ }
84
+
85
+ let cachedUint8ArrayMemory0 = null;
86
+ function getUint8ArrayMemory0() {
87
+ if (cachedUint8ArrayMemory0 === null || cachedUint8ArrayMemory0.byteLength === 0) {
88
+ cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer);
89
+ }
90
+ return cachedUint8ArrayMemory0;
91
+ }
92
+
93
+ function passStringToWasm0(arg, malloc, realloc) {
94
+ if (realloc === undefined) {
95
+ const buf = cachedTextEncoder.encode(arg);
96
+ const ptr = malloc(buf.length, 1) >>> 0;
97
+ getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf);
98
+ WASM_VECTOR_LEN = buf.length;
99
+ return ptr;
100
+ }
101
+
102
+ let len = arg.length;
103
+ let ptr = malloc(len, 1) >>> 0;
104
+
105
+ const mem = getUint8ArrayMemory0();
106
+
107
+ let offset = 0;
108
+
109
+ for (; offset < len; offset++) {
110
+ const code = arg.charCodeAt(offset);
111
+ if (code > 0x7F) break;
112
+ mem[ptr + offset] = code;
113
+ }
114
+ if (offset !== len) {
115
+ if (offset !== 0) {
116
+ arg = arg.slice(offset);
117
+ }
118
+ ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0;
119
+ const view = getUint8ArrayMemory0().subarray(ptr + offset, ptr + len);
120
+ const ret = cachedTextEncoder.encodeInto(arg, view);
121
+
122
+ offset += ret.written;
123
+ ptr = realloc(ptr, len, offset, 1) >>> 0;
124
+ }
125
+
126
+ WASM_VECTOR_LEN = offset;
127
+ return ptr;
128
+ }
129
+
130
+ function takeFromExternrefTable0(idx) {
131
+ const value = wasm.__wbindgen_externrefs.get(idx);
132
+ wasm.__externref_table_dealloc(idx);
133
+ return value;
134
+ }
135
+
136
+ let cachedTextDecoder = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });
137
+ cachedTextDecoder.decode();
138
+ const MAX_SAFARI_DECODE_BYTES = 2146435072;
139
+ let numBytesDecoded = 0;
140
+ function decodeText(ptr, len) {
141
+ numBytesDecoded += len;
142
+ if (numBytesDecoded >= MAX_SAFARI_DECODE_BYTES) {
143
+ cachedTextDecoder = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });
144
+ cachedTextDecoder.decode();
145
+ numBytesDecoded = len;
146
+ }
147
+ return cachedTextDecoder.decode(getUint8ArrayMemory0().subarray(ptr, ptr + len));
148
+ }
149
+
150
+ const cachedTextEncoder = new TextEncoder();
151
+
152
+ if (!('encodeInto' in cachedTextEncoder)) {
153
+ cachedTextEncoder.encodeInto = function (arg, view) {
154
+ const buf = cachedTextEncoder.encode(arg);
155
+ view.set(buf);
156
+ return {
157
+ read: arg.length,
158
+ written: buf.length
159
+ };
160
+ };
161
+ }
162
+
163
+ let WASM_VECTOR_LEN = 0;
164
+
22
165
  let wasmModule, wasm;
23
166
  function __wbg_finalize_init(instance, module) {
24
167
  wasm = instance.exports;
25
168
  wasmModule = module;
169
+ cachedUint8ArrayMemory0 = null;
26
170
  wasm.__wbindgen_start();
27
171
  return wasm;
28
172
  }
package/html2json_bg.wasm CHANGED
Binary file
package/index.js ADDED
@@ -0,0 +1,27 @@
1
+ // Auto-initializing wrapper for convenience
2
+ import _init from "./html2json.js";
3
+ import { extract as _extract, initSync } from "./html2json.js";
4
+
5
+ let initPromise;
6
+
7
+ // Auto-init on first call
8
+ function init() {
9
+ if (!initPromise) {
10
+ initPromise = _init();
11
+ }
12
+ return initPromise;
13
+ }
14
+
15
+ // Export auto-initializing extract (always returns a Promise; the wasm module is initialized on the first call)
16
+ export async function extract(html, spec_json) {
17
+ await init();
18
+ return _extract(html, spec_json);
19
+ }
20
+
21
+ // Also export init functions for those who want to control timing
22
+ export { init, initSync };
23
+
24
+ // Export raw extract for advanced use (sync, requires manual init)
25
+ export { _extract as extractSync };
26
+
27
+ export default init;
package/package.json CHANGED
@@ -5,7 +5,7 @@
5
5
  "Qreta Dev <qretadev@gmail.com>"
6
6
  ],
7
7
  "description": "HTML to JSON extractor",
8
- "version": "0.5.4",
8
+ "version": "0.5.6",
9
9
  "license": "MIT",
10
10
  "repository": {
11
11
  "type": "git",
@@ -14,11 +14,12 @@
14
14
  "files": [
15
15
  "html2json_bg.wasm",
16
16
  "html2json.js",
17
- "html2json.d.ts"
17
+ "html2json.d.ts",
18
+ "index.js"
18
19
  ],
19
- "main": "html2json.js",
20
+ "main": "index.js",
20
21
  "types": "html2json.d.ts",
21
22
  "sideEffects": [
22
23
  "./snippets/*"
23
24
  ]
24
- }
25
+ }