html-json-extractor 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +45 -0
- package/dist/index.cjs +319 -0
- package/dist/index.d.cts +15 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +317 -0
- package/package.json +54 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 VastBlast
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# html-json-extractor
|
|
2
|
+
|
|
3
|
+
Fast, forgiving extraction of `<script type="application/ld+json">` blocks from an HTML string.
|
|
4
|
+
|
|
5
|
+
- No DOM parser or runtime dependencies
|
|
6
|
+
- Returns one result per non-empty matching script block, in document order
|
|
7
|
+
- Malformed JSON-LD blocks do not break the rest
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
npm install html-json-extractor
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Usage
|
|
16
|
+
|
|
17
|
+
```ts
|
|
18
|
+
import { extractJsonLd, extractJsonLdStrings } from 'html-json-extractor';
|
|
19
|
+
|
|
20
|
+
const html = `
|
|
21
|
+
<script type="application/ld+json">{"@type":"WebSite","name":"Example"}</script>
|
|
22
|
+
<script type="application/ld+json">{"broken":</script>
|
|
23
|
+
<script type="application/ld+json">[{"@type":"Person","name":"Ada"}]</script>
|
|
24
|
+
`;
|
|
25
|
+
|
|
26
|
+
const raw = extractJsonLdStrings(html);
|
|
27
|
+
// ['{"@type":"WebSite","name":"Example"}', '{"broken":', '[{"@type":"Person","name":"Ada"}]']
|
|
28
|
+
|
|
29
|
+
const parsed = extractJsonLd(html);
|
|
30
|
+
// [{ '@type': 'WebSite', name: 'Example' }, [{ '@type': 'Person', name: 'Ada' }]]
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## API
|
|
34
|
+
|
|
35
|
+
### `extractJsonLdStrings(html: string): string[]`
|
|
36
|
+
|
|
37
|
+
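Returns the contents of each JSON-LD script block as a string, normalized by trimming surrounding whitespace and stripping a leading byte-order mark, wrapping `<!--` / `-->` or `<![CDATA[` / `]]>` markers, and trailing semicolons. Blocks that are empty after normalization are omitted.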
|
|
38
|
+
|
|
39
|
+
### `extractJsonLd<T extends JsonValue = JsonValue>(html: string): T[]`
|
|
40
|
+
|
|
41
|
+
Parses the extracted strings with `JSON.parse`. Entries that fail to parse are skipped.
|
|
42
|
+
|
|
43
|
+
## License
|
|
44
|
+
|
|
45
|
+
MIT
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });
|
|
2
|
+
//#region src/scan.ts
|
|
3
|
+
/** Lower-case name of the element whose contents are extracted. */
const SCRIPT_TAG_NAME = "script";
/** Lower-case MIME type that marks a script block as JSON-LD. */
const APPLICATION_LD_JSON_MIME = "application/ld+json";

// UTF-16 code units compared against `charCodeAt` results, in ascending order.
const CHAR_TAB = 0x09;
const CHAR_LINE_FEED = 0x0a;
const CHAR_FORM_FEED = 0x0c;
const CHAR_CARRIAGE_RETURN = 0x0d;
const CHAR_SPACE = 0x20;
const CHAR_EXCLAMATION_MARK = 0x21;
const CHAR_DOUBLE_QUOTE = 0x22;
const CHAR_APOSTROPHE = 0x27;
const CHAR_SLASH = 0x2f;
const CHAR_SEMICOLON = 0x3b;
const CHAR_EQUALS = 0x3d;
const CHAR_GREATER_THAN = 0x3e;
const CHAR_BYTE_ORDER_MARK = 0xfeff;

/**
 * Elements whose contents the scanner skips instead of searching for tags.
 * `isTerminal` marks a container after which scanning stops altogether.
 */
const TEXT_LITERAL_CONTAINERS = [
	{ tagName: "style", isTerminal: false },
	{ tagName: "textarea", isTerminal: false },
	{ tagName: "title", isTerminal: false },
	{ tagName: "xmp", isTerminal: false },
	{ tagName: "noembed", isTerminal: false },
	{ tagName: "noframes", isTerminal: false },
	{ tagName: "plaintext", isTerminal: true }
];
|
|
48
|
+
/**
 * Collects the normalized text of every JSON-LD script block in `html`.
 *
 * Script tags of any type are consumed so their bodies are never scanned for
 * nested tags, but only blocks whose open tag carries the JSON-LD type are
 * reported. Blocks that normalize to an empty string are skipped.
 *
 * @param html - Markup to scan.
 * @returns Normalized block contents in document order.
 */
function extractJsonLdStrings(html) {
	const blocks = [];
	const total = html.length;
	let position = 0;
	while (position < total) {
		const scriptStart = findNextScriptOpenTag(html, position);
		if (scriptStart === -1) return blocks;
		const parsedTag = parseScriptOpenTag(html, scriptStart);
		// The open tag never closes, so nothing after it can be a block.
		if (parsedTag === null) return blocks;
		const bodyStart = parsedTag.tagEnd + 1;
		const closerStart = findNextCloseTag(html, bodyStart, SCRIPT_TAG_NAME);
		const unterminated = closerStart === -1;
		if (parsedTag.isJsonLd) {
			// An unterminated block runs to the end of the input.
			const bodyEnd = unterminated ? total : closerStart;
			const body = normalizeJsonLdContent(html.slice(bodyStart, bodyEnd));
			if (body.length > 0) blocks.push(body);
		}
		if (unterminated) return blocks;
		// Resume after the ">" that ends "</script".
		const closerEnd = findTagEnd(html, closerStart + 2 + SCRIPT_TAG_NAME.length);
		position = closerEnd === -1 ? total : closerEnd + 1;
	}
	return blocks;
}
|
|
70
|
+
/**
 * Finds the next `<script` open tag at or after `start`.
 *
 * HTML comments and the elements listed in TEXT_LITERAL_CONTAINERS are
 * skipped, and a `<` that sits inside another tag's quoted attribute value is
 * ignored.
 *
 * @param html - Markup to scan.
 * @param start - Index at which scanning begins.
 * @returns Index of the `<` that opens the script tag, or -1 when none is
 * found, when a comment or skipped container is never closed, or when a
 * terminal container (`plaintext`) is reached.
 */
function findNextScriptOpenTag(html, start) {
	let cursor = html.indexOf("<", start);
	while (cursor !== -1) {
		const next = cursor + 1;
		if (next >= html.length) return -1;
		if (isHtmlCommentStart(html, next)) {
			// Search from the first dash of "<!--" so the abruptly closed empty
			// comments "<!-->" and "<!--->" end immediately, as the HTML
			// tokenizer treats them, instead of swallowing markup up to a later
			// "-->".
			const commentEnd = html.indexOf("-->", next + 1);
			if (commentEnd === -1) return -1;
			cursor = html.indexOf("<", commentEnd + 3);
			continue;
		}
		const firstTagChar = toLowerAsciiCode(html.charCodeAt(next));
		// Only n, p, s, t and x can begin "script" or a skipped container name.
		if (firstTagChar !== 110 && firstTagChar !== 112 && firstTagChar !== 115 && firstTagChar !== 116 && firstTagChar !== 120) {
			cursor = html.indexOf("<", cursor + 1);
			continue;
		}
		if (isInsideOpenTag(html, cursor)) {
			cursor = html.indexOf("<", cursor + 1);
			continue;
		}
		if (firstTagChar === 115 && matchesTagName(html, next, SCRIPT_TAG_NAME)) return cursor;
		const skippedContainer = getSkippedTextContainer(html, next, firstTagChar);
		if (skippedContainer !== null) {
			const openTagEnd = findTagEndRespectingQuotes(html, next + skippedContainer.tagName.length);
			if (openTagEnd === -1) return -1;
			// Everything after a terminal container is text, so stop scanning.
			if (skippedContainer.isTerminal) return -1;
			const closeTagStart = findNextCloseTag(html, openTagEnd + 1, skippedContainer.tagName);
			if (closeTagStart === -1) return -1;
			// Skip "</" plus the tag name, then resume after the closing ">".
			const closeTagEnd = findTagEnd(html, closeTagStart + 2 + skippedContainer.tagName.length);
			cursor = closeTagEnd === -1 ? -1 : html.indexOf("<", closeTagEnd + 1);
			continue;
		}
		cursor = html.indexOf("<", cursor + 1);
	}
	return -1;
}
|
|
106
|
+
/**
 * Walks the attributes of a `<script` open tag.
 *
 * @param html - Markup being scanned.
 * @param openTagStart - Index of the `<` that opens the script tag.
 * @returns `{ isJsonLd, tagEnd }`, where `tagEnd` is the index of the closing
 * `>` and `isJsonLd` is true when any `type` attribute holds the JSON-LD MIME
 * type; `null` when the input ends before the tag is closed.
 */
function parseScriptOpenTag(html, openTagStart) {
	const limit = html.length;
	let sawJsonLdType = false;
	// Step over "<" and the tag name.
	let pos = openTagStart + 1 + SCRIPT_TAG_NAME.length;
	while (pos < limit) {
		pos = skipLeadingHtmlWhitespace(html, pos, limit);
		if (pos >= limit) break;
		const current = html.charCodeAt(pos);
		if (current === CHAR_GREATER_THAN) return {
			isJsonLd: sawJsonLdType,
			tagEnd: pos
		};
		if (current === CHAR_SLASH || isHtmlWhitespace(current)) {
			pos += 1;
			continue;
		}
		// Attribute name: runs until whitespace, "=", ">" or "/".
		const nameStart = pos;
		for (; pos < limit; pos += 1) {
			const nameCode = html.charCodeAt(pos);
			if (nameCode === CHAR_EQUALS || nameCode === CHAR_GREATER_THAN || nameCode === CHAR_SLASH || isHtmlWhitespace(nameCode)) break;
		}
		const isTypeAttribute = pos - nameStart === 4 && matchesAsciiLiteral(html, nameStart, "type");
		pos = skipLeadingHtmlWhitespace(html, pos, limit);
		// No "=" follows, so the attribute has no value.
		if (pos >= limit || html.charCodeAt(pos) !== CHAR_EQUALS) continue;
		pos = skipLeadingHtmlWhitespace(html, pos + 1, limit);
		if (pos >= limit) break;
		const opener = html.charCodeAt(pos);
		let valueStart;
		let valueEnd;
		if (opener === CHAR_DOUBLE_QUOTE || opener === CHAR_APOSTROPHE) {
			// Quoted value: ends at the matching quote, or at end of input.
			valueStart = pos + 1;
			const closing = html.indexOf(String.fromCharCode(opener), valueStart);
			valueEnd = closing === -1 ? limit : closing;
			pos = closing === -1 ? limit : closing + 1;
		} else {
			// Unquoted value: ends at whitespace or ">".
			valueStart = pos;
			while (pos < limit) {
				const valueCode = html.charCodeAt(pos);
				if (valueCode === CHAR_GREATER_THAN || isHtmlWhitespace(valueCode)) break;
				pos += 1;
			}
			valueEnd = pos;
		}
		if (isTypeAttribute && isApplicationLdJsonMime(html, valueStart, valueEnd)) sawJsonLdType = true;
	}
	return null;
}
|
|
154
|
+
/**
 * Strips wrappers that commonly surround JSON-LD inside a script element.
 *
 * Repeatedly trims HTML whitespace and removes, one per pass and in this
 * priority order: a leading byte-order mark, a leading `<!--`, a leading
 * `<![CDATA[`, a trailing `-->`, a trailing `]]>`, and a trailing `;`.
 *
 * @param content - Raw text between the script tags.
 * @returns The remaining text, or `""` when nothing is left.
 */
function normalizeJsonLdContent(content) {
	let from = 0;
	let to = content.length;
	for (;;) {
		from = skipLeadingHtmlWhitespace(content, from, to);
		to = skipTrailingHtmlWhitespace(content, from, to);
		if (from >= to) return "";
		if (content.charCodeAt(from) === CHAR_BYTE_ORDER_MARK) from += 1;
		else if (content.startsWith("<!--", from)) from += 4;
		else if (content.startsWith("<![CDATA[", from)) from += 9;
		else if (to - from >= 3 && content.endsWith("-->", to)) to -= 3;
		else if (to - from >= 3 && content.endsWith("]]>", to)) to -= 3;
		else if (content.charCodeAt(to - 1) === CHAR_SEMICOLON) to -= 1;
		else break;
	}
	return content.slice(from, to);
}
|
|
200
|
+
/**
 * Locates the first `>` at or after `start`, ignoring quoting.
 *
 * @param html - Markup being scanned.
 * @param start - Index at which the search begins.
 * @returns Index of the `>`, or -1 when there is none.
 */
function findTagEnd(html, start) {
	const closingBracketIndex = html.indexOf(">", start);
	return closingBracketIndex;
}
|
|
203
|
+
/**
 * Locates the first `>` at or after `start` that is not inside a single- or
 * double-quoted run.
 *
 * @param html - Markup being scanned.
 * @param start - Index at which the search begins.
 * @returns Index of the unquoted `>`, or -1 when the input ends first.
 */
function findTagEndRespectingQuotes(html, start) {
	// Code unit of the quote currently open, or 0 when outside quotes.
	let openQuote = 0;
	let index = start;
	while (index < html.length) {
		const ch = html.charCodeAt(index);
		if (openQuote === 0) {
			if (ch === CHAR_GREATER_THAN) return index;
			if (ch === CHAR_DOUBLE_QUOTE || ch === CHAR_APOSTROPHE) openQuote = ch;
		} else if (ch === openQuote) {
			openQuote = 0;
		}
		index += 1;
	}
	return -1;
}
|
|
219
|
+
/**
 * Finds the next `</tagName` at or after `start` (ASCII case-insensitive).
 *
 * @param html - Markup being scanned.
 * @param start - Index at which the search begins.
 * @param tagName - Lower-case tag name to match.
 * @returns Index of the `<` that opens the close tag, or -1 when none exists.
 */
function findNextCloseTag(html, start, tagName) {
	for (let lt = html.indexOf("<", start); lt !== -1; lt = html.indexOf("<", lt + 1)) {
		const afterLt = lt + 1;
		if (afterLt >= html.length) continue;
		if (html.charCodeAt(afterLt) !== CHAR_SLASH) continue;
		if (matchesTagName(html, afterLt + 1, tagName)) return lt;
	}
	return -1;
}
|
|
228
|
+
/**
 * Tests whether `value[start, end)` is the JSON-LD MIME type.
 *
 * Surrounding HTML whitespace is ignored and the match is ASCII
 * case-insensitive. The MIME type may stand alone or be followed, optionally
 * after whitespace, by `;` (for example a parameter list).
 *
 * @param value - String holding the attribute value.
 * @param start - Inclusive start index of the value.
 * @param end - Exclusive end index of the value.
 * @returns True when the value names the JSON-LD MIME type.
 */
function isApplicationLdJsonMime(value, start, end) {
	const mimeLength = APPLICATION_LD_JSON_MIME.length;
	const from = skipLeadingHtmlWhitespace(value, start, end);
	const to = skipTrailingHtmlWhitespace(value, from, end);
	const span = to - from;
	if (span < mimeLength) return false;
	if (!matchesAsciiLiteral(value, from, APPLICATION_LD_JSON_MIME)) return false;
	if (span === mimeLength) return true;
	let index = from + mimeLength;
	const following = value.charCodeAt(index);
	if (following === CHAR_SEMICOLON) return true;
	if (!isHtmlWhitespace(following)) return false;
	// After whitespace, the first non-whitespace character must be ";".
	while (index < to) {
		const code = value.charCodeAt(index);
		if (!isHtmlWhitespace(code)) return code === CHAR_SEMICOLON;
		index += 1;
	}
	return true;
}
|
|
244
|
+
/**
 * Tests whether a complete tag name begins at `start`.
 *
 * The name must match ASCII case-insensitively and be followed by `>`, `/`,
 * HTML whitespace, or the end of the input.
 *
 * @param source - Markup being scanned.
 * @param start - Index of the first character of the candidate name.
 * @param expectedTagName - Lower-case tag name to match.
 * @returns True when the tag name matches and ends at a boundary.
 */
function matchesTagName(source, start, expectedTagName) {
	if (!matchesAsciiLiteral(source, start, expectedTagName)) return false;
	const after = start + expectedTagName.length;
	if (after >= source.length) return true;
	const boundary = source.charCodeAt(after);
	switch (boundary) {
		case CHAR_GREATER_THAN:
		case CHAR_SLASH:
			return true;
		default:
			return isHtmlWhitespace(boundary);
	}
}
|
|
251
|
+
/**
 * Compares `source` at `start` with `expected`, folding A-Z in `source` to
 * lower case first. `expected` is compared as given.
 *
 * @param source - String being scanned.
 * @param start - Index in `source` where the comparison begins.
 * @param expected - Literal to match.
 * @returns True when every character matches and `source` is long enough.
 */
function matchesAsciiLiteral(source, start, expected) {
	const length = expected.length;
	if (source.length < start + length) return false;
	let offset = 0;
	while (offset < length) {
		const folded = toLowerAsciiCode(source.charCodeAt(start + offset));
		if (folded !== expected.charCodeAt(offset)) return false;
		offset += 1;
	}
	return true;
}
|
|
256
|
+
/**
 * Tests whether `!--` begins at `start`, i.e. the part of `<!--` after `<`.
 *
 * @param source - Markup being scanned.
 * @param start - Index just after a `<`.
 * @returns True when the three characters at `start` are `!--`.
 */
function isHtmlCommentStart(source, start) {
	if (start + 2 >= source.length) return false;
	if (source.charCodeAt(start) !== CHAR_EXCLAMATION_MARK) return false;
	// 45 is the code unit of "-".
	const hyphen = 45;
	return source.charCodeAt(start + 1) === hyphen && source.charCodeAt(start + 2) === hyphen;
}
|
|
259
|
+
/**
 * Tests whether a code unit is space, tab, line feed, carriage return or
 * form feed.
 *
 * @param code - UTF-16 code unit.
 * @returns True for the five whitespace code units listed above.
 */
function isHtmlWhitespace(code) {
	switch (code) {
		case CHAR_SPACE:
		case CHAR_TAB:
		case CHAR_LINE_FEED:
		case CHAR_CARRIAGE_RETURN:
		case CHAR_FORM_FEED:
			return true;
		default:
			return false;
	}
}
|
|
262
|
+
/**
 * Folds an upper-case ASCII letter code (65-90) to lower case.
 *
 * @param code - UTF-16 code unit.
 * @returns `code + 32` for A-Z; every other value is returned unchanged.
 */
function toLowerAsciiCode(code) {
	const isUpperCaseLetter = code >= 65 && code <= 90;
	if (isUpperCaseLetter) return code + 32;
	return code;
}
|
|
265
|
+
/**
 * Decides whether the `<` at `candidateStart` lies inside an earlier,
 * still-open tag (for example within a quoted attribute value).
 *
 * Scans backwards to the nearest `<` or `>`. If a `<` that starts a tag comes
 * first, scans forward from it, respecting quotes, to check that no unquoted
 * `>` closes that tag before the candidate.
 *
 * @param html - Markup being scanned.
 * @param candidateStart - Index of the `<` under consideration.
 * @returns True when the candidate sits inside another tag.
 */
function isInsideOpenTag(html, candidateStart) {
	let previousLt = -1;
	for (let back = candidateStart - 1; back >= 0; back -= 1) {
		const code = html.charCodeAt(back);
		if (code === CHAR_GREATER_THAN) return false;
		// 60 is the code unit of "<".
		if (code === 60) {
			previousLt = back;
			break;
		}
	}
	if (previousLt === -1) return false;
	const firstInside = previousLt + 1;
	if (firstInside >= html.length) return false;
	// A "<" not followed by a tag-start character is plain text.
	if (!isTagStartChar(html.charCodeAt(firstInside))) return false;
	let openQuote = 0;
	for (let scan = firstInside; scan < candidateStart; scan += 1) {
		const code = html.charCodeAt(scan);
		if (openQuote === 0) {
			if (code === CHAR_GREATER_THAN) return false;
			if (code === CHAR_DOUBLE_QUOTE || code === CHAR_APOSTROPHE) openQuote = code;
		} else if (code === openQuote) {
			openQuote = 0;
		}
	}
	return true;
}
|
|
289
|
+
/**
 * Looks up the skipped container, if any, whose tag name begins at `start`.
 *
 * @param html - Markup being scanned.
 * @param start - Index of the first character after `<`.
 * @param firstTagChar - Lower-cased code unit at `start`, used as a cheap
 * pre-filter before the full name comparison.
 * @returns The matching TEXT_LITERAL_CONTAINERS entry, or `null`.
 */
function getSkippedTextContainer(html, start, firstTagChar) {
	const match = TEXT_LITERAL_CONTAINERS.find((candidate) => candidate.tagName.charCodeAt(0) === firstTagChar && matchesTagName(html, start, candidate.tagName));
	return match === undefined ? null : match;
}
|
|
296
|
+
/**
 * Advances past HTML whitespace at the front of `source[start, end)`.
 *
 * @param source - String being scanned.
 * @param start - Inclusive start index.
 * @param end - Exclusive end index.
 * @returns Index of the first non-whitespace character, or `end` when the
 * whole range is whitespace (`start` when the range is empty).
 */
function skipLeadingHtmlWhitespace(source, start, end) {
	let index = start;
	for (; index < end; index += 1) {
		if (!isHtmlWhitespace(source.charCodeAt(index))) break;
	}
	return index;
}
|
|
300
|
+
/**
 * Retreats past HTML whitespace at the back of `source[start, end)`.
 *
 * @param source - String being scanned.
 * @param start - Inclusive start index; the result never drops below it
 * unless `end` already does.
 * @param end - Exclusive end index.
 * @returns The exclusive end index with trailing whitespace removed.
 */
function skipTrailingHtmlWhitespace(source, start, end) {
	let boundary = end;
	for (; boundary > start; boundary -= 1) {
		if (!isHtmlWhitespace(source.charCodeAt(boundary - 1))) break;
	}
	return boundary;
}
|
|
304
|
+
/**
 * Tests whether a code unit is `!`, `/`, `?` or an ASCII letter, the
 * characters this scanner accepts directly after `<` as starting a tag.
 *
 * @param code - UTF-16 code unit found directly after `<`.
 * @returns True when the code unit is one of the characters listed above.
 */
function isTagStartChar(code) {
	// 63 is the code unit of "?".
	if (code === CHAR_EXCLAMATION_MARK || code === CHAR_SLASH || code === 63) return true;
	const isUpperCaseLetter = code >= 65 && code <= 90;
	const isLowerCaseLetter = code >= 97 && code <= 122;
	return isUpperCaseLetter || isLowerCaseLetter;
}
|
|
307
|
+
//#endregion
|
|
308
|
+
//#region src/parse.ts
|
|
309
|
+
/**
 * Extracts every JSON-LD script block from `html` and parses it.
 *
 * @param html - Markup to scan.
 * @returns The `JSON.parse` result of each block, in document order. Blocks
 * that fail to parse are left out.
 */
function extractJsonLd(html) {
	const parsed = [];
	extractJsonLdStrings(html).forEach((rawEntry) => {
		try {
			parsed.push(JSON.parse(rawEntry));
		} catch {
			// Deliberately best-effort: a malformed block is skipped.
		}
	});
	return parsed;
}
|
|
317
|
+
//#endregion
|
|
318
|
+
exports.extractJsonLd = extractJsonLd;
|
|
319
|
+
exports.extractJsonLdStrings = extractJsonLdStrings;
|
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
//#region src/types.d.ts
|
|
2
|
+
/** A JSON scalar. */
type JsonPrimitive = string | number | boolean | null;
/** A JSON array whose elements are any JSON value. */
type JsonArray = Array<JsonValue>;
/** A JSON object: string keys mapped to any JSON value. */
interface JsonObject {
	[key: string]: JsonValue;
}
/** Any value that `JSON.parse` can produce. */
type JsonValue = JsonPrimitive | JsonArray | JsonObject;
|
|
8
|
+
//#endregion
|
|
9
|
+
//#region src/parse.d.ts
|
|
10
|
+
/**
 * Extracts every JSON-LD script block from `html` and parses it with
 * `JSON.parse`. Blocks that fail to parse are skipped.
 *
 * `T` is not checked at runtime; it only describes the shape the caller
 * expects.
 */
declare function extractJsonLd<T extends JsonValue = JsonValue>(html: string): Array<T>;
|
|
11
|
+
//#endregion
|
|
12
|
+
//#region src/scan.d.ts
|
|
13
|
+
/**
 * Returns the normalized text of every non-empty JSON-LD script block in
 * `html`, in document order, without parsing it.
 */
declare function extractJsonLdStrings(html: string): Array<string>;
|
|
14
|
+
//#endregion
|
|
15
|
+
export { type JsonArray, type JsonObject, type JsonPrimitive, type JsonValue, extractJsonLd, extractJsonLdStrings };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
//#region src/types.d.ts
|
|
2
|
+
/** A JSON scalar. */
type JsonPrimitive = string | number | boolean | null;
/** A JSON array whose elements are any JSON value. */
type JsonArray = Array<JsonValue>;
/** A JSON object: string keys mapped to any JSON value. */
interface JsonObject {
	[key: string]: JsonValue;
}
/** Any value that `JSON.parse` can produce. */
type JsonValue = JsonPrimitive | JsonArray | JsonObject;
|
|
8
|
+
//#endregion
|
|
9
|
+
//#region src/parse.d.ts
|
|
10
|
+
/**
 * Extracts every JSON-LD script block from `html` and parses it with
 * `JSON.parse`. Blocks that fail to parse are skipped.
 *
 * `T` is not checked at runtime; it only describes the shape the caller
 * expects.
 */
declare function extractJsonLd<T extends JsonValue = JsonValue>(html: string): Array<T>;
|
|
11
|
+
//#endregion
|
|
12
|
+
//#region src/scan.d.ts
|
|
13
|
+
/**
 * Returns the normalized text of every non-empty JSON-LD script block in
 * `html`, in document order, without parsing it.
 */
declare function extractJsonLdStrings(html: string): Array<string>;
|
|
14
|
+
//#endregion
|
|
15
|
+
export { type JsonArray, type JsonObject, type JsonPrimitive, type JsonValue, extractJsonLd, extractJsonLdStrings };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
//#region src/scan.ts
|
|
2
|
+
/** Lower-case name of the element whose contents are extracted. */
const SCRIPT_TAG_NAME = "script";
/** Lower-case MIME type that marks a script block as JSON-LD. */
const APPLICATION_LD_JSON_MIME = "application/ld+json";

// UTF-16 code units compared against `charCodeAt` results, in ascending order.
const CHAR_TAB = 0x09;
const CHAR_LINE_FEED = 0x0a;
const CHAR_FORM_FEED = 0x0c;
const CHAR_CARRIAGE_RETURN = 0x0d;
const CHAR_SPACE = 0x20;
const CHAR_EXCLAMATION_MARK = 0x21;
const CHAR_DOUBLE_QUOTE = 0x22;
const CHAR_APOSTROPHE = 0x27;
const CHAR_SLASH = 0x2f;
const CHAR_SEMICOLON = 0x3b;
const CHAR_EQUALS = 0x3d;
const CHAR_GREATER_THAN = 0x3e;
const CHAR_BYTE_ORDER_MARK = 0xfeff;

/**
 * Elements whose contents the scanner skips instead of searching for tags.
 * `isTerminal` marks a container after which scanning stops altogether.
 */
const TEXT_LITERAL_CONTAINERS = [
	{ tagName: "style", isTerminal: false },
	{ tagName: "textarea", isTerminal: false },
	{ tagName: "title", isTerminal: false },
	{ tagName: "xmp", isTerminal: false },
	{ tagName: "noembed", isTerminal: false },
	{ tagName: "noframes", isTerminal: false },
	{ tagName: "plaintext", isTerminal: true }
];
|
|
47
|
+
/**
 * Collects the normalized text of every JSON-LD script block in `html`.
 *
 * Script tags of any type are consumed so their bodies are never scanned for
 * nested tags, but only blocks whose open tag carries the JSON-LD type are
 * reported. Blocks that normalize to an empty string are skipped.
 *
 * @param html - Markup to scan.
 * @returns Normalized block contents in document order.
 */
function extractJsonLdStrings(html) {
	const blocks = [];
	const total = html.length;
	let position = 0;
	while (position < total) {
		const scriptStart = findNextScriptOpenTag(html, position);
		if (scriptStart === -1) return blocks;
		const parsedTag = parseScriptOpenTag(html, scriptStart);
		// The open tag never closes, so nothing after it can be a block.
		if (parsedTag === null) return blocks;
		const bodyStart = parsedTag.tagEnd + 1;
		const closerStart = findNextCloseTag(html, bodyStart, SCRIPT_TAG_NAME);
		const unterminated = closerStart === -1;
		if (parsedTag.isJsonLd) {
			// An unterminated block runs to the end of the input.
			const bodyEnd = unterminated ? total : closerStart;
			const body = normalizeJsonLdContent(html.slice(bodyStart, bodyEnd));
			if (body.length > 0) blocks.push(body);
		}
		if (unterminated) return blocks;
		// Resume after the ">" that ends "</script".
		const closerEnd = findTagEnd(html, closerStart + 2 + SCRIPT_TAG_NAME.length);
		position = closerEnd === -1 ? total : closerEnd + 1;
	}
	return blocks;
}
|
|
69
|
+
/**
 * Finds the next `<script` open tag at or after `start`.
 *
 * HTML comments and the elements listed in TEXT_LITERAL_CONTAINERS are
 * skipped, and a `<` that sits inside another tag's quoted attribute value is
 * ignored.
 *
 * @param html - Markup to scan.
 * @param start - Index at which scanning begins.
 * @returns Index of the `<` that opens the script tag, or -1 when none is
 * found, when a comment or skipped container is never closed, or when a
 * terminal container (`plaintext`) is reached.
 */
function findNextScriptOpenTag(html, start) {
	let cursor = html.indexOf("<", start);
	while (cursor !== -1) {
		const next = cursor + 1;
		if (next >= html.length) return -1;
		if (isHtmlCommentStart(html, next)) {
			// Search from the first dash of "<!--" so the abruptly closed empty
			// comments "<!-->" and "<!--->" end immediately, as the HTML
			// tokenizer treats them, instead of swallowing markup up to a later
			// "-->".
			const commentEnd = html.indexOf("-->", next + 1);
			if (commentEnd === -1) return -1;
			cursor = html.indexOf("<", commentEnd + 3);
			continue;
		}
		const firstTagChar = toLowerAsciiCode(html.charCodeAt(next));
		// Only n, p, s, t and x can begin "script" or a skipped container name.
		if (firstTagChar !== 110 && firstTagChar !== 112 && firstTagChar !== 115 && firstTagChar !== 116 && firstTagChar !== 120) {
			cursor = html.indexOf("<", cursor + 1);
			continue;
		}
		if (isInsideOpenTag(html, cursor)) {
			cursor = html.indexOf("<", cursor + 1);
			continue;
		}
		if (firstTagChar === 115 && matchesTagName(html, next, SCRIPT_TAG_NAME)) return cursor;
		const skippedContainer = getSkippedTextContainer(html, next, firstTagChar);
		if (skippedContainer !== null) {
			const openTagEnd = findTagEndRespectingQuotes(html, next + skippedContainer.tagName.length);
			if (openTagEnd === -1) return -1;
			// Everything after a terminal container is text, so stop scanning.
			if (skippedContainer.isTerminal) return -1;
			const closeTagStart = findNextCloseTag(html, openTagEnd + 1, skippedContainer.tagName);
			if (closeTagStart === -1) return -1;
			// Skip "</" plus the tag name, then resume after the closing ">".
			const closeTagEnd = findTagEnd(html, closeTagStart + 2 + skippedContainer.tagName.length);
			cursor = closeTagEnd === -1 ? -1 : html.indexOf("<", closeTagEnd + 1);
			continue;
		}
		cursor = html.indexOf("<", cursor + 1);
	}
	return -1;
}
|
|
105
|
+
/**
 * Walks the attributes of a `<script` open tag.
 *
 * @param html - Markup being scanned.
 * @param openTagStart - Index of the `<` that opens the script tag.
 * @returns `{ isJsonLd, tagEnd }`, where `tagEnd` is the index of the closing
 * `>` and `isJsonLd` is true when any `type` attribute holds the JSON-LD MIME
 * type; `null` when the input ends before the tag is closed.
 */
function parseScriptOpenTag(html, openTagStart) {
	const limit = html.length;
	let sawJsonLdType = false;
	// Step over "<" and the tag name.
	let pos = openTagStart + 1 + SCRIPT_TAG_NAME.length;
	while (pos < limit) {
		pos = skipLeadingHtmlWhitespace(html, pos, limit);
		if (pos >= limit) break;
		const current = html.charCodeAt(pos);
		if (current === CHAR_GREATER_THAN) return {
			isJsonLd: sawJsonLdType,
			tagEnd: pos
		};
		if (current === CHAR_SLASH || isHtmlWhitespace(current)) {
			pos += 1;
			continue;
		}
		// Attribute name: runs until whitespace, "=", ">" or "/".
		const nameStart = pos;
		for (; pos < limit; pos += 1) {
			const nameCode = html.charCodeAt(pos);
			if (nameCode === CHAR_EQUALS || nameCode === CHAR_GREATER_THAN || nameCode === CHAR_SLASH || isHtmlWhitespace(nameCode)) break;
		}
		const isTypeAttribute = pos - nameStart === 4 && matchesAsciiLiteral(html, nameStart, "type");
		pos = skipLeadingHtmlWhitespace(html, pos, limit);
		// No "=" follows, so the attribute has no value.
		if (pos >= limit || html.charCodeAt(pos) !== CHAR_EQUALS) continue;
		pos = skipLeadingHtmlWhitespace(html, pos + 1, limit);
		if (pos >= limit) break;
		const opener = html.charCodeAt(pos);
		let valueStart;
		let valueEnd;
		if (opener === CHAR_DOUBLE_QUOTE || opener === CHAR_APOSTROPHE) {
			// Quoted value: ends at the matching quote, or at end of input.
			valueStart = pos + 1;
			const closing = html.indexOf(String.fromCharCode(opener), valueStart);
			valueEnd = closing === -1 ? limit : closing;
			pos = closing === -1 ? limit : closing + 1;
		} else {
			// Unquoted value: ends at whitespace or ">".
			valueStart = pos;
			while (pos < limit) {
				const valueCode = html.charCodeAt(pos);
				if (valueCode === CHAR_GREATER_THAN || isHtmlWhitespace(valueCode)) break;
				pos += 1;
			}
			valueEnd = pos;
		}
		if (isTypeAttribute && isApplicationLdJsonMime(html, valueStart, valueEnd)) sawJsonLdType = true;
	}
	return null;
}
|
|
153
|
+
/**
 * Strips wrappers that commonly surround JSON-LD inside a script element.
 *
 * Repeatedly trims HTML whitespace and removes, one per pass and in this
 * priority order: a leading byte-order mark, a leading `<!--`, a leading
 * `<![CDATA[`, a trailing `-->`, a trailing `]]>`, and a trailing `;`.
 *
 * @param content - Raw text between the script tags.
 * @returns The remaining text, or `""` when nothing is left.
 */
function normalizeJsonLdContent(content) {
	let from = 0;
	let to = content.length;
	for (;;) {
		from = skipLeadingHtmlWhitespace(content, from, to);
		to = skipTrailingHtmlWhitespace(content, from, to);
		if (from >= to) return "";
		if (content.charCodeAt(from) === CHAR_BYTE_ORDER_MARK) from += 1;
		else if (content.startsWith("<!--", from)) from += 4;
		else if (content.startsWith("<![CDATA[", from)) from += 9;
		else if (to - from >= 3 && content.endsWith("-->", to)) to -= 3;
		else if (to - from >= 3 && content.endsWith("]]>", to)) to -= 3;
		else if (content.charCodeAt(to - 1) === CHAR_SEMICOLON) to -= 1;
		else break;
	}
	return content.slice(from, to);
}
|
|
199
|
+
/**
 * Locates the first `>` at or after `start`, ignoring quoting.
 *
 * @param html - Markup being scanned.
 * @param start - Index at which the search begins.
 * @returns Index of the `>`, or -1 when there is none.
 */
function findTagEnd(html, start) {
	const closingBracketIndex = html.indexOf(">", start);
	return closingBracketIndex;
}
|
|
202
|
+
/**
 * Locates the first `>` at or after `start` that is not inside a single- or
 * double-quoted run.
 *
 * @param html - Markup being scanned.
 * @param start - Index at which the search begins.
 * @returns Index of the unquoted `>`, or -1 when the input ends first.
 */
function findTagEndRespectingQuotes(html, start) {
	// Code unit of the quote currently open, or 0 when outside quotes.
	let openQuote = 0;
	let index = start;
	while (index < html.length) {
		const ch = html.charCodeAt(index);
		if (openQuote === 0) {
			if (ch === CHAR_GREATER_THAN) return index;
			if (ch === CHAR_DOUBLE_QUOTE || ch === CHAR_APOSTROPHE) openQuote = ch;
		} else if (ch === openQuote) {
			openQuote = 0;
		}
		index += 1;
	}
	return -1;
}
|
|
218
|
+
/**
 * Finds the next `</tagName` at or after `start` (ASCII case-insensitive).
 *
 * @param html - Markup being scanned.
 * @param start - Index at which the search begins.
 * @param tagName - Lower-case tag name to match.
 * @returns Index of the `<` that opens the close tag, or -1 when none exists.
 */
function findNextCloseTag(html, start, tagName) {
	for (let lt = html.indexOf("<", start); lt !== -1; lt = html.indexOf("<", lt + 1)) {
		const afterLt = lt + 1;
		if (afterLt >= html.length) continue;
		if (html.charCodeAt(afterLt) !== CHAR_SLASH) continue;
		if (matchesTagName(html, afterLt + 1, tagName)) return lt;
	}
	return -1;
}
|
|
227
|
+
/**
 * Tests whether `value[start, end)` is the JSON-LD MIME type.
 *
 * Surrounding HTML whitespace is ignored and the match is ASCII
 * case-insensitive. The MIME type may stand alone or be followed, optionally
 * after whitespace, by `;` (for example a parameter list).
 *
 * @param value - String holding the attribute value.
 * @param start - Inclusive start index of the value.
 * @param end - Exclusive end index of the value.
 * @returns True when the value names the JSON-LD MIME type.
 */
function isApplicationLdJsonMime(value, start, end) {
	const mimeLength = APPLICATION_LD_JSON_MIME.length;
	const from = skipLeadingHtmlWhitespace(value, start, end);
	const to = skipTrailingHtmlWhitespace(value, from, end);
	const span = to - from;
	if (span < mimeLength) return false;
	if (!matchesAsciiLiteral(value, from, APPLICATION_LD_JSON_MIME)) return false;
	if (span === mimeLength) return true;
	let index = from + mimeLength;
	const following = value.charCodeAt(index);
	if (following === CHAR_SEMICOLON) return true;
	if (!isHtmlWhitespace(following)) return false;
	// After whitespace, the first non-whitespace character must be ";".
	while (index < to) {
		const code = value.charCodeAt(index);
		if (!isHtmlWhitespace(code)) return code === CHAR_SEMICOLON;
		index += 1;
	}
	return true;
}
|
|
243
|
+
/**
 * Tests whether a complete tag name begins at `start`.
 *
 * The name must match ASCII case-insensitively and be followed by `>`, `/`,
 * HTML whitespace, or the end of the input.
 *
 * @param source - Markup being scanned.
 * @param start - Index of the first character of the candidate name.
 * @param expectedTagName - Lower-case tag name to match.
 * @returns True when the tag name matches and ends at a boundary.
 */
function matchesTagName(source, start, expectedTagName) {
	if (!matchesAsciiLiteral(source, start, expectedTagName)) return false;
	const after = start + expectedTagName.length;
	if (after >= source.length) return true;
	const boundary = source.charCodeAt(after);
	switch (boundary) {
		case CHAR_GREATER_THAN:
		case CHAR_SLASH:
			return true;
		default:
			return isHtmlWhitespace(boundary);
	}
}
|
|
250
|
+
/**
 * Compares `source` at `start` with `expected`, folding A-Z in `source` to
 * lower case first. `expected` is compared as given.
 *
 * @param source - String being scanned.
 * @param start - Index in `source` where the comparison begins.
 * @param expected - Literal to match.
 * @returns True when every character matches and `source` is long enough.
 */
function matchesAsciiLiteral(source, start, expected) {
	const length = expected.length;
	if (source.length < start + length) return false;
	let offset = 0;
	while (offset < length) {
		const folded = toLowerAsciiCode(source.charCodeAt(start + offset));
		if (folded !== expected.charCodeAt(offset)) return false;
		offset += 1;
	}
	return true;
}
|
|
255
|
+
/**
 * Tests whether `!--` begins at `start`, i.e. the part of `<!--` after `<`.
 *
 * @param source - Markup being scanned.
 * @param start - Index just after a `<`.
 * @returns True when the three characters at `start` are `!--`.
 */
function isHtmlCommentStart(source, start) {
	if (start + 2 >= source.length) return false;
	if (source.charCodeAt(start) !== CHAR_EXCLAMATION_MARK) return false;
	// 45 is the code unit of "-".
	const hyphen = 45;
	return source.charCodeAt(start + 1) === hyphen && source.charCodeAt(start + 2) === hyphen;
}
|
|
258
|
+
/**
 * Tests whether a code unit is space, tab, line feed, carriage return or
 * form feed.
 *
 * @param code - UTF-16 code unit.
 * @returns True for the five whitespace code units listed above.
 */
function isHtmlWhitespace(code) {
	switch (code) {
		case CHAR_SPACE:
		case CHAR_TAB:
		case CHAR_LINE_FEED:
		case CHAR_CARRIAGE_RETURN:
		case CHAR_FORM_FEED:
			return true;
		default:
			return false;
	}
}
|
|
261
|
+
/**
 * Folds an upper-case ASCII letter code (65-90) to lower case.
 *
 * @param code - UTF-16 code unit.
 * @returns `code + 32` for A-Z; every other value is returned unchanged.
 */
function toLowerAsciiCode(code) {
	const isUpperCaseLetter = code >= 65 && code <= 90;
	if (isUpperCaseLetter) return code + 32;
	return code;
}
|
|
264
|
+
/**
 * Decides whether the `<` at `candidateStart` lies inside an earlier,
 * still-open tag (for example within a quoted attribute value).
 *
 * Scans backwards to the nearest `<` or `>`. If a `<` that starts a tag comes
 * first, scans forward from it, respecting quotes, to check that no unquoted
 * `>` closes that tag before the candidate.
 *
 * @param html - Markup being scanned.
 * @param candidateStart - Index of the `<` under consideration.
 * @returns True when the candidate sits inside another tag.
 */
function isInsideOpenTag(html, candidateStart) {
	let previousLt = -1;
	for (let back = candidateStart - 1; back >= 0; back -= 1) {
		const code = html.charCodeAt(back);
		if (code === CHAR_GREATER_THAN) return false;
		// 60 is the code unit of "<".
		if (code === 60) {
			previousLt = back;
			break;
		}
	}
	if (previousLt === -1) return false;
	const firstInside = previousLt + 1;
	if (firstInside >= html.length) return false;
	// A "<" not followed by a tag-start character is plain text.
	if (!isTagStartChar(html.charCodeAt(firstInside))) return false;
	let openQuote = 0;
	for (let scan = firstInside; scan < candidateStart; scan += 1) {
		const code = html.charCodeAt(scan);
		if (openQuote === 0) {
			if (code === CHAR_GREATER_THAN) return false;
			if (code === CHAR_DOUBLE_QUOTE || code === CHAR_APOSTROPHE) openQuote = code;
		} else if (code === openQuote) {
			openQuote = 0;
		}
	}
	return true;
}
|
|
288
|
+
/**
 * Looks up the skipped container, if any, whose tag name begins at `start`.
 *
 * @param html - Markup being scanned.
 * @param start - Index of the first character after `<`.
 * @param firstTagChar - Lower-cased code unit at `start`, used as a cheap
 * pre-filter before the full name comparison.
 * @returns The matching TEXT_LITERAL_CONTAINERS entry, or `null`.
 */
function getSkippedTextContainer(html, start, firstTagChar) {
	const match = TEXT_LITERAL_CONTAINERS.find((candidate) => candidate.tagName.charCodeAt(0) === firstTagChar && matchesTagName(html, start, candidate.tagName));
	return match === undefined ? null : match;
}
|
|
295
|
+
/**
 * Advances past HTML whitespace at the front of `source[start, end)`.
 *
 * @param source - String being scanned.
 * @param start - Inclusive start index.
 * @param end - Exclusive end index.
 * @returns Index of the first non-whitespace character, or `end` when the
 * whole range is whitespace (`start` when the range is empty).
 */
function skipLeadingHtmlWhitespace(source, start, end) {
	let index = start;
	for (; index < end; index += 1) {
		if (!isHtmlWhitespace(source.charCodeAt(index))) break;
	}
	return index;
}
|
|
299
|
+
/**
 * Retreats past HTML whitespace at the back of `source[start, end)`.
 *
 * @param source - String being scanned.
 * @param start - Inclusive start index; the result never drops below it
 * unless `end` already does.
 * @param end - Exclusive end index.
 * @returns The exclusive end index with trailing whitespace removed.
 */
function skipTrailingHtmlWhitespace(source, start, end) {
	let boundary = end;
	for (; boundary > start; boundary -= 1) {
		if (!isHtmlWhitespace(source.charCodeAt(boundary - 1))) break;
	}
	return boundary;
}
|
|
303
|
+
/**
 * Tests whether a code unit is `!`, `/`, `?` or an ASCII letter, the
 * characters this scanner accepts directly after `<` as starting a tag.
 *
 * @param code - UTF-16 code unit found directly after `<`.
 * @returns True when the code unit is one of the characters listed above.
 */
function isTagStartChar(code) {
	// 63 is the code unit of "?".
	if (code === CHAR_EXCLAMATION_MARK || code === CHAR_SLASH || code === 63) return true;
	const isUpperCaseLetter = code >= 65 && code <= 90;
	const isLowerCaseLetter = code >= 97 && code <= 122;
	return isUpperCaseLetter || isLowerCaseLetter;
}
|
|
306
|
+
//#endregion
|
|
307
|
+
//#region src/parse.ts
|
|
308
|
+
/**
 * Extracts every JSON-LD script block from `html` and parses it.
 *
 * @param html - Markup to scan.
 * @returns The `JSON.parse` result of each block, in document order. Blocks
 * that fail to parse are left out.
 */
function extractJsonLd(html) {
	const parsed = [];
	extractJsonLdStrings(html).forEach((rawEntry) => {
		try {
			parsed.push(JSON.parse(rawEntry));
		} catch {
			// Deliberately best-effort: a malformed block is skipped.
		}
	});
	return parsed;
}
|
|
316
|
+
//#endregion
|
|
317
|
+
export { extractJsonLd, extractJsonLdStrings };
|
package/package.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "html-json-extractor",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"description": "Fast, forgiving extraction of application/ld+json script blocks from HTML strings.",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"author": "VastBlast",
|
|
7
|
+
"type": "module",
|
|
8
|
+
"sideEffects": false,
|
|
9
|
+
"files": [
|
|
10
|
+
"dist"
|
|
11
|
+
],
|
|
12
|
+
"keywords": [
|
|
13
|
+
"html",
|
|
14
|
+
"json-ld",
|
|
15
|
+
"ld+json",
|
|
16
|
+
"schema",
|
|
17
|
+
"schema.org",
|
|
18
|
+
"structured-data",
|
|
19
|
+
"extractor",
|
|
20
|
+
"typescript"
|
|
21
|
+
],
|
|
22
|
+
"scripts": {
|
|
23
|
+
"build": "tsdown",
|
|
24
|
+
"build:watch": "tsdown --watch",
|
|
25
|
+
"test": "vitest run",
|
|
26
|
+
"bench": "node --disable-warning=ExperimentalWarning --experimental-strip-types bench/extract-json-ld.ts",
|
|
27
|
+
"check:types": "tsgo --noEmit",
|
|
28
|
+
"lint": "oxlint --type-aware",
|
|
29
|
+
"lint:fix": "oxlint --fix --type-aware",
|
|
30
|
+
"check": "npm run lint && npm run check:types",
|
|
31
|
+
"prepack": "npm run build",
|
|
32
|
+
"prepublishOnly": "npm run check && npm test"
|
|
33
|
+
},
|
|
34
|
+
"devDependencies": {
|
|
35
|
+
"@types/node": "^25.3.5",
|
|
36
|
+
"@typescript/native-preview": "^7.0.0-dev.20260308.1",
|
|
37
|
+
"oxlint": "^1.51.0",
|
|
38
|
+
"oxlint-tsgolint": "^0.16.0",
|
|
39
|
+
"tsdown": "^0.21.0",
|
|
40
|
+
"typescript": "^5.9.3",
|
|
41
|
+
"vitest": "^4.0.18"
|
|
42
|
+
},
|
|
43
|
+
"main": "./dist/index.cjs",
|
|
44
|
+
"module": "./dist/index.js",
|
|
45
|
+
"types": "./dist/index.d.ts",
|
|
46
|
+
"exports": {
|
|
47
|
+
".": {
|
|
48
|
+
"types": "./dist/index.d.ts",
|
|
49
|
+
"import": "./dist/index.js",
|
|
50
|
+
"require": "./dist/index.cjs"
|
|
51
|
+
},
|
|
52
|
+
"./package.json": "./package.json"
|
|
53
|
+
}
|
|
54
|
+
}
|