html-json-extractor 0.0.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -11
- package/dist/index.cjs +45 -17
- package/dist/index.d.cts +7 -3
- package/dist/index.d.ts +7 -3
- package/dist/index.js +42 -16
- package/package.json +13 -4
package/README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# html-json-extractor
|
|
2
2
|
|
|
3
|
-
Fast, forgiving extraction of `<script type="application/ld+json">` blocks from an HTML string.
|
|
3
|
+
Fast, forgiving extraction of `<script type="application/json">` and `<script type="application/ld+json">` blocks from an HTML string.
|
|
4
4
|
|
|
5
5
|
- No DOM parser or runtime dependencies
|
|
6
6
|
- Returns one result per matching script block
|
|
7
|
-
- Malformed JSON
|
|
7
|
+
- A malformed JSON block does not prevent the other blocks from being extracted
|
|
8
8
|
|
|
9
9
|
## Install
|
|
10
10
|
|
|
@@ -15,31 +15,53 @@ npm install html-json-extractor
|
|
|
15
15
|
## Usage
|
|
16
16
|
|
|
17
17
|
```ts
|
|
18
|
-
import {
|
|
18
|
+
import {
|
|
19
|
+
extractJson,
|
|
20
|
+
extractJsonStrings,
|
|
21
|
+
getJsonLdItems,
|
|
22
|
+
getJsonLdRecords
|
|
23
|
+
} from 'html-json-extractor';
|
|
19
24
|
|
|
20
25
|
const html = `
|
|
26
|
+
<script type="application/json">{"featureFlags":{"search":true}}</script>
|
|
21
27
|
<script type="application/ld+json">{"@type":"WebSite","name":"Example"}</script>
|
|
22
|
-
<script type="application/ld+json">{"broken":</script>
|
|
28
|
+
<script type="application/json">{"broken":</script>
|
|
23
29
|
<script type="application/ld+json">[{"@type":"Person","name":"Ada"}]</script>
|
|
24
30
|
`;
|
|
25
31
|
|
|
26
|
-
const raw = extractJsonLdStrings(html);
|
|
27
|
-
// ['{"@type":"WebSite","name":"Example"}', '{"broken":', '[{"@type":"Person","name":"Ada"}]']
|
|
32
|
+
const raw = extractJsonStrings(html);
|
|
33
|
+
// ['{"featureFlags":{"search":true}}', '{"@type":"WebSite","name":"Example"}', '{"broken":', '[{"@type":"Person","name":"Ada"}]']
|
|
28
34
|
|
|
29
|
-
const parsed = extractJsonLd(html);
|
|
30
|
-
// [{ '@type': 'WebSite', name: 'Example' }, [{ '@type': 'Person', name: 'Ada' }]]
|
|
35
|
+
const parsed = extractJson(html);
|
|
36
|
+
// [{ featureFlags: { search: true } }, { '@type': 'WebSite', name: 'Example' }, [{ '@type': 'Person', name: 'Ada' }]]
|
|
37
|
+
|
|
38
|
+
const items = parsed.flatMap(getJsonLdItems);
|
|
39
|
+
// [{ featureFlags: { search: true } }, { '@type': 'WebSite', name: 'Example' }, { '@type': 'Person', name: 'Ada' }]
|
|
40
|
+
|
|
41
|
+
const records = parsed.flatMap(getJsonLdRecords);
|
|
42
|
+
// [{ featureFlags: { search: true } }, { '@type': 'WebSite', name: 'Example' }, { '@type': 'Person', name: 'Ada' }]
|
|
31
43
|
```
|
|
32
44
|
|
|
33
45
|
## API
|
|
34
46
|
|
|
35
|
-
### `
|
|
47
|
+
### `extractJsonStrings(html: string): string[]`
|
|
36
48
|
|
|
37
|
-
Returns normalized
|
|
49
|
+
Returns normalized `application/json` and `application/ld+json` script contents as strings.
|
|
38
50
|
|
|
39
|
-
### `
|
|
51
|
+
### `extractJson<T extends JsonValue = JsonValue>(html: string): T[]`
|
|
40
52
|
|
|
41
53
|
Parses the extracted strings with `JSON.parse`. Entries that fail to parse are skipped.
|
|
42
54
|
|
|
55
|
+
### `getJsonLdItems(value: unknown): unknown[]`
|
|
56
|
+
|
|
57
|
+
Normalizes a single parsed entry into a list of items: an array entry is returned as-is, any other value is wrapped in a one-element array, and `undefined` yields an empty list.
|
|
58
|
+
|
|
59
|
+
### `getJsonLdRecords(value: unknown): Record<string, unknown>[]`
|
|
60
|
+
|
|
61
|
+
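Returns the record-shaped items (plain objects, not arrays or `null`) found in the value, descending into arrays and into each record's nested `@graph` content. Non-record values are dropped.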
|
|
62
|
+
|
|
63
|
+
`getJsonLdItems` and `getJsonLdRecords` remain JSON-LD-oriented helpers for structured-data use cases on top of the generic extractor.
|
|
64
|
+
|
|
43
65
|
## License
|
|
44
66
|
|
|
45
67
|
MIT
|
package/dist/index.cjs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });
|
|
2
2
|
//#region src/scan.ts
|
|
3
3
|
const SCRIPT_TAG_NAME = "script";
|
|
4
|
-
const
|
|
4
|
+
const SUPPORTED_JSON_SCRIPT_MIME_TYPES = ["application/ld+json", "application/json"];
|
|
5
5
|
const CHAR_TAB = 9;
|
|
6
6
|
const CHAR_LINE_FEED = 10;
|
|
7
7
|
const CHAR_FORM_FEED = 12;
|
|
@@ -45,7 +45,7 @@ const TEXT_LITERAL_CONTAINERS = [
|
|
|
45
45
|
isTerminal: true
|
|
46
46
|
}
|
|
47
47
|
];
|
|
48
|
-
function extractJsonLdStrings(html) {
|
|
48
|
+
function extractJsonStrings(html) {
|
|
49
49
|
if (html.length === 0) return [];
|
|
50
50
|
const results = [];
|
|
51
51
|
let cursor = 0;
|
|
@@ -57,8 +57,8 @@ function extractJsonLdStrings(html) {
|
|
|
57
57
|
const contentStart = openTag.tagEnd + 1;
|
|
58
58
|
const closeTagStart = findNextCloseTag(html, contentStart, SCRIPT_TAG_NAME);
|
|
59
59
|
const contentEnd = closeTagStart === -1 ? html.length : closeTagStart;
|
|
60
|
-
if (openTag.
|
|
61
|
-
const content =
|
|
60
|
+
if (openTag.hasSupportedMime) {
|
|
61
|
+
const content = normalizeJsonScriptContent(html.slice(contentStart, contentEnd));
|
|
62
62
|
if (content.length > 0) results.push(content);
|
|
63
63
|
}
|
|
64
64
|
if (closeTagStart === -1) break;
|
|
@@ -105,13 +105,13 @@ function findNextScriptOpenTag(html, start) {
|
|
|
105
105
|
}
|
|
106
106
|
function parseScriptOpenTag(html, openTagStart) {
|
|
107
107
|
let cursor = openTagStart + 1 + 6;
|
|
108
|
-
let
|
|
108
|
+
let hasSupportedMime = false;
|
|
109
109
|
while (cursor < html.length) {
|
|
110
110
|
cursor = skipLeadingHtmlWhitespace(html, cursor, html.length);
|
|
111
111
|
if (cursor >= html.length) break;
|
|
112
112
|
const code = html.charCodeAt(cursor);
|
|
113
113
|
if (code === CHAR_GREATER_THAN) return {
|
|
114
|
-
|
|
114
|
+
hasSupportedMime,
|
|
115
115
|
tagEnd: cursor
|
|
116
116
|
};
|
|
117
117
|
if (isHtmlWhitespace(code) || code === CHAR_SLASH) {
|
|
@@ -147,11 +147,11 @@ function parseScriptOpenTag(html, openTagStart) {
|
|
|
147
147
|
}
|
|
148
148
|
valueEnd = cursor;
|
|
149
149
|
}
|
|
150
|
-
if (isTypeAttribute &&
|
|
150
|
+
if (isTypeAttribute && isSupportedJsonScriptMime(html, valueStart, valueEnd)) hasSupportedMime = true;
|
|
151
151
|
}
|
|
152
152
|
return null;
|
|
153
153
|
}
|
|
154
|
-
function
|
|
154
|
+
function normalizeJsonScriptContent(content) {
|
|
155
155
|
let start = 0;
|
|
156
156
|
let end = content.length;
|
|
157
157
|
let changed = false;
|
|
@@ -225,13 +225,17 @@ function findNextCloseTag(html, start, tagName) {
|
|
|
225
225
|
}
|
|
226
226
|
return -1;
|
|
227
227
|
}
|
|
228
|
-
function
|
|
228
|
+
function isSupportedJsonScriptMime(value, start, end) {
|
|
229
229
|
start = skipLeadingHtmlWhitespace(value, start, end);
|
|
230
230
|
end = skipTrailingHtmlWhitespace(value, start, end);
|
|
231
|
-
if (
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
231
|
+
for (const mime of SUPPORTED_JSON_SCRIPT_MIME_TYPES) if (matchesMimeType(value, start, end, mime)) return true;
|
|
232
|
+
return false;
|
|
233
|
+
}
|
|
234
|
+
function matchesMimeType(value, start, end, expectedMime) {
|
|
235
|
+
if (end - start < expectedMime.length) return false;
|
|
236
|
+
if (!matchesAsciiLiteral(value, start, expectedMime)) return false;
|
|
237
|
+
if (end - start === expectedMime.length) return true;
|
|
238
|
+
const suffixStart = start + expectedMime.length;
|
|
235
239
|
const nextCode = value.charCodeAt(suffixStart);
|
|
236
240
|
if (nextCode === CHAR_SEMICOLON) return true;
|
|
237
241
|
if (!isHtmlWhitespace(nextCode)) return false;
|
|
@@ -306,8 +310,8 @@ function isTagStartChar(code) {
|
|
|
306
310
|
}
|
|
307
311
|
//#endregion
|
|
308
312
|
//#region src/parse.ts
|
|
309
|
-
function extractJsonLd(html) {
|
|
310
|
-
const rawEntries = extractJsonLdStrings(html);
|
|
313
|
+
function extractJson(html) {
|
|
314
|
+
const rawEntries = extractJsonStrings(html);
|
|
311
315
|
const results = [];
|
|
312
316
|
for (const rawEntry of rawEntries) try {
|
|
313
317
|
results.push(JSON.parse(rawEntry));
|
|
@@ -315,5 +319,29 @@ function extractJsonLd(html) {
|
|
|
315
319
|
return results;
|
|
316
320
|
}
|
|
317
321
|
//#endregion
|
|
318
|
-
|
|
319
|
-
|
|
322
|
+
//#region src/helpers.ts
|
|
323
|
+
const isRecord = (value) => {
|
|
324
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
325
|
+
};
|
|
326
|
+
const asRecord = (value) => {
|
|
327
|
+
return isRecord(value) ? value : void 0;
|
|
328
|
+
};
|
|
329
|
+
const asArray = (value) => {
|
|
330
|
+
return Array.isArray(value) ? value : void 0;
|
|
331
|
+
};
|
|
332
|
+
function getJsonLdRecords(value) {
|
|
333
|
+
const items = asArray(value);
|
|
334
|
+
if (items) return items.flatMap(getJsonLdRecords);
|
|
335
|
+
const record = asRecord(value);
|
|
336
|
+
if (!record) return [];
|
|
337
|
+
return [record, ...getJsonLdRecords(record["@graph"])];
|
|
338
|
+
}
|
|
339
|
+
function getJsonLdItems(value) {
|
|
340
|
+
if (value === void 0) return [];
|
|
341
|
+
return asArray(value) ?? [value];
|
|
342
|
+
}
|
|
343
|
+
//#endregion
|
|
344
|
+
exports.extractJson = extractJson;
|
|
345
|
+
exports.extractJsonStrings = extractJsonStrings;
|
|
346
|
+
exports.getJsonLdItems = getJsonLdItems;
|
|
347
|
+
exports.getJsonLdRecords = getJsonLdRecords;
|
package/dist/index.d.cts
CHANGED
|
@@ -7,9 +7,13 @@ type JsonArray = JsonValue[];
|
|
|
7
7
|
type JsonValue = JsonArray | JsonObject | JsonPrimitive;
|
|
8
8
|
//#endregion
|
|
9
9
|
//#region src/parse.d.ts
|
|
10
|
-
declare function
|
|
10
|
+
declare function extractJson<T extends JsonValue = JsonValue>(html: string): T[];
|
|
11
11
|
//#endregion
|
|
12
12
|
//#region src/scan.d.ts
|
|
13
|
-
declare function extractJsonLdStrings(html: string): string[];
|
|
13
|
+
declare function extractJsonStrings(html: string): string[];
|
|
14
14
|
//#endregion
|
|
15
|
-
|
|
15
|
+
//#region src/helpers.d.ts
|
|
16
|
+
declare function getJsonLdRecords(value: unknown): Record<string, unknown>[];
|
|
17
|
+
declare function getJsonLdItems(value: unknown): unknown[];
|
|
18
|
+
//#endregion
|
|
19
|
+
export { type JsonArray, type JsonObject, type JsonPrimitive, type JsonValue, extractJson, extractJsonStrings, getJsonLdItems, getJsonLdRecords };
|
package/dist/index.d.ts
CHANGED
|
@@ -7,9 +7,13 @@ type JsonArray = JsonValue[];
|
|
|
7
7
|
type JsonValue = JsonArray | JsonObject | JsonPrimitive;
|
|
8
8
|
//#endregion
|
|
9
9
|
//#region src/parse.d.ts
|
|
10
|
-
declare function
|
|
10
|
+
declare function extractJson<T extends JsonValue = JsonValue>(html: string): T[];
|
|
11
11
|
//#endregion
|
|
12
12
|
//#region src/scan.d.ts
|
|
13
|
-
declare function extractJsonLdStrings(html: string): string[];
|
|
13
|
+
declare function extractJsonStrings(html: string): string[];
|
|
14
14
|
//#endregion
|
|
15
|
-
|
|
15
|
+
//#region src/helpers.d.ts
|
|
16
|
+
declare function getJsonLdRecords(value: unknown): Record<string, unknown>[];
|
|
17
|
+
declare function getJsonLdItems(value: unknown): unknown[];
|
|
18
|
+
//#endregion
|
|
19
|
+
export { type JsonArray, type JsonObject, type JsonPrimitive, type JsonValue, extractJson, extractJsonStrings, getJsonLdItems, getJsonLdRecords };
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
//#region src/scan.ts
|
|
2
2
|
const SCRIPT_TAG_NAME = "script";
|
|
3
|
-
const
|
|
3
|
+
const SUPPORTED_JSON_SCRIPT_MIME_TYPES = ["application/ld+json", "application/json"];
|
|
4
4
|
const CHAR_TAB = 9;
|
|
5
5
|
const CHAR_LINE_FEED = 10;
|
|
6
6
|
const CHAR_FORM_FEED = 12;
|
|
@@ -44,7 +44,7 @@ const TEXT_LITERAL_CONTAINERS = [
|
|
|
44
44
|
isTerminal: true
|
|
45
45
|
}
|
|
46
46
|
];
|
|
47
|
-
function extractJsonLdStrings(html) {
|
|
47
|
+
function extractJsonStrings(html) {
|
|
48
48
|
if (html.length === 0) return [];
|
|
49
49
|
const results = [];
|
|
50
50
|
let cursor = 0;
|
|
@@ -56,8 +56,8 @@ function extractJsonLdStrings(html) {
|
|
|
56
56
|
const contentStart = openTag.tagEnd + 1;
|
|
57
57
|
const closeTagStart = findNextCloseTag(html, contentStart, SCRIPT_TAG_NAME);
|
|
58
58
|
const contentEnd = closeTagStart === -1 ? html.length : closeTagStart;
|
|
59
|
-
if (openTag.
|
|
60
|
-
const content =
|
|
59
|
+
if (openTag.hasSupportedMime) {
|
|
60
|
+
const content = normalizeJsonScriptContent(html.slice(contentStart, contentEnd));
|
|
61
61
|
if (content.length > 0) results.push(content);
|
|
62
62
|
}
|
|
63
63
|
if (closeTagStart === -1) break;
|
|
@@ -104,13 +104,13 @@ function findNextScriptOpenTag(html, start) {
|
|
|
104
104
|
}
|
|
105
105
|
function parseScriptOpenTag(html, openTagStart) {
|
|
106
106
|
let cursor = openTagStart + 1 + 6;
|
|
107
|
-
let
|
|
107
|
+
let hasSupportedMime = false;
|
|
108
108
|
while (cursor < html.length) {
|
|
109
109
|
cursor = skipLeadingHtmlWhitespace(html, cursor, html.length);
|
|
110
110
|
if (cursor >= html.length) break;
|
|
111
111
|
const code = html.charCodeAt(cursor);
|
|
112
112
|
if (code === CHAR_GREATER_THAN) return {
|
|
113
|
-
|
|
113
|
+
hasSupportedMime,
|
|
114
114
|
tagEnd: cursor
|
|
115
115
|
};
|
|
116
116
|
if (isHtmlWhitespace(code) || code === CHAR_SLASH) {
|
|
@@ -146,11 +146,11 @@ function parseScriptOpenTag(html, openTagStart) {
|
|
|
146
146
|
}
|
|
147
147
|
valueEnd = cursor;
|
|
148
148
|
}
|
|
149
|
-
if (isTypeAttribute &&
|
|
149
|
+
if (isTypeAttribute && isSupportedJsonScriptMime(html, valueStart, valueEnd)) hasSupportedMime = true;
|
|
150
150
|
}
|
|
151
151
|
return null;
|
|
152
152
|
}
|
|
153
|
-
function
|
|
153
|
+
function normalizeJsonScriptContent(content) {
|
|
154
154
|
let start = 0;
|
|
155
155
|
let end = content.length;
|
|
156
156
|
let changed = false;
|
|
@@ -224,13 +224,17 @@ function findNextCloseTag(html, start, tagName) {
|
|
|
224
224
|
}
|
|
225
225
|
return -1;
|
|
226
226
|
}
|
|
227
|
-
function
|
|
227
|
+
function isSupportedJsonScriptMime(value, start, end) {
|
|
228
228
|
start = skipLeadingHtmlWhitespace(value, start, end);
|
|
229
229
|
end = skipTrailingHtmlWhitespace(value, start, end);
|
|
230
|
-
if (
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
230
|
+
for (const mime of SUPPORTED_JSON_SCRIPT_MIME_TYPES) if (matchesMimeType(value, start, end, mime)) return true;
|
|
231
|
+
return false;
|
|
232
|
+
}
|
|
233
|
+
function matchesMimeType(value, start, end, expectedMime) {
|
|
234
|
+
if (end - start < expectedMime.length) return false;
|
|
235
|
+
if (!matchesAsciiLiteral(value, start, expectedMime)) return false;
|
|
236
|
+
if (end - start === expectedMime.length) return true;
|
|
237
|
+
const suffixStart = start + expectedMime.length;
|
|
234
238
|
const nextCode = value.charCodeAt(suffixStart);
|
|
235
239
|
if (nextCode === CHAR_SEMICOLON) return true;
|
|
236
240
|
if (!isHtmlWhitespace(nextCode)) return false;
|
|
@@ -305,8 +309,8 @@ function isTagStartChar(code) {
|
|
|
305
309
|
}
|
|
306
310
|
//#endregion
|
|
307
311
|
//#region src/parse.ts
|
|
308
|
-
function extractJsonLd(html) {
|
|
309
|
-
const rawEntries = extractJsonLdStrings(html);
|
|
312
|
+
function extractJson(html) {
|
|
313
|
+
const rawEntries = extractJsonStrings(html);
|
|
310
314
|
const results = [];
|
|
311
315
|
for (const rawEntry of rawEntries) try {
|
|
312
316
|
results.push(JSON.parse(rawEntry));
|
|
@@ -314,4 +318,26 @@ function extractJsonLd(html) {
|
|
|
314
318
|
return results;
|
|
315
319
|
}
|
|
316
320
|
//#endregion
|
|
317
|
-
|
|
321
|
+
//#region src/helpers.ts
|
|
322
|
+
const isRecord = (value) => {
|
|
323
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
324
|
+
};
|
|
325
|
+
const asRecord = (value) => {
|
|
326
|
+
return isRecord(value) ? value : void 0;
|
|
327
|
+
};
|
|
328
|
+
const asArray = (value) => {
|
|
329
|
+
return Array.isArray(value) ? value : void 0;
|
|
330
|
+
};
|
|
331
|
+
function getJsonLdRecords(value) {
|
|
332
|
+
const items = asArray(value);
|
|
333
|
+
if (items) return items.flatMap(getJsonLdRecords);
|
|
334
|
+
const record = asRecord(value);
|
|
335
|
+
if (!record) return [];
|
|
336
|
+
return [record, ...getJsonLdRecords(record["@graph"])];
|
|
337
|
+
}
|
|
338
|
+
function getJsonLdItems(value) {
|
|
339
|
+
if (value === void 0) return [];
|
|
340
|
+
return asArray(value) ?? [value];
|
|
341
|
+
}
|
|
342
|
+
//#endregion
|
|
343
|
+
export { extractJson, extractJsonStrings, getJsonLdItems, getJsonLdRecords };
|
package/package.json
CHANGED
|
@@ -1,9 +1,17 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "html-json-extractor",
|
|
3
|
-
"version": "0.0.1",
|
|
4
|
-
"description": "Fast, forgiving extraction of application/ld+json script blocks from HTML strings.",
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"description": "Fast, forgiving extraction of application/json and application/ld+json script blocks from HTML strings.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "VastBlast",
|
|
7
|
+
"repository": {
|
|
8
|
+
"type": "git",
|
|
9
|
+
"url": "git+https://github.com/VastBlast/html-json-extractor.git"
|
|
10
|
+
},
|
|
11
|
+
"bugs": {
|
|
12
|
+
"url": "https://github.com/VastBlast/html-json-extractor/issues"
|
|
13
|
+
},
|
|
14
|
+
"homepage": "https://github.com/VastBlast/html-json-extractor#readme",
|
|
7
15
|
"type": "module",
|
|
8
16
|
"sideEffects": false,
|
|
9
17
|
"files": [
|
|
@@ -11,6 +19,8 @@
|
|
|
11
19
|
],
|
|
12
20
|
"keywords": [
|
|
13
21
|
"html",
|
|
22
|
+
"json",
|
|
23
|
+
"application/json",
|
|
14
24
|
"json-ld",
|
|
15
25
|
"ld+json",
|
|
16
26
|
"schema",
|
|
@@ -42,10 +52,9 @@
|
|
|
42
52
|
},
|
|
43
53
|
"main": "./dist/index.cjs",
|
|
44
54
|
"module": "./dist/index.js",
|
|
45
|
-
"types": "./dist/index.d.ts",
|
|
55
|
+
"types": "./dist/index.d.cts",
|
|
46
56
|
"exports": {
|
|
47
57
|
".": {
|
|
48
|
-
"types": "./dist/index.d.ts",
|
|
49
58
|
"import": "./dist/index.js",
|
|
50
59
|
"require": "./dist/index.cjs"
|
|
51
60
|
},
|