html-json-extractor 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,10 +1,10 @@
1
1
  # html-json-extractor
2
2
 
3
- Fast, forgiving extraction of `<script type="application/ld+json">` blocks from an HTML string.
3
+ Fast, forgiving extraction of `<script type="application/json">` and `<script type="application/ld+json">` blocks from an HTML string.
4
4
 
5
5
  - No DOM parser or runtime dependencies
6
6
  - Returns one result per matching script block
7
- - Malformed JSON-LD blocks do not break the rest
7
+ - Malformed JSON blocks do not break the rest
8
8
 
9
9
  ## Install
10
10
 
@@ -15,31 +15,53 @@ npm install html-json-extractor
15
15
  ## Usage
16
16
 
17
17
  ```ts
18
- import { extractJsonLd, extractJsonLdStrings } from 'html-json-extractor';
18
+ import {
19
+ extractJson,
20
+ extractJsonStrings,
21
+ getJsonLdItems,
22
+ getJsonLdRecords
23
+ } from 'html-json-extractor';
19
24
 
20
25
  const html = `
26
+ <script type="application/json">{"featureFlags":{"search":true}}</script>
21
27
  <script type="application/ld+json">{"@type":"WebSite","name":"Example"}</script>
22
- <script type="application/ld+json">{"broken":</script>
28
+ <script type="application/json">{"broken":</script>
23
29
  <script type="application/ld+json">[{"@type":"Person","name":"Ada"}]</script>
24
30
  `;
25
31
 
26
- const raw = extractJsonLdStrings(html);
27
- // ['{"@type":"WebSite","name":"Example"}', '{"broken":', '[{"@type":"Person","name":"Ada"}]']
32
+ const raw = extractJsonStrings(html);
33
+ // ['{"featureFlags":{"search":true}}', '{"@type":"WebSite","name":"Example"}', '{"broken":', '[{"@type":"Person","name":"Ada"}]']
28
34
 
29
- const parsed = extractJsonLd(html);
30
- // [{ '@type': 'WebSite', name: 'Example' }, [{ '@type': 'Person', name: 'Ada' }]]
35
+ const parsed = extractJson(html);
36
+ // [{ featureFlags: { search: true } }, { '@type': 'WebSite', name: 'Example' }, [{ '@type': 'Person', name: 'Ada' }]]
37
+
38
+ const items = parsed.flatMap(getJsonLdItems);
39
+ // [{ featureFlags: { search: true } }, { '@type': 'WebSite', name: 'Example' }, { '@type': 'Person', name: 'Ada' }]
40
+
41
+ const records = parsed.flatMap(getJsonLdRecords);
42
+ // [{ featureFlags: { search: true } }, { '@type': 'WebSite', name: 'Example' }, { '@type': 'Person', name: 'Ada' }]
31
43
  ```
32
44
 
33
45
  ## API
34
46
 
35
- ### `extractJsonLdStrings(html: string): string[]`
47
+ ### `extractJsonStrings(html: string): string[]`
36
48
 
37
- Returns normalized JSON-LD script contents as strings.
49
+ Returns normalized `application/json` and `application/ld+json` script contents as strings.
38
50
 
39
- ### `extractJsonLd<T = JsonValue>(html: string): T[]`
51
+ ### `extractJson<T = JsonValue>(html: string): T[]`
40
52
 
41
53
  Parses the extracted strings with `JSON.parse`. Entries that fail to parse are skipped.
42
54
 
55
+ ### `getJsonLdItems(value: unknown): unknown[]`
56
+
57
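+ Normalizes one parsed entry into a list of items: an array entry is returned as-is (nested arrays are not flattened further), any other value is wrapped in a one-element array, and `undefined` yields an empty list.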
58
+
59
+ ### `getJsonLdRecords(value: unknown): Record<string, unknown>[]`
60
+
61
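+ Returns every record-shaped item (a non-null, non-array object), recursing through arrays and nested `@graph` content. A record that carries `@graph` is included alongside the records found inside it; primitives and other non-record values are dropped.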
62
+
63
+ `getJsonLdItems` and `getJsonLdRecords` remain JSON-LD-oriented helpers for structured-data use cases on top of the generic extractor.
64
+
43
65
  ## License
44
66
 
45
67
  MIT
package/dist/index.cjs CHANGED
@@ -1,7 +1,7 @@
1
1
  Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });
2
2
  //#region src/scan.ts
3
3
  const SCRIPT_TAG_NAME = "script";
4
- const APPLICATION_LD_JSON_MIME = "application/ld+json";
4
+ const SUPPORTED_JSON_SCRIPT_MIME_TYPES = ["application/ld+json", "application/json"];
5
5
  const CHAR_TAB = 9;
6
6
  const CHAR_LINE_FEED = 10;
7
7
  const CHAR_FORM_FEED = 12;
@@ -45,7 +45,7 @@ const TEXT_LITERAL_CONTAINERS = [
45
45
  isTerminal: true
46
46
  }
47
47
  ];
48
- function extractJsonLdStrings(html) {
48
+ function extractJsonStrings(html) {
49
49
  if (html.length === 0) return [];
50
50
  const results = [];
51
51
  let cursor = 0;
@@ -57,8 +57,8 @@ function extractJsonLdStrings(html) {
57
57
  const contentStart = openTag.tagEnd + 1;
58
58
  const closeTagStart = findNextCloseTag(html, contentStart, SCRIPT_TAG_NAME);
59
59
  const contentEnd = closeTagStart === -1 ? html.length : closeTagStart;
60
- if (openTag.isJsonLd) {
61
- const content = normalizeJsonLdContent(html.slice(contentStart, contentEnd));
60
+ if (openTag.hasSupportedMime) {
61
+ const content = normalizeJsonScriptContent(html.slice(contentStart, contentEnd));
62
62
  if (content.length > 0) results.push(content);
63
63
  }
64
64
  if (closeTagStart === -1) break;
@@ -105,13 +105,13 @@ function findNextScriptOpenTag(html, start) {
105
105
  }
106
106
  function parseScriptOpenTag(html, openTagStart) {
107
107
  let cursor = openTagStart + 1 + 6;
108
- let isJsonLd = false;
108
+ let hasSupportedMime = false;
109
109
  while (cursor < html.length) {
110
110
  cursor = skipLeadingHtmlWhitespace(html, cursor, html.length);
111
111
  if (cursor >= html.length) break;
112
112
  const code = html.charCodeAt(cursor);
113
113
  if (code === CHAR_GREATER_THAN) return {
114
- isJsonLd,
114
+ hasSupportedMime,
115
115
  tagEnd: cursor
116
116
  };
117
117
  if (isHtmlWhitespace(code) || code === CHAR_SLASH) {
@@ -147,11 +147,11 @@ function parseScriptOpenTag(html, openTagStart) {
147
147
  }
148
148
  valueEnd = cursor;
149
149
  }
150
- if (isTypeAttribute && isApplicationLdJsonMime(html, valueStart, valueEnd)) isJsonLd = true;
150
+ if (isTypeAttribute && isSupportedJsonScriptMime(html, valueStart, valueEnd)) hasSupportedMime = true;
151
151
  }
152
152
  return null;
153
153
  }
154
- function normalizeJsonLdContent(content) {
154
+ function normalizeJsonScriptContent(content) {
155
155
  let start = 0;
156
156
  let end = content.length;
157
157
  let changed = false;
@@ -225,13 +225,17 @@ function findNextCloseTag(html, start, tagName) {
225
225
  }
226
226
  return -1;
227
227
  }
228
- function isApplicationLdJsonMime(value, start, end) {
228
+ function isSupportedJsonScriptMime(value, start, end) {
229
229
  start = skipLeadingHtmlWhitespace(value, start, end);
230
230
  end = skipTrailingHtmlWhitespace(value, start, end);
231
- if (end - start < 19) return false;
232
- if (!matchesAsciiLiteral(value, start, APPLICATION_LD_JSON_MIME)) return false;
233
- if (end - start === 19) return true;
234
- const suffixStart = start + 19;
231
+ for (const mime of SUPPORTED_JSON_SCRIPT_MIME_TYPES) if (matchesMimeType(value, start, end, mime)) return true;
232
+ return false;
233
+ }
234
+ function matchesMimeType(value, start, end, expectedMime) {
235
+ if (end - start < expectedMime.length) return false;
236
+ if (!matchesAsciiLiteral(value, start, expectedMime)) return false;
237
+ if (end - start === expectedMime.length) return true;
238
+ const suffixStart = start + expectedMime.length;
235
239
  const nextCode = value.charCodeAt(suffixStart);
236
240
  if (nextCode === CHAR_SEMICOLON) return true;
237
241
  if (!isHtmlWhitespace(nextCode)) return false;
@@ -306,8 +310,8 @@ function isTagStartChar(code) {
306
310
  }
307
311
  //#endregion
308
312
  //#region src/parse.ts
309
- function extractJsonLd(html) {
310
- const rawEntries = extractJsonLdStrings(html);
313
+ function extractJson(html) {
314
+ const rawEntries = extractJsonStrings(html);
311
315
  const results = [];
312
316
  for (const rawEntry of rawEntries) try {
313
317
  results.push(JSON.parse(rawEntry));
@@ -315,5 +319,29 @@ function extractJsonLd(html) {
315
319
  return results;
316
320
  }
317
321
  //#endregion
318
- exports.extractJsonLd = extractJsonLd;
319
- exports.extractJsonLdStrings = extractJsonLdStrings;
322
+ //#region src/helpers.ts
323
+ const isRecord = (value) => {
324
+ return typeof value === "object" && value !== null && !Array.isArray(value);
325
+ };
326
+ const asRecord = (value) => {
327
+ return isRecord(value) ? value : void 0;
328
+ };
329
+ const asArray = (value) => {
330
+ return Array.isArray(value) ? value : void 0;
331
+ };
332
+ function getJsonLdRecords(value) {
333
+ const items = asArray(value);
334
+ if (items) return items.flatMap(getJsonLdRecords);
335
+ const record = asRecord(value);
336
+ if (!record) return [];
337
+ return [record, ...getJsonLdRecords(record["@graph"])];
338
+ }
339
+ function getJsonLdItems(value) {
340
+ if (value === void 0) return [];
341
+ return asArray(value) ?? [value];
342
+ }
343
+ //#endregion
344
+ exports.extractJson = extractJson;
345
+ exports.extractJsonStrings = extractJsonStrings;
346
+ exports.getJsonLdItems = getJsonLdItems;
347
+ exports.getJsonLdRecords = getJsonLdRecords;
package/dist/index.d.cts CHANGED
@@ -7,9 +7,13 @@ type JsonArray = JsonValue[];
7
7
  type JsonValue = JsonArray | JsonObject | JsonPrimitive;
8
8
  //#endregion
9
9
  //#region src/parse.d.ts
10
- declare function extractJsonLd<T extends JsonValue = JsonValue>(html: string): T[];
10
+ declare function extractJson<T extends JsonValue = JsonValue>(html: string): T[];
11
11
  //#endregion
12
12
  //#region src/scan.d.ts
13
- declare function extractJsonLdStrings(html: string): string[];
13
+ declare function extractJsonStrings(html: string): string[];
14
14
  //#endregion
15
- export { type JsonArray, type JsonObject, type JsonPrimitive, type JsonValue, extractJsonLd, extractJsonLdStrings };
15
+ //#region src/helpers.d.ts
16
+ declare function getJsonLdRecords(value: unknown): Record<string, unknown>[];
17
+ declare function getJsonLdItems(value: unknown): unknown[];
18
+ //#endregion
19
+ export { type JsonArray, type JsonObject, type JsonPrimitive, type JsonValue, extractJson, extractJsonStrings, getJsonLdItems, getJsonLdRecords };
package/dist/index.d.ts CHANGED
@@ -7,9 +7,13 @@ type JsonArray = JsonValue[];
7
7
  type JsonValue = JsonArray | JsonObject | JsonPrimitive;
8
8
  //#endregion
9
9
  //#region src/parse.d.ts
10
- declare function extractJsonLd<T extends JsonValue = JsonValue>(html: string): T[];
10
+ declare function extractJson<T extends JsonValue = JsonValue>(html: string): T[];
11
11
  //#endregion
12
12
  //#region src/scan.d.ts
13
- declare function extractJsonLdStrings(html: string): string[];
13
+ declare function extractJsonStrings(html: string): string[];
14
14
  //#endregion
15
- export { type JsonArray, type JsonObject, type JsonPrimitive, type JsonValue, extractJsonLd, extractJsonLdStrings };
15
+ //#region src/helpers.d.ts
16
+ declare function getJsonLdRecords(value: unknown): Record<string, unknown>[];
17
+ declare function getJsonLdItems(value: unknown): unknown[];
18
+ //#endregion
19
+ export { type JsonArray, type JsonObject, type JsonPrimitive, type JsonValue, extractJson, extractJsonStrings, getJsonLdItems, getJsonLdRecords };
package/dist/index.js CHANGED
@@ -1,6 +1,6 @@
1
1
  //#region src/scan.ts
2
2
  const SCRIPT_TAG_NAME = "script";
3
- const APPLICATION_LD_JSON_MIME = "application/ld+json";
3
+ const SUPPORTED_JSON_SCRIPT_MIME_TYPES = ["application/ld+json", "application/json"];
4
4
  const CHAR_TAB = 9;
5
5
  const CHAR_LINE_FEED = 10;
6
6
  const CHAR_FORM_FEED = 12;
@@ -44,7 +44,7 @@ const TEXT_LITERAL_CONTAINERS = [
44
44
  isTerminal: true
45
45
  }
46
46
  ];
47
- function extractJsonLdStrings(html) {
47
+ function extractJsonStrings(html) {
48
48
  if (html.length === 0) return [];
49
49
  const results = [];
50
50
  let cursor = 0;
@@ -56,8 +56,8 @@ function extractJsonLdStrings(html) {
56
56
  const contentStart = openTag.tagEnd + 1;
57
57
  const closeTagStart = findNextCloseTag(html, contentStart, SCRIPT_TAG_NAME);
58
58
  const contentEnd = closeTagStart === -1 ? html.length : closeTagStart;
59
- if (openTag.isJsonLd) {
60
- const content = normalizeJsonLdContent(html.slice(contentStart, contentEnd));
59
+ if (openTag.hasSupportedMime) {
60
+ const content = normalizeJsonScriptContent(html.slice(contentStart, contentEnd));
61
61
  if (content.length > 0) results.push(content);
62
62
  }
63
63
  if (closeTagStart === -1) break;
@@ -104,13 +104,13 @@ function findNextScriptOpenTag(html, start) {
104
104
  }
105
105
  function parseScriptOpenTag(html, openTagStart) {
106
106
  let cursor = openTagStart + 1 + 6;
107
- let isJsonLd = false;
107
+ let hasSupportedMime = false;
108
108
  while (cursor < html.length) {
109
109
  cursor = skipLeadingHtmlWhitespace(html, cursor, html.length);
110
110
  if (cursor >= html.length) break;
111
111
  const code = html.charCodeAt(cursor);
112
112
  if (code === CHAR_GREATER_THAN) return {
113
- isJsonLd,
113
+ hasSupportedMime,
114
114
  tagEnd: cursor
115
115
  };
116
116
  if (isHtmlWhitespace(code) || code === CHAR_SLASH) {
@@ -146,11 +146,11 @@ function parseScriptOpenTag(html, openTagStart) {
146
146
  }
147
147
  valueEnd = cursor;
148
148
  }
149
- if (isTypeAttribute && isApplicationLdJsonMime(html, valueStart, valueEnd)) isJsonLd = true;
149
+ if (isTypeAttribute && isSupportedJsonScriptMime(html, valueStart, valueEnd)) hasSupportedMime = true;
150
150
  }
151
151
  return null;
152
152
  }
153
- function normalizeJsonLdContent(content) {
153
+ function normalizeJsonScriptContent(content) {
154
154
  let start = 0;
155
155
  let end = content.length;
156
156
  let changed = false;
@@ -224,13 +224,17 @@ function findNextCloseTag(html, start, tagName) {
224
224
  }
225
225
  return -1;
226
226
  }
227
- function isApplicationLdJsonMime(value, start, end) {
227
+ function isSupportedJsonScriptMime(value, start, end) {
228
228
  start = skipLeadingHtmlWhitespace(value, start, end);
229
229
  end = skipTrailingHtmlWhitespace(value, start, end);
230
- if (end - start < 19) return false;
231
- if (!matchesAsciiLiteral(value, start, APPLICATION_LD_JSON_MIME)) return false;
232
- if (end - start === 19) return true;
233
- const suffixStart = start + 19;
230
+ for (const mime of SUPPORTED_JSON_SCRIPT_MIME_TYPES) if (matchesMimeType(value, start, end, mime)) return true;
231
+ return false;
232
+ }
233
+ function matchesMimeType(value, start, end, expectedMime) {
234
+ if (end - start < expectedMime.length) return false;
235
+ if (!matchesAsciiLiteral(value, start, expectedMime)) return false;
236
+ if (end - start === expectedMime.length) return true;
237
+ const suffixStart = start + expectedMime.length;
234
238
  const nextCode = value.charCodeAt(suffixStart);
235
239
  if (nextCode === CHAR_SEMICOLON) return true;
236
240
  if (!isHtmlWhitespace(nextCode)) return false;
@@ -305,8 +309,8 @@ function isTagStartChar(code) {
305
309
  }
306
310
  //#endregion
307
311
  //#region src/parse.ts
308
- function extractJsonLd(html) {
309
- const rawEntries = extractJsonLdStrings(html);
312
+ function extractJson(html) {
313
+ const rawEntries = extractJsonStrings(html);
310
314
  const results = [];
311
315
  for (const rawEntry of rawEntries) try {
312
316
  results.push(JSON.parse(rawEntry));
@@ -314,4 +318,26 @@ function extractJsonLd(html) {
314
318
  return results;
315
319
  }
316
320
  //#endregion
317
- export { extractJsonLd, extractJsonLdStrings };
321
+ //#region src/helpers.ts
322
+ const isRecord = (value) => {
323
+ return typeof value === "object" && value !== null && !Array.isArray(value);
324
+ };
325
+ const asRecord = (value) => {
326
+ return isRecord(value) ? value : void 0;
327
+ };
328
+ const asArray = (value) => {
329
+ return Array.isArray(value) ? value : void 0;
330
+ };
331
+ function getJsonLdRecords(value) {
332
+ const items = asArray(value);
333
+ if (items) return items.flatMap(getJsonLdRecords);
334
+ const record = asRecord(value);
335
+ if (!record) return [];
336
+ return [record, ...getJsonLdRecords(record["@graph"])];
337
+ }
338
+ function getJsonLdItems(value) {
339
+ if (value === void 0) return [];
340
+ return asArray(value) ?? [value];
341
+ }
342
+ //#endregion
343
+ export { extractJson, extractJsonStrings, getJsonLdItems, getJsonLdRecords };
package/package.json CHANGED
@@ -1,9 +1,17 @@
1
1
  {
2
2
  "name": "html-json-extractor",
3
- "version": "0.0.1",
4
- "description": "Fast, forgiving extraction of application/ld+json script blocks from HTML strings.",
3
+ "version": "0.2.0",
4
+ "description": "Fast, forgiving extraction of application/json and application/ld+json script blocks from HTML strings.",
5
5
  "license": "MIT",
6
6
  "author": "VastBlast",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "git+https://github.com/VastBlast/html-json-extractor.git"
10
+ },
11
+ "bugs": {
12
+ "url": "https://github.com/VastBlast/html-json-extractor/issues"
13
+ },
14
+ "homepage": "https://github.com/VastBlast/html-json-extractor#readme",
7
15
  "type": "module",
8
16
  "sideEffects": false,
9
17
  "files": [
@@ -11,6 +19,8 @@
11
19
  ],
12
20
  "keywords": [
13
21
  "html",
22
+ "json",
23
+ "application/json",
14
24
  "json-ld",
15
25
  "ld+json",
16
26
  "schema",
@@ -42,10 +52,9 @@
42
52
  },
43
53
  "main": "./dist/index.cjs",
44
54
  "module": "./dist/index.js",
45
- "types": "./dist/index.d.ts",
55
+ "types": "./dist/index.d.cts",
46
56
  "exports": {
47
57
  ".": {
48
- "types": "./dist/index.d.ts",
49
58
  "import": "./dist/index.js",
50
59
  "require": "./dist/index.cjs"
51
60
  },