@j0hanz/superfetch 1.2.1 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/README.md +60 -45
  2. package/dist/config/formatting.d.ts +1 -1
  3. package/dist/config/types/content.d.ts +3 -3
  4. package/dist/config/types/runtime.d.ts +1 -1
  5. package/dist/config/types/tools.d.ts +12 -12
  6. package/dist/http/cors.js +23 -23
  7. package/dist/http/download-routes.js +11 -5
  8. package/dist/http/mcp-routes.js +2 -13
  9. package/dist/http/mcp-validation.js +1 -1
  10. package/dist/http/server-middleware.js +5 -3
  11. package/dist/http/server.js +2 -0
  12. package/dist/index.js +5 -0
  13. package/dist/middleware/error-handler.js +1 -1
  14. package/dist/resources/cached-content.js +8 -4
  15. package/dist/server.js +2 -0
  16. package/dist/services/cache.d.ts +2 -1
  17. package/dist/services/cache.js +23 -7
  18. package/dist/services/context.d.ts +4 -4
  19. package/dist/services/context.js +11 -1
  20. package/dist/services/extractor.js +26 -21
  21. package/dist/services/fetcher/agents.js +55 -1
  22. package/dist/services/fetcher/interceptors.d.ts +22 -0
  23. package/dist/services/fetcher/interceptors.js +57 -26
  24. package/dist/services/fetcher/response.d.ts +1 -1
  25. package/dist/services/fetcher/response.js +37 -16
  26. package/dist/services/fetcher.d.ts +1 -1
  27. package/dist/services/fetcher.js +9 -8
  28. package/dist/services/metadata-collector.d.ts +10 -0
  29. package/dist/services/metadata-collector.js +11 -0
  30. package/dist/services/parser.d.ts +5 -1
  31. package/dist/services/parser.js +82 -11
  32. package/dist/services/transform-worker-pool.d.ts +14 -0
  33. package/dist/services/transform-worker-pool.js +167 -0
  34. package/dist/tools/handlers/fetch-markdown.tool.d.ts +9 -1
  35. package/dist/tools/handlers/fetch-markdown.tool.js +58 -30
  36. package/dist/tools/handlers/fetch-single.shared.d.ts +8 -3
  37. package/dist/tools/handlers/fetch-single.shared.js +42 -17
  38. package/dist/tools/handlers/fetch-url.tool.js +46 -16
  39. package/dist/tools/index.js +13 -0
  40. package/dist/tools/schemas.d.ts +19 -16
  41. package/dist/tools/schemas.js +25 -4
  42. package/dist/tools/utils/common.js +20 -16
  43. package/dist/tools/utils/content-transform-async.d.ts +6 -0
  44. package/dist/tools/utils/content-transform-async.js +33 -0
  45. package/dist/tools/utils/content-transform.d.ts +4 -1
  46. package/dist/tools/utils/content-transform.js +37 -3
  47. package/dist/tools/utils/fetch-pipeline.js +26 -15
  48. package/dist/utils/content-cleaner.d.ts +1 -1
  49. package/dist/utils/download-url.d.ts +9 -1
  50. package/dist/utils/download-url.js +9 -6
  51. package/dist/utils/tool-error-handler.d.ts +2 -2
  52. package/dist/utils/tool-error-handler.js +7 -7
  53. package/dist/utils/url-validator.d.ts +5 -0
  54. package/dist/utils/url-validator.js +45 -3
  55. package/dist/workers/transform-worker.d.ts +1 -0
  56. package/dist/workers/transform-worker.js +50 -0
  57. package/package.json +4 -6
@@ -3,17 +3,7 @@ import { Readability } from '@mozilla/readability';
3
3
  import { getErrorMessage } from '../utils/error-utils.js';
4
4
  import { truncateHtml } from '../utils/html-truncator.js';
5
5
  import { logError, logInfo, logWarn } from './logger.js';
6
- function resolveMetaField(state, field) {
7
- const sources = state[field];
8
- return sources.og ?? sources.twitter ?? sources.standard;
9
- }
10
- function createMetaCollectorState() {
11
- return {
12
- title: {},
13
- description: {},
14
- author: {},
15
- };
16
- }
6
+ import { createMetaCollectorState, resolveMetaField, } from './metadata-collector.js';
17
7
  function collectMetaTag(state, tag) {
18
8
  const content = getMetaContent(tag);
19
9
  if (!content)
@@ -76,11 +66,17 @@ function extractMetadata(document) {
76
66
  const state = createMetaCollectorState();
77
67
  scanMetaTags(document, state);
78
68
  ensureTitleFallback(document, state);
79
- return {
80
- title: resolveMetaField(state, 'title'),
81
- description: resolveMetaField(state, 'description'),
82
- author: resolveMetaField(state, 'author'),
83
- };
69
+ const metadata = {};
70
+ const title = resolveMetaField(state, 'title');
71
+ const description = resolveMetaField(state, 'description');
72
+ const author = resolveMetaField(state, 'author');
73
+ if (title !== undefined)
74
+ metadata.title = title;
75
+ if (description !== undefined)
76
+ metadata.description = description;
77
+ if (author !== undefined)
78
+ metadata.author = author;
79
+ return metadata;
84
80
  }
85
81
  function isReadabilityCompatible(doc) {
86
82
  if (!doc || typeof doc !== 'object')
@@ -113,14 +109,23 @@ function parseReadabilityArticle(document) {
113
109
  }
114
110
  }
115
111
  function mapReadabilityResult(parsed) {
116
- return {
117
- title: toOptional(parsed.title),
118
- byline: toOptional(parsed.byline),
112
+ const article = {
119
113
  content: parsed.content ?? '',
120
114
  textContent: parsed.textContent ?? '',
121
- excerpt: toOptional(parsed.excerpt),
122
- siteName: toOptional(parsed.siteName),
123
115
  };
116
+ const title = toOptional(parsed.title);
117
+ if (title !== undefined)
118
+ article.title = title;
119
+ const byline = toOptional(parsed.byline);
120
+ if (byline !== undefined)
121
+ article.byline = byline;
122
+ const excerpt = toOptional(parsed.excerpt);
123
+ if (excerpt !== undefined)
124
+ article.excerpt = excerpt;
125
+ const siteName = toOptional(parsed.siteName);
126
+ if (siteName !== undefined)
127
+ article.siteName = siteName;
128
+ return article;
124
129
  }
125
130
  function toOptional(value) {
126
131
  return value ?? undefined;
@@ -3,9 +3,26 @@ import os from 'node:os';
3
3
  import { Agent } from 'undici';
4
4
  import { createErrorWithCode } from '../../utils/error-utils.js';
5
5
  import { isBlockedIp } from '../../utils/url-validator.js';
6
+ const DNS_LOOKUP_TIMEOUT_MS = 5000;
6
7
  function resolveDns(hostname, options, callback) {
7
8
  const { normalizedOptions, useAll, resolvedFamily } = buildLookupContext(options);
8
- dns.lookup(hostname, { ...normalizedOptions, all: true }, createLookupCallback(hostname, resolvedFamily, useAll, callback));
9
+ const lookupOptions = buildLookupOptions(normalizedOptions);
10
+ let done = false;
11
+ const timer = setTimeout(() => {
12
+ if (done)
13
+ return;
14
+ done = true;
15
+ callback(createErrorWithCode(`DNS lookup timed out for ${hostname}`, 'ETIMEOUT'), []);
16
+ }, DNS_LOOKUP_TIMEOUT_MS);
17
+ timer.unref();
18
+ const safeCallback = (err, address, family) => {
19
+ if (done)
20
+ return;
21
+ done = true;
22
+ clearTimeout(timer);
23
+ callback(err, address, family);
24
+ };
25
+ dns.lookup(hostname, lookupOptions, createLookupCallback(hostname, resolvedFamily, useAll, safeCallback));
9
26
  }
10
27
  function normalizeLookupOptions(options) {
11
28
  return typeof options === 'number' ? { family: options } : options;
@@ -18,6 +35,29 @@ function buildLookupContext(options) {
18
35
  resolvedFamily: resolveFamily(normalizedOptions.family),
19
36
  };
20
37
  }
38
+ const DEFAULT_DNS_ORDER = 'verbatim';
39
+ function resolveResultOrder(options) {
40
+ if (options.order)
41
+ return options.order;
42
+ const legacyVerbatim = getLegacyVerbatim(options);
43
+ if (legacyVerbatim !== undefined) {
44
+ return legacyVerbatim ? 'verbatim' : 'ipv4first';
45
+ }
46
+ return DEFAULT_DNS_ORDER;
47
+ }
48
+ function getLegacyVerbatim(options) {
49
+ const legacy = options.verbatim;
50
+ return typeof legacy === 'boolean' ? legacy : undefined;
51
+ }
52
+ function buildLookupOptions(normalizedOptions) {
53
+ const options = {
54
+ ...normalizedOptions,
55
+ order: resolveResultOrder(normalizedOptions),
56
+ all: true,
57
+ };
58
+ delete options.verbatim;
59
+ return options;
60
+ }
21
61
  function createLookupCallback(hostname, resolvedFamily, useAll, callback) {
22
62
  return (err, addresses) => {
23
63
  handleLookupResult(err, addresses, hostname, resolvedFamily, useAll, callback);
@@ -42,6 +82,11 @@ function handleLookupResult(error, addresses, hostname, resolvedFamily, useAll,
42
82
  return;
43
83
  }
44
84
  const list = normalizeLookupResults(addresses, resolvedFamily);
85
+ const invalidFamilyError = findInvalidFamilyError(list, hostname);
86
+ if (invalidFamilyError) {
87
+ callback(invalidFamilyError, list);
88
+ return;
89
+ }
45
90
  const blockedError = findBlockedIpError(list, hostname);
46
91
  if (blockedError) {
47
92
  callback(blockedError, list);
@@ -89,6 +134,15 @@ function findBlockedIpError(list, hostname) {
89
134
  }
90
135
  return null;
91
136
  }
137
+ function findInvalidFamilyError(list, hostname) {
138
+ for (const addr of list) {
139
+ const family = typeof addr === 'string' ? 0 : addr.family;
140
+ if (family === 4 || family === 6)
141
+ continue;
142
+ return createErrorWithCode(`Invalid address family returned for ${hostname}`, 'EINVAL');
143
+ }
144
+ return null;
145
+ }
92
146
  function createNoDnsResultsError(hostname) {
93
147
  return createErrorWithCode(`No DNS results returned for ${hostname}`, 'ENODATA');
94
148
  }
@@ -1,3 +1,25 @@
1
+ export type FetchChannelEvent = {
2
+ v: 1;
3
+ type: 'start';
4
+ requestId: string;
5
+ method: string;
6
+ url: string;
7
+ } | {
8
+ v: 1;
9
+ type: 'end';
10
+ requestId: string;
11
+ status: number;
12
+ duration: number;
13
+ } | {
14
+ v: 1;
15
+ type: 'error';
16
+ requestId: string;
17
+ url: string;
18
+ error: string;
19
+ code?: string;
20
+ status?: number;
21
+ duration: number;
22
+ };
1
23
  interface FetchTelemetryContext {
2
24
  requestId: string;
3
25
  startTime: number;
@@ -4,21 +4,44 @@ import { performance } from 'node:perf_hooks';
4
4
  import { isSystemError } from '../../utils/error-utils.js';
5
5
  import { logDebug, logError, logWarn } from '../logger.js';
6
6
  const fetchChannel = diagnosticsChannel.channel('superfetch.fetch');
7
+ function redactUrl(rawUrl) {
8
+ try {
9
+ const url = new URL(rawUrl);
10
+ url.username = '';
11
+ url.password = '';
12
+ url.hash = '';
13
+ url.search = '';
14
+ return url.toString();
15
+ }
16
+ catch {
17
+ return rawUrl;
18
+ }
19
+ }
20
+ function publishFetchEvent(event) {
21
+ if (!fetchChannel.hasSubscribers)
22
+ return;
23
+ try {
24
+ fetchChannel.publish(event);
25
+ }
26
+ catch {
27
+ // Avoid crashing the publisher if a subscriber throws.
28
+ }
29
+ }
7
30
  export function startFetchTelemetry(url, method) {
31
+ const safeUrl = redactUrl(url);
8
32
  const context = {
9
33
  requestId: randomUUID(),
10
34
  startTime: performance.now(),
11
- url,
35
+ url: safeUrl,
12
36
  method: method.toUpperCase(),
13
37
  };
14
- if (fetchChannel.hasSubscribers) {
15
- fetchChannel.publish({
16
- type: 'start',
17
- requestId: context.requestId,
18
- method: context.method,
19
- url: context.url,
20
- });
21
- }
38
+ publishFetchEvent({
39
+ v: 1,
40
+ type: 'start',
41
+ requestId: context.requestId,
42
+ method: context.method,
43
+ url: context.url,
44
+ });
22
45
  logDebug('HTTP Request', {
23
46
  requestId: context.requestId,
24
47
  method: context.method,
@@ -38,9 +61,8 @@ export function recordFetchResponse(context, response, contentSize) {
38
61
  logSlowRequestIfNeeded(context, duration);
39
62
  }
40
63
  function publishFetchEnd(context, status, duration) {
41
- if (!fetchChannel.hasSubscribers)
42
- return;
43
- fetchChannel.publish({
64
+ publishFetchEvent({
65
+ v: 1,
44
66
  type: 'end',
45
67
  requestId: context.requestId,
46
68
  status,
@@ -48,13 +70,18 @@ function publishFetchEnd(context, status, duration) {
48
70
  });
49
71
  }
50
72
  function buildResponseMeta(response, contentSize, duration) {
51
- const contentType = response.headers.get('content-type') ?? undefined;
52
73
  const contentLength = response.headers.get('content-length') ?? contentSize?.toString();
53
- return {
54
- contentType,
74
+ const meta = {
55
75
  duration: `${Math.round(duration)}ms`,
56
- size: contentLength,
57
76
  };
77
+ const contentType = response.headers.get('content-type');
78
+ if (contentType !== null) {
79
+ meta.contentType = contentType;
80
+ }
81
+ if (contentLength !== undefined) {
82
+ meta.size = contentLength;
83
+ }
84
+ return meta;
58
85
  }
59
86
  function logSlowRequestIfNeeded(context, duration) {
60
87
  if (duration <= 5000)
@@ -69,17 +96,21 @@ export function recordFetchError(context, error, status) {
69
96
  const duration = performance.now() - context.startTime;
70
97
  const err = error instanceof Error ? error : new Error(String(error));
71
98
  const code = isSystemError(err) ? err.code : undefined;
72
- if (fetchChannel.hasSubscribers) {
73
- fetchChannel.publish({
74
- type: 'error',
75
- requestId: context.requestId,
76
- url: context.url,
77
- error: err.message,
78
- code,
79
- status,
80
- duration,
81
- });
99
+ const event = {
100
+ v: 1,
101
+ type: 'error',
102
+ requestId: context.requestId,
103
+ url: context.url,
104
+ error: err.message,
105
+ duration,
106
+ };
107
+ if (code !== undefined) {
108
+ event.code = code;
109
+ }
110
+ if (status !== undefined) {
111
+ event.status = status;
82
112
  }
113
+ publishFetchEvent(event);
83
114
  const log = status === 429 ? logWarn : logError;
84
115
  log('HTTP Request Error', {
85
116
  requestId: context.requestId,
@@ -1,4 +1,4 @@
1
- export declare function readResponseText(response: Response, url: string, maxBytes: number): Promise<{
1
+ export declare function readResponseText(response: Response, url: string, maxBytes: number, signal?: AbortSignal): Promise<{
2
2
  text: string;
3
3
  size: number;
4
4
  }>;
@@ -1,3 +1,5 @@
1
+ import { Readable, Writable } from 'node:stream';
2
+ import { pipeline } from 'node:stream/promises';
1
3
  import { FetchError } from '../../errors/app-error.js';
2
4
  function assertContentLengthWithinLimit(response, url, maxBytes) {
3
5
  const contentLengthHeader = response.headers.get('content-length');
@@ -9,30 +11,49 @@ function assertContentLengthWithinLimit(response, url, maxBytes) {
9
11
  }
10
12
  throw new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url);
11
13
  }
12
- async function readStreamWithLimit(stream, url, maxBytes) {
13
- const reader = stream.getReader();
14
+ async function readStreamWithLimit(stream, url, maxBytes, signal) {
14
15
  const decoder = new TextDecoder();
15
16
  let total = 0;
16
- const chunks = [];
17
- for (;;) {
18
- const { value, done } = await reader.read();
19
- if (done)
20
- break;
21
- total += value.byteLength;
22
- if (total > maxBytes) {
23
- await reader.cancel();
24
- throw new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url);
17
+ let text = '';
18
+ const toBuffer = (chunk) => {
19
+ if (typeof chunk === 'string') {
20
+ return Buffer.from(chunk);
25
21
  }
26
- chunks.push(decoder.decode(value, { stream: true }));
22
+ return Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
23
+ };
24
+ const sink = new Writable({
25
+ write(chunk, _encoding, callback) {
26
+ const buffer = toBuffer(chunk);
27
+ total += buffer.length;
28
+ if (total > maxBytes) {
29
+ callback(new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url));
30
+ return;
31
+ }
32
+ text += decoder.decode(buffer, { stream: true });
33
+ callback();
34
+ },
35
+ final(callback) {
36
+ text += decoder.decode();
37
+ callback();
38
+ },
39
+ });
40
+ try {
41
+ const readable = Readable.fromWeb(stream, { signal });
42
+ await pipeline(readable, sink, { signal });
27
43
  }
28
- chunks.push(decoder.decode());
29
- return { text: chunks.join(''), size: total };
44
+ catch (error) {
45
+ if (signal?.aborted) {
46
+ throw new FetchError('Request was aborted during response read', url, 499, { reason: 'aborted' });
47
+ }
48
+ throw error;
49
+ }
50
+ return { text, size: total };
30
51
  }
31
- export async function readResponseText(response, url, maxBytes) {
52
+ export async function readResponseText(response, url, maxBytes, signal) {
32
53
  assertContentLengthWithinLimit(response, url, maxBytes);
33
54
  if (!response.body) {
34
55
  const text = await response.text();
35
56
  return { text, size: Buffer.byteLength(text) };
36
57
  }
37
- return readStreamWithLimit(response.body, url, maxBytes);
58
+ return readStreamWithLimit(response.body, url, maxBytes, signal);
38
59
  }
@@ -1,4 +1,4 @@
1
1
  import type { FetchOptions } from '../config/types/runtime.js';
2
2
  import { destroyAgents } from './fetcher/agents.js';
3
3
  export { destroyAgents };
4
- export declare function fetchUrlWithRetry(url: string, options?: FetchOptions, maxRetries?: number): Promise<string>;
4
+ export declare function fetchNormalizedUrlWithRetry(normalizedUrl: string, options?: FetchOptions, maxRetries?: number): Promise<string>;
@@ -1,6 +1,5 @@
1
1
  import { config } from '../config/index.js';
2
2
  import { normalizeHeaderRecord } from '../utils/header-normalizer.js';
3
- import { validateAndNormalizeUrl } from '../utils/url-validator.js';
4
3
  import { destroyAgents, dispatcher } from './fetcher/agents.js';
5
4
  import { createHttpError, createRateLimitError, mapFetchError, } from './fetcher/errors.js';
6
5
  import { recordFetchError, recordFetchResponse, startFetchTelemetry, } from './fetcher/interceptors.js';
@@ -39,7 +38,7 @@ function buildRequestInit(headers, signal) {
39
38
  dispatcher,
40
39
  };
41
40
  }
42
- async function handleFetchResponse(response, finalUrl, telemetry) {
41
+ async function handleFetchResponse(response, finalUrl, telemetry, signal) {
43
42
  if (response.status === 429) {
44
43
  void response.body?.cancel();
45
44
  throw createRateLimitError(finalUrl, response.headers.get('retry-after'));
@@ -48,7 +47,7 @@ async function handleFetchResponse(response, finalUrl, telemetry) {
48
47
  void response.body?.cancel();
49
48
  throw createHttpError(finalUrl, response.status, response.statusText);
50
49
  }
51
- const { text, size } = await readResponseText(response, finalUrl, config.fetcher.maxContentLength);
50
+ const { text, size } = await readResponseText(response, finalUrl, config.fetcher.maxContentLength, signal);
52
51
  recordFetchResponse(telemetry, response, size);
53
52
  return text;
54
53
  }
@@ -57,7 +56,7 @@ async function fetchWithTelemetry(normalizedUrl, requestInit, timeoutMs) {
57
56
  try {
58
57
  const { response, url: finalUrl } = await fetchWithRedirects(normalizedUrl, requestInit, config.fetcher.maxRedirects);
59
58
  telemetry.url = finalUrl;
60
- return await handleFetchResponse(response, finalUrl, telemetry);
59
+ return await handleFetchResponse(response, finalUrl, telemetry, requestInit.signal ?? undefined);
61
60
  }
62
61
  catch (error) {
63
62
  const mapped = mapFetchError(error, normalizedUrl, timeoutMs);
@@ -66,17 +65,19 @@ async function fetchWithTelemetry(normalizedUrl, requestInit, timeoutMs) {
66
65
  throw mapped;
67
66
  }
68
67
  }
69
- export async function fetchUrlWithRetry(url, options, maxRetries = 3) {
70
- const normalizedUrl = await validateAndNormalizeUrl(url);
68
+ export async function fetchNormalizedUrlWithRetry(normalizedUrl, options, maxRetries = 3) {
71
69
  const context = buildRequestContext(options);
72
70
  return executeWithRetry(normalizedUrl, maxRetries, async () => runFetch(normalizedUrl, context), context.signal);
73
71
  }
74
72
  function buildRequestContext(options) {
75
- return {
73
+ const context = {
76
74
  timeoutMs: options?.timeout ?? config.fetcher.timeout,
77
75
  headers: buildHeaders(options?.customHeaders),
78
- signal: options?.signal,
79
76
  };
77
+ if (options?.signal) {
78
+ context.signal = options.signal;
79
+ }
80
+ return context;
80
81
  }
81
82
  async function runFetch(normalizedUrl, context) {
82
83
  const signal = buildRequestSignal(context.timeoutMs, context.signal);
@@ -0,0 +1,10 @@
1
+ import type { ExtractedMetadata } from '../config/types/content.js';
2
+ export type MetaSource = 'og' | 'twitter' | 'standard';
3
+ export type MetaField = keyof ExtractedMetadata;
4
+ export interface MetaCollectorState {
5
+ title: Partial<Record<MetaSource, string>>;
6
+ description: Partial<Record<MetaSource, string>>;
7
+ author: Partial<Record<MetaSource, string>>;
8
+ }
9
+ export declare function createMetaCollectorState(): MetaCollectorState;
10
+ export declare function resolveMetaField(state: MetaCollectorState, field: MetaField): string | undefined;
@@ -0,0 +1,11 @@
1
+ export function createMetaCollectorState() {
2
+ return {
3
+ title: {},
4
+ description: {},
5
+ author: {},
6
+ };
7
+ }
8
+ export function resolveMetaField(state, field) {
9
+ const sources = state[field];
10
+ return sources.og ?? sources.twitter ?? sources.standard;
11
+ }
@@ -1,2 +1,6 @@
1
- import type { ContentBlockUnion } from '../config/types/content.js';
1
+ import type { ContentBlockUnion, ExtractedMetadata } from '../config/types/content.js';
2
2
  export declare function parseHtml(html: string): ContentBlockUnion[];
3
+ export declare function parseHtmlWithMetadata(html: string): {
4
+ blocks: ContentBlockUnion[];
5
+ metadata: ExtractedMetadata;
6
+ };
@@ -6,7 +6,57 @@ import { getErrorMessage } from '../utils/error-utils.js';
6
6
  import { truncateHtml } from '../utils/html-truncator.js';
7
7
  import { sanitizeText } from '../utils/sanitizer.js';
8
8
  import { logWarn } from './logger.js';
9
+ import { createMetaCollectorState, resolveMetaField, } from './metadata-collector.js';
9
10
  const CONTENT_SELECTOR = 'h1, h2, h3, h4, h5, h6, p, ul, ol, pre, code:not(pre code), table, img, blockquote';
11
+ function extractMetadata($) {
12
+ const state = createMetaCollectorState();
13
+ $('meta').each((_, element) => {
14
+ const content = $(element).attr('content')?.trim();
15
+ if (!content)
16
+ return;
17
+ const property = $(element).attr('property');
18
+ if (property?.startsWith('og:')) {
19
+ const key = property.replace('og:', '');
20
+ if (key === 'title')
21
+ state.title.og = content;
22
+ if (key === 'description')
23
+ state.description.og = content;
24
+ return;
25
+ }
26
+ const name = $(element).attr('name');
27
+ if (name?.startsWith('twitter:')) {
28
+ const key = name.replace('twitter:', '');
29
+ if (key === 'title')
30
+ state.title.twitter = content;
31
+ if (key === 'description')
32
+ state.description.twitter = content;
33
+ return;
34
+ }
35
+ if (name === 'description') {
36
+ state.description.standard = content;
37
+ }
38
+ if (name === 'author') {
39
+ state.author.standard = content;
40
+ }
41
+ });
42
+ if (!state.title.standard) {
43
+ const titleText = $('title').first().text().trim();
44
+ if (titleText) {
45
+ state.title.standard = titleText;
46
+ }
47
+ }
48
+ const metadata = {};
49
+ const title = resolveMetaField(state, 'title');
50
+ const description = resolveMetaField(state, 'description');
51
+ const author = resolveMetaField(state, 'author');
52
+ if (title !== undefined)
53
+ metadata.title = title;
54
+ if (description !== undefined)
55
+ metadata.description = description;
56
+ if (author !== undefined)
57
+ metadata.author = author;
58
+ return metadata;
59
+ }
10
60
  function parseHeading($, element) {
11
61
  const rawText = sanitizeText($(element).text());
12
62
  const text = cleanHeading(rawText);
@@ -55,11 +105,14 @@ function parseCode($, element) {
55
105
  const dataLang = $(element).attr('data-language') ?? '';
56
106
  const language = resolveLanguageFromAttributes(className, dataLang) ??
57
107
  detectLanguageFromCode(text);
58
- return {
108
+ const block = {
59
109
  type: 'code',
60
- language,
61
110
  text,
62
111
  };
112
+ if (language !== undefined) {
113
+ block.language = language;
114
+ }
115
+ return block;
63
116
  }
64
117
  function parseTable($, element) {
65
118
  const headers = [];
@@ -90,21 +143,23 @@ function parseTable($, element) {
90
143
  });
91
144
  if (rows.length === 0)
92
145
  return null;
93
- return {
94
- type: 'table',
95
- headers: headers.length > 0 ? headers : undefined,
96
- rows,
97
- };
146
+ return headers.length > 0
147
+ ? { type: 'table', headers, rows }
148
+ : { type: 'table', rows };
98
149
  }
99
150
  function parseImage($, element) {
100
151
  const src = $(element).attr('src');
101
152
  if (!src)
102
153
  return null;
103
- return {
154
+ const alt = $(element).attr('alt');
155
+ const image = {
104
156
  type: 'image',
105
157
  src,
106
- alt: $(element).attr('alt') ?? undefined,
107
158
  };
159
+ if (alt !== undefined) {
160
+ image.alt = alt;
161
+ }
162
+ return image;
108
163
  }
109
164
  function parseBlockquote($, element) {
110
165
  const rawText = sanitizeText($(element).text());
@@ -173,6 +228,10 @@ function loadHtml(html) {
173
228
  return null;
174
229
  }
175
230
  }
231
+ function prepareCheerio(html) {
232
+ const processedHtml = truncateHtml(html);
233
+ return loadHtml(processedHtml);
234
+ }
176
235
  function removeNoiseElements($) {
177
236
  $('script, style, noscript, iframe, svg').remove();
178
237
  }
@@ -198,10 +257,22 @@ function safeParseElement($, element) {
198
257
  export function parseHtml(html) {
199
258
  if (!html || typeof html !== 'string')
200
259
  return [];
201
- const processedHtml = truncateHtml(html);
202
- const $ = loadHtml(processedHtml);
260
+ const $ = prepareCheerio(html);
203
261
  if (!$)
204
262
  return [];
205
263
  removeNoiseElements($);
206
264
  return filterBlocks(collectBlocks($));
207
265
  }
266
+ export function parseHtmlWithMetadata(html) {
267
+ if (!html || typeof html !== 'string') {
268
+ return { blocks: [], metadata: {} };
269
+ }
270
+ const $ = prepareCheerio(html);
271
+ if (!$) {
272
+ return { blocks: [], metadata: {} };
273
+ }
274
+ const metadata = extractMetadata($);
275
+ removeNoiseElements($);
276
+ const blocks = filterBlocks(collectBlocks($));
277
+ return { blocks, metadata };
278
+ }
@@ -0,0 +1,14 @@
1
+ import type { JsonlTransformResult, MarkdownTransformResult, TransformOptions } from '../config/types/content.js';
2
+ type TransformMode = 'jsonl' | 'markdown' | 'markdown-blocks';
3
+ export interface TransformJob {
4
+ mode: TransformMode;
5
+ html: string;
6
+ url: string;
7
+ options: TransformOptions & {
8
+ includeContentBlocks?: boolean;
9
+ };
10
+ }
11
+ type TransformResult = JsonlTransformResult | MarkdownTransformResult;
12
+ export declare function runTransformInWorker(job: TransformJob): Promise<TransformResult | null>;
13
+ export declare function destroyTransformWorkers(): void;
14
+ export {};