@j0hanz/superfetch 2.2.1 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +363 -614
- package/dist/cache.d.ts +2 -2
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +47 -225
- package/dist/cache.js.map +1 -1
- package/dist/config.d.ts +6 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +20 -27
- package/dist/config.js.map +1 -1
- package/dist/dom-noise-removal.d.ts +6 -0
- package/dist/dom-noise-removal.d.ts.map +1 -0
- package/dist/dom-noise-removal.js +482 -0
- package/dist/dom-noise-removal.js.map +1 -0
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +8 -5
- package/dist/errors.js.map +1 -1
- package/dist/fetch.d.ts.map +1 -1
- package/dist/fetch.js +26 -32
- package/dist/fetch.js.map +1 -1
- package/dist/http-native.d.ts +6 -0
- package/dist/http-native.d.ts.map +1 -0
- package/dist/http-native.js +645 -0
- package/dist/http-native.js.map +1 -0
- package/dist/http-utils.d.ts +61 -0
- package/dist/http-utils.d.ts.map +1 -0
- package/dist/http-utils.js +252 -0
- package/dist/http-utils.js.map +1 -0
- package/dist/index.js +1 -1
- package/dist/index.js.map +1 -1
- package/dist/instructions.md +41 -39
- package/dist/json.d.ts +2 -0
- package/dist/json.d.ts.map +1 -0
- package/dist/json.js +30 -0
- package/dist/json.js.map +1 -0
- package/dist/language-detection.d.ts +13 -0
- package/dist/language-detection.d.ts.map +1 -0
- package/dist/language-detection.js +283 -0
- package/dist/language-detection.js.map +1 -0
- package/dist/markdown-cleanup.d.ts +19 -0
- package/dist/markdown-cleanup.d.ts.map +1 -0
- package/dist/markdown-cleanup.js +283 -0
- package/dist/markdown-cleanup.js.map +1 -0
- package/dist/observability.d.ts +1 -0
- package/dist/observability.d.ts.map +1 -1
- package/dist/observability.js +10 -0
- package/dist/observability.js.map +1 -1
- package/dist/tools.js +4 -4
- package/dist/transform-types.d.ts +81 -0
- package/dist/transform-types.d.ts.map +1 -0
- package/dist/transform-types.js +6 -0
- package/dist/transform-types.js.map +1 -0
- package/dist/transform.d.ts +7 -52
- package/dist/transform.d.ts.map +1 -1
- package/dist/transform.js +411 -839
- package/dist/transform.js.map +1 -1
- package/dist/type-guards.d.ts +1 -1
- package/dist/type-guards.d.ts.map +1 -1
- package/dist/type-guards.js +1 -1
- package/dist/type-guards.js.map +1 -1
- package/dist/workers/transform-worker.js +23 -24
- package/dist/workers/transform-worker.js.map +1 -1
- package/package.json +85 -86
- package/dist/http.d.ts +0 -90
- package/dist/http.d.ts.map +0 -1
- package/dist/http.js +0 -1576
- package/dist/http.js.map +0 -1
package/dist/tools.js
CHANGED
|
@@ -6,7 +6,7 @@ import { FetchError, getErrorMessage, isSystemError } from './errors.js';
|
|
|
6
6
|
import { fetchNormalizedUrl, normalizeUrl, transformToRawUrl, } from './fetch.js';
|
|
7
7
|
import { getRequestId, logDebug, logError, logWarn, runWithRequestContext, } from './observability.js';
|
|
8
8
|
import { transformHtmlToMarkdown, } from './transform.js';
|
|
9
|
-
import {
|
|
9
|
+
import { isObject } from './type-guards.js';
|
|
10
10
|
const TRUNCATION_MARKER = '...[truncated]';
|
|
11
11
|
const FETCH_PROGRESS_TOTAL = 4;
|
|
12
12
|
const fetchUrlInputSchema = z.strictObject({
|
|
@@ -246,7 +246,7 @@ function persistCache({ cacheKey, data, serialize, normalizedUrl, }) {
|
|
|
246
246
|
cache.set(cacheKey, serializer(data), metadata);
|
|
247
247
|
}
|
|
248
248
|
function extractTitle(value) {
|
|
249
|
-
if (!
|
|
249
|
+
if (!isObject(value))
|
|
250
250
|
return undefined;
|
|
251
251
|
const { title } = value;
|
|
252
252
|
return typeof title === 'string' ? title : undefined;
|
|
@@ -319,7 +319,7 @@ function resolveToolErrorMessage(error, fallbackMessage) {
|
|
|
319
319
|
function parseJsonRecord(input) {
|
|
320
320
|
try {
|
|
321
321
|
const parsed = JSON.parse(input);
|
|
322
|
-
return
|
|
322
|
+
return isObject(parsed) ? parsed : undefined;
|
|
323
323
|
}
|
|
324
324
|
catch {
|
|
325
325
|
return undefined;
|
|
@@ -467,7 +467,7 @@ export function withRequestContextIfMissing(handler) {
|
|
|
467
467
|
};
|
|
468
468
|
}
|
|
469
469
|
function resolveRequestIdFromExtra(extra) {
|
|
470
|
-
if (!
|
|
470
|
+
if (!isObject(extra))
|
|
471
471
|
return undefined;
|
|
472
472
|
const { requestId } = extra;
|
|
473
473
|
if (typeof requestId === 'string')
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared types for the transform pipeline.
|
|
3
|
+
* Extracted to avoid circular dependencies between transform modules.
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Metadata block for attaching source information to markdown output.
|
|
7
|
+
*/
|
|
8
|
+
export interface MetadataBlock {
|
|
9
|
+
type: 'metadata';
|
|
10
|
+
title?: string;
|
|
11
|
+
description?: string;
|
|
12
|
+
author?: string;
|
|
13
|
+
url: string;
|
|
14
|
+
fetchedAt: string;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Article extracted by Readability.
|
|
18
|
+
*/
|
|
19
|
+
export interface ExtractedArticle {
|
|
20
|
+
title?: string;
|
|
21
|
+
byline?: string;
|
|
22
|
+
content: string;
|
|
23
|
+
textContent: string;
|
|
24
|
+
excerpt?: string;
|
|
25
|
+
siteName?: string;
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Metadata extracted from HTML meta tags.
|
|
29
|
+
*/
|
|
30
|
+
export interface ExtractedMetadata {
|
|
31
|
+
title?: string;
|
|
32
|
+
description?: string;
|
|
33
|
+
author?: string;
|
|
34
|
+
image?: string;
|
|
35
|
+
publishedAt?: string;
|
|
36
|
+
modifiedAt?: string;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Result of content extraction (article + metadata).
|
|
40
|
+
*/
|
|
41
|
+
export interface ExtractionResult {
|
|
42
|
+
article: ExtractedArticle | null;
|
|
43
|
+
metadata: ExtractedMetadata;
|
|
44
|
+
}
|
|
45
|
+
/**
|
|
46
|
+
* Result of HTML to markdown transformation.
|
|
47
|
+
*/
|
|
48
|
+
export interface MarkdownTransformResult {
|
|
49
|
+
markdown: string;
|
|
50
|
+
title: string | undefined;
|
|
51
|
+
truncated: boolean;
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Options for transform operations.
|
|
55
|
+
*/
|
|
56
|
+
export interface TransformOptions {
|
|
57
|
+
includeMetadata: boolean;
|
|
58
|
+
signal?: AbortSignal;
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* Telemetry event emitted during transform stages.
|
|
62
|
+
*/
|
|
63
|
+
export interface TransformStageEvent {
|
|
64
|
+
v: 1;
|
|
65
|
+
type: 'stage';
|
|
66
|
+
stage: string;
|
|
67
|
+
durationMs: number;
|
|
68
|
+
url: string;
|
|
69
|
+
requestId?: string;
|
|
70
|
+
operationId?: string;
|
|
71
|
+
truncated?: boolean;
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Context for tracking transform stage timing.
|
|
75
|
+
*/
|
|
76
|
+
export interface TransformStageContext {
|
|
77
|
+
readonly stage: string;
|
|
78
|
+
readonly startTime: number;
|
|
79
|
+
readonly url: string;
|
|
80
|
+
}
|
|
81
|
+
//# sourceMappingURL=transform-types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"transform-types.d.ts","sourceRoot":"","sources":["../src/transform-types.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,UAAU,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,gBAAgB,GAAG,IAAI,CAAC;IACjC,QAAQ,EAAE,iBAAiB,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,uBAAuB;IACtC,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC;IAC1B,SAAS,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,eAAe,EAAE,OAAO,CAAC;IACzB,MAAM,CAAC,EAAE,WAAW,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,CAAC,EAAE,CAAC,CAAC;IACL,IAAI,EAAE,OAAO,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;CACtB"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"transform-types.js","sourceRoot":"","sources":["../src/transform-types.ts"],"names":[],"mappings":"AAAA;;;GAGG"}
|
package/dist/transform.d.ts
CHANGED
|
@@ -1,52 +1,8 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
url: string;
|
|
7
|
-
fetchedAt: string;
|
|
8
|
-
}
|
|
9
|
-
export interface ExtractedArticle {
|
|
10
|
-
title?: string;
|
|
11
|
-
byline?: string;
|
|
12
|
-
content: string;
|
|
13
|
-
textContent: string;
|
|
14
|
-
excerpt?: string;
|
|
15
|
-
siteName?: string;
|
|
16
|
-
}
|
|
17
|
-
export interface ExtractedMetadata {
|
|
18
|
-
title?: string;
|
|
19
|
-
description?: string;
|
|
20
|
-
author?: string;
|
|
21
|
-
}
|
|
22
|
-
export interface ExtractionResult {
|
|
23
|
-
article: ExtractedArticle | null;
|
|
24
|
-
metadata: ExtractedMetadata;
|
|
25
|
-
}
|
|
26
|
-
export interface MarkdownTransformResult {
|
|
27
|
-
markdown: string;
|
|
28
|
-
title: string | undefined;
|
|
29
|
-
truncated: boolean;
|
|
30
|
-
}
|
|
31
|
-
export interface TransformOptions {
|
|
32
|
-
includeMetadata: boolean;
|
|
33
|
-
signal?: AbortSignal;
|
|
34
|
-
}
|
|
35
|
-
export interface TransformStageEvent {
|
|
36
|
-
v: 1;
|
|
37
|
-
type: 'stage';
|
|
38
|
-
stage: string;
|
|
39
|
-
durationMs: number;
|
|
40
|
-
url: string;
|
|
41
|
-
requestId?: string;
|
|
42
|
-
operationId?: string;
|
|
43
|
-
truncated?: boolean;
|
|
44
|
-
}
|
|
45
|
-
export interface TransformStageContext {
|
|
46
|
-
readonly stage: string;
|
|
47
|
-
readonly startTime: number;
|
|
48
|
-
readonly url: string;
|
|
49
|
-
}
|
|
1
|
+
import type { ExtractedArticle, ExtractedMetadata, ExtractionResult, MarkdownTransformResult, MetadataBlock, TransformOptions, TransformStageContext } from './transform-types.js';
|
|
2
|
+
export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
|
|
3
|
+
export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
|
|
4
|
+
export { removeNoiseFromHtml } from './dom-noise-removal.js';
|
|
5
|
+
export type { MetadataBlock, ExtractedArticle, ExtractedMetadata, ExtractionResult, MarkdownTransformResult, TransformOptions, TransformStageEvent, TransformStageContext, } from './transform-types.js';
|
|
50
6
|
export declare function startTransformStage(url: string, stage: string): TransformStageContext | null;
|
|
51
7
|
export declare function endTransformStage(context: TransformStageContext | null, options?: {
|
|
52
8
|
truncated?: boolean;
|
|
@@ -55,14 +11,13 @@ export declare function extractContent(html: string, url: string, options?: {
|
|
|
55
11
|
extractArticle?: boolean;
|
|
56
12
|
signal?: AbortSignal;
|
|
57
13
|
}): ExtractionResult;
|
|
58
|
-
export declare function detectLanguageFromCode(code: string): string | undefined;
|
|
59
|
-
export declare function resolveLanguageFromAttributes(className: string, dataLang: string): string | undefined;
|
|
60
14
|
export declare function htmlToMarkdown(html: string, metadata?: MetadataBlock, options?: {
|
|
61
15
|
url?: string;
|
|
62
16
|
signal?: AbortSignal;
|
|
63
17
|
document?: Document;
|
|
18
|
+
skipNoiseRemoval?: boolean;
|
|
64
19
|
}): string;
|
|
65
|
-
export declare function isExtractionSufficient(article: ExtractedArticle | null,
|
|
20
|
+
export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtmlOrDocument: string | Document): boolean;
|
|
66
21
|
export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
|
|
67
22
|
export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
|
|
68
23
|
export declare function transformHtmlToMarkdownInProcess(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
|
package/dist/transform.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../src/transform.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../src/transform.ts"],"names":[],"mappings":"AAqCA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,sBAAsB,CAAC;AAI9B,OAAO,EACL,sBAAsB,EACtB,6BAA6B,GAC9B,MAAM,yBAAyB,CAAC;AAGjC,OAAO,EACL,wBAAwB,EACxB,qBAAqB,GACtB,MAAM,uBAAuB,CAAC;AAG/B,OAAO,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAC;AAG7D,YAAY,EACV,aAAa,EACb,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,gBAAgB,EAChB,mBAAmB,EACnB,qBAAqB,GACtB,MAAM,sBAAsB,CAAC;AAgC9B,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,GACZ,qBAAqB,GAAG,IAAI,CAQ9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,IAAI,CAoBN;AAuOD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA4XD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAyBR;AAgaD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAUT;AA0BD,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AA0UD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAkBzB;AA6ED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAIjE;AA8dD,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAelC"}
|