@j0hanz/superfetch 1.0.6 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +228 -36
- package/dist/config/index.d.ts +10 -5
- package/dist/config/index.d.ts.map +1 -1
- package/dist/config/index.js +73 -19
- package/dist/config/index.js.map +1 -1
- package/dist/config/types.d.ts +98 -57
- package/dist/config/types.d.ts.map +1 -1
- package/dist/errors/app-error.d.ts +4 -28
- package/dist/errors/app-error.d.ts.map +1 -1
- package/dist/errors/app-error.js +10 -51
- package/dist/errors/app-error.js.map +1 -1
- package/dist/index.js +10 -55
- package/dist/index.js.map +1 -1
- package/dist/middleware/error-handler.d.ts +2 -2
- package/dist/middleware/error-handler.d.ts.map +1 -1
- package/dist/middleware/error-handler.js +12 -14
- package/dist/middleware/error-handler.js.map +1 -1
- package/dist/middleware/rate-limiter.d.ts.map +1 -1
- package/dist/middleware/rate-limiter.js +0 -8
- package/dist/middleware/rate-limiter.js.map +1 -1
- package/dist/parsers/base-html-element-parser.d.ts +43 -0
- package/dist/parsers/base-html-element-parser.d.ts.map +1 -0
- package/dist/parsers/base-html-element-parser.js +59 -0
- package/dist/parsers/base-html-element-parser.js.map +1 -0
- package/dist/parsers/heading-element-parser.d.ts +14 -0
- package/dist/parsers/heading-element-parser.d.ts.map +1 -0
- package/dist/parsers/heading-element-parser.js +26 -0
- package/dist/parsers/heading-element-parser.js.map +1 -0
- package/dist/parsers/image-element-parser.d.ts +16 -0
- package/dist/parsers/image-element-parser.d.ts.map +1 -0
- package/dist/parsers/image-element-parser.js +33 -0
- package/dist/parsers/image-element-parser.js.map +1 -0
- package/dist/parsers/link-element-parser.d.ts +15 -0
- package/dist/parsers/link-element-parser.d.ts.map +1 -0
- package/dist/parsers/link-element-parser.js +28 -0
- package/dist/parsers/link-element-parser.js.map +1 -0
- package/dist/parsers/open-graph-parser.d.ts +17 -0
- package/dist/parsers/open-graph-parser.d.ts.map +1 -0
- package/dist/parsers/open-graph-parser.js +41 -0
- package/dist/parsers/open-graph-parser.js.map +1 -0
- package/dist/parsers/schema-org-parser.d.ts +17 -0
- package/dist/parsers/schema-org-parser.d.ts.map +1 -0
- package/dist/parsers/schema-org-parser.js +32 -0
- package/dist/parsers/schema-org-parser.js.map +1 -0
- package/dist/parsers/standard-meta-parser.d.ts +18 -0
- package/dist/parsers/standard-meta-parser.d.ts.map +1 -0
- package/dist/parsers/standard-meta-parser.js +32 -0
- package/dist/parsers/standard-meta-parser.js.map +1 -0
- package/dist/parsers/twitter-card-parser.d.ts +17 -0
- package/dist/parsers/twitter-card-parser.d.ts.map +1 -0
- package/dist/parsers/twitter-card-parser.js +41 -0
- package/dist/parsers/twitter-card-parser.js.map +1 -0
- package/dist/resources/cached-content.d.ts +0 -1
- package/dist/resources/cached-content.d.ts.map +1 -1
- package/dist/resources/cached-content.js +3 -9
- package/dist/resources/cached-content.js.map +1 -1
- package/dist/resources/index.d.ts.map +1 -1
- package/dist/resources/index.js +8 -8
- package/dist/resources/index.js.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +10 -10
- package/dist/server.js.map +1 -1
- package/dist/services/cache.d.ts +0 -28
- package/dist/services/cache.d.ts.map +1 -1
- package/dist/services/cache.js +10 -173
- package/dist/services/cache.js.map +1 -1
- package/dist/services/extractor.d.ts +1 -11
- package/dist/services/extractor.d.ts.map +1 -1
- package/dist/services/extractor.js +86 -84
- package/dist/services/extractor.js.map +1 -1
- package/dist/services/fetcher.d.ts +2 -13
- package/dist/services/fetcher.d.ts.map +1 -1
- package/dist/services/fetcher.js +195 -211
- package/dist/services/fetcher.js.map +1 -1
- package/dist/services/logger.d.ts +5 -4
- package/dist/services/logger.d.ts.map +1 -1
- package/dist/services/logger.js +27 -42
- package/dist/services/logger.js.map +1 -1
- package/dist/services/parser.d.ts.map +1 -1
- package/dist/services/parser.js +35 -26
- package/dist/services/parser.js.map +1 -1
- package/dist/services/session-manager.d.ts +18 -0
- package/dist/services/session-manager.d.ts.map +1 -0
- package/dist/services/session-manager.js +73 -0
- package/dist/services/session-manager.js.map +1 -0
- package/dist/strategies/exponential-backoff-strategy.d.ts +13 -0
- package/dist/strategies/exponential-backoff-strategy.d.ts.map +1 -0
- package/dist/strategies/exponential-backoff-strategy.js +32 -0
- package/dist/strategies/exponential-backoff-strategy.js.map +1 -0
- package/dist/tools/handlers/fetch-links.tool.d.ts +2 -9
- package/dist/tools/handlers/fetch-links.tool.d.ts.map +1 -1
- package/dist/tools/handlers/fetch-links.tool.js +0 -1
- package/dist/tools/handlers/fetch-links.tool.js.map +1 -1
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +5 -2
- package/dist/tools/handlers/fetch-markdown.tool.d.ts.map +1 -1
- package/dist/tools/handlers/fetch-markdown.tool.js +23 -33
- package/dist/tools/handlers/fetch-markdown.tool.js.map +1 -1
- package/dist/tools/handlers/fetch-url.tool.d.ts +2 -9
- package/dist/tools/handlers/fetch-url.tool.d.ts.map +1 -1
- package/dist/tools/handlers/fetch-url.tool.js +15 -20
- package/dist/tools/handlers/fetch-url.tool.js.map +1 -1
- package/dist/tools/handlers/fetch-urls.tool.d.ts +2 -9
- package/dist/tools/handlers/fetch-urls.tool.d.ts.map +1 -1
- package/dist/tools/handlers/fetch-urls.tool.js +124 -105
- package/dist/tools/handlers/fetch-urls.tool.js.map +1 -1
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +0 -4
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/utils/common.d.ts +6 -7
- package/dist/tools/utils/common.d.ts.map +1 -1
- package/dist/tools/utils/common.js +8 -8
- package/dist/tools/utils/common.js.map +1 -1
- package/dist/tools/utils/fetch-pipeline.d.ts +8 -0
- package/dist/tools/utils/fetch-pipeline.d.ts.map +1 -1
- package/dist/tools/utils/fetch-pipeline.js +47 -79
- package/dist/tools/utils/fetch-pipeline.js.map +1 -1
- package/dist/transformers/jsonl.transformer.d.ts +1 -1
- package/dist/transformers/jsonl.transformer.d.ts.map +1 -1
- package/dist/transformers/jsonl.transformer.js +15 -10
- package/dist/transformers/jsonl.transformer.js.map +1 -1
- package/dist/transformers/markdown.transformer.d.ts.map +1 -1
- package/dist/transformers/markdown.transformer.js +58 -62
- package/dist/transformers/markdown.transformer.js.map +1 -1
- package/dist/utils/concurrency.d.ts +2 -5
- package/dist/utils/concurrency.d.ts.map +1 -1
- package/dist/utils/concurrency.js +19 -19
- package/dist/utils/concurrency.js.map +1 -1
- package/dist/utils/content-cleaner.d.ts +0 -25
- package/dist/utils/content-cleaner.d.ts.map +1 -1
- package/dist/utils/content-cleaner.js +12 -187
- package/dist/utils/content-cleaner.js.map +1 -1
- package/dist/utils/html-truncator.d.ts +2 -0
- package/dist/utils/html-truncator.d.ts.map +1 -0
- package/dist/utils/html-truncator.js +14 -0
- package/dist/utils/html-truncator.js.map +1 -0
- package/dist/utils/language-detector.d.ts +0 -3
- package/dist/utils/language-detector.d.ts.map +1 -1
- package/dist/utils/language-detector.js +0 -11
- package/dist/utils/language-detector.js.map +1 -1
- package/dist/utils/sanitizer.d.ts.map +1 -1
- package/dist/utils/sanitizer.js +7 -5
- package/dist/utils/sanitizer.js.map +1 -1
- package/dist/utils/tool-error-handler.d.ts.map +1 -1
- package/dist/utils/tool-error-handler.js +15 -42
- package/dist/utils/tool-error-handler.js.map +1 -1
- package/dist/utils/url-validator.d.ts +0 -6
- package/dist/utils/url-validator.d.ts.map +1 -1
- package/dist/utils/url-validator.js +12 -81
- package/dist/utils/url-validator.js.map +1 -1
- package/package.json +5 -6
|
@@ -2,10 +2,6 @@ import * as cache from '../../services/cache.js';
|
|
|
2
2
|
import { fetchUrlWithRetry } from '../../services/fetcher.js';
|
|
3
3
|
import { logDebug, logWarn } from '../../services/logger.js';
|
|
4
4
|
import { validateAndNormalizeUrl } from '../../utils/url-validator.js';
|
|
5
|
-
/**
|
|
6
|
-
* Safe JSON parse with error handling for cache deserialization.
|
|
7
|
-
* Returns undefined on parse failure, treating it as a cache miss.
|
|
8
|
-
*/
|
|
9
5
|
function safeJsonParse(cached, cacheKey) {
|
|
10
6
|
try {
|
|
11
7
|
return JSON.parse(cached);
|
|
@@ -17,91 +13,63 @@ function safeJsonParse(cached, cacheKey) {
|
|
|
17
13
|
return undefined;
|
|
18
14
|
}
|
|
19
15
|
}
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
const
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
16
|
+
function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalizedUrl) {
|
|
17
|
+
if (!cacheKey)
|
|
18
|
+
return null;
|
|
19
|
+
const cached = cache.get(cacheKey);
|
|
20
|
+
if (!cached)
|
|
21
|
+
return null;
|
|
22
|
+
logDebug('Cache hit', { namespace: cacheNamespace, url: normalizedUrl });
|
|
23
|
+
const data = deserialize
|
|
24
|
+
? deserialize(cached.content)
|
|
25
|
+
: safeJsonParse(cached.content, cacheKey);
|
|
26
|
+
if (data === undefined) {
|
|
27
|
+
logDebug('Cache miss due to deserialize failure', {
|
|
28
|
+
namespace: cacheNamespace,
|
|
29
|
+
url: normalizedUrl,
|
|
30
|
+
});
|
|
31
|
+
return null;
|
|
29
32
|
}
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
+
return {
|
|
34
|
+
data,
|
|
35
|
+
fromCache: true,
|
|
36
|
+
url: normalizedUrl,
|
|
37
|
+
fetchedAt: cached.fetchedAt,
|
|
38
|
+
};
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Unified fetch pipeline that handles caching, fetching, and transformation.
|
|
42
|
+
* Implements cache-first strategy with automatic serialization.
|
|
43
|
+
*
|
|
44
|
+
* @template T - Type of the transformed result
|
|
45
|
+
* @param options - Pipeline configuration options
|
|
46
|
+
* @returns Promise resolving to the pipeline result
|
|
47
|
+
*/
|
|
33
48
|
export async function executeFetchPipeline(options) {
|
|
34
49
|
const { url, cacheNamespace, customHeaders, retries, signal, timeout, transform, serialize = JSON.stringify, deserialize, } = options;
|
|
35
50
|
const normalizedUrl = validateAndNormalizeUrl(url);
|
|
36
51
|
const cacheKey = cache.createCacheKey(cacheNamespace, normalizedUrl);
|
|
37
|
-
|
|
38
|
-
if (
|
|
39
|
-
|
|
40
|
-
if (cached) {
|
|
41
|
-
logDebug('Cache hit', { namespace: cacheNamespace, url: normalizedUrl });
|
|
42
|
-
// Use provided deserializer or safe JSON parse
|
|
43
|
-
const data = deserialize
|
|
44
|
-
? deserialize(cached.content)
|
|
45
|
-
: safeJsonParse(cached.content, cacheKey);
|
|
46
|
-
// If deserialization failed, treat as cache miss
|
|
47
|
-
if (data === undefined) {
|
|
48
|
-
logDebug('Cache miss due to deserialize failure', {
|
|
49
|
-
namespace: cacheNamespace,
|
|
50
|
-
url: normalizedUrl,
|
|
51
|
-
});
|
|
52
|
-
}
|
|
53
|
-
else {
|
|
54
|
-
return {
|
|
55
|
-
data,
|
|
56
|
-
fromCache: true,
|
|
57
|
-
url: normalizedUrl,
|
|
58
|
-
fetchedAt: cached.fetchedAt,
|
|
59
|
-
};
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
// Check for pending request to prevent duplicate fetches
|
|
64
|
-
// Include custom headers hash to ensure requests with different headers aren't deduplicated
|
|
65
|
-
const headersKey = customHeaders ? JSON.stringify(customHeaders) : '';
|
|
66
|
-
const dedupeKey = `${cacheNamespace}:${normalizedUrl}:${headersKey}`;
|
|
67
|
-
const pending = pendingRequests.get(dedupeKey);
|
|
68
|
-
if (pending) {
|
|
69
|
-
logDebug('Request deduplication hit', { url: normalizedUrl });
|
|
70
|
-
return pending.promise;
|
|
52
|
+
const cachedResult = attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalizedUrl);
|
|
53
|
+
if (cachedResult) {
|
|
54
|
+
return cachedResult;
|
|
71
55
|
}
|
|
72
|
-
// Build fetch options
|
|
73
56
|
const fetchOptions = {
|
|
74
57
|
customHeaders,
|
|
75
58
|
signal,
|
|
76
59
|
timeout,
|
|
77
60
|
};
|
|
78
|
-
|
|
79
|
-
const
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
fromCache: false,
|
|
92
|
-
url: normalizedUrl,
|
|
93
|
-
fetchedAt: new Date().toISOString(),
|
|
94
|
-
};
|
|
95
|
-
}
|
|
96
|
-
finally {
|
|
97
|
-
// Clean up pending request
|
|
98
|
-
pendingRequests.delete(dedupeKey);
|
|
99
|
-
}
|
|
100
|
-
})();
|
|
101
|
-
pendingRequests.set(dedupeKey, {
|
|
102
|
-
promise: request,
|
|
103
|
-
timestamp: Date.now(),
|
|
104
|
-
});
|
|
105
|
-
return request;
|
|
61
|
+
logDebug('Fetching URL', { url: normalizedUrl, retries });
|
|
62
|
+
const html = await fetchUrlWithRetry(normalizedUrl, fetchOptions, retries);
|
|
63
|
+
const data = transform(html, normalizedUrl);
|
|
64
|
+
if (cacheKey) {
|
|
65
|
+
const serialized = serialize(data);
|
|
66
|
+
cache.set(cacheKey, serialized);
|
|
67
|
+
}
|
|
68
|
+
return {
|
|
69
|
+
data,
|
|
70
|
+
fromCache: false,
|
|
71
|
+
url: normalizedUrl,
|
|
72
|
+
fetchedAt: new Date().toISOString(),
|
|
73
|
+
};
|
|
106
74
|
}
|
|
107
75
|
//# sourceMappingURL=fetch-pipeline.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fetch-pipeline.js","sourceRoot":"","sources":["../../../src/tools/utils/fetch-pipeline.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"fetch-pipeline.js","sourceRoot":"","sources":["../../../src/tools/utils/fetch-pipeline.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,KAAK,MAAM,yBAAyB,CAAC;AACjD,OAAO,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAC;AAC9D,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,0BAA0B,CAAC;AAE7D,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAC;AAEvE,SAAS,aAAa,CAAC,MAAc,EAAE,QAAgB;IACrD,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC5B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,CAAC,4CAA4C,EAAE;YACpD,GAAG,EAAE,QAAQ,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC;SAChC,CAAC,CAAC;QACH,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC;AAED,SAAS,qBAAqB,CAC5B,QAAuB,EACvB,WAAgD,EAChD,cAAsB,EACtB,aAAqB;IAErB,IAAI,CAAC,QAAQ;QAAE,OAAO,IAAI,CAAC;IAE3B,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IACnC,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAEzB,QAAQ,CAAC,WAAW,EAAE,EAAE,SAAS,EAAE,cAAc,EAAE,GAAG,EAAE,aAAa,EAAE,CAAC,CAAC;IAEzE,MAAM,IAAI,GAAG,WAAW;QACtB,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,OAAO,CAAC;QAC7B,CAAC,CAAE,aAAa,CAAC,MAAM,CAAC,OAAO,EAAE,QAAQ,CAAmB,CAAC;IAE/D,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;QACvB,QAAQ,CAAC,uCAAuC,EAAE;YAChD,SAAS,EAAE,cAAc;YACzB,GAAG,EAAE,aAAa;SACnB,CAAC,CAAC;QACH,OAAO,IAAI,CAAC;IACd,CAAC;IAED,OAAO;QACL,IAAI;QACJ,SAAS,EAAE,IAAI;QACf,GAAG,EAAE,aAAa;QAClB,SAAS,EAAE,MAAM,CAAC,SAAS;KAC5B,CAAC;AACJ,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,OAAgC;IAEhC,MAAM,EACJ,GAAG,EACH,cAAc,EACd,aAAa,EACb,OAAO,EACP,MAAM,EACN,OAAO,EACP,SAAS,EACT,SAAS,GAAG,IAAI,CAAC,SAAS,EAC1B,WAAW,GACZ,GAAG,OAAO,CAAC;IAEZ,MAAM,aAAa,GAAG,uBAAuB,CAAC,GAAG,CAAC,CAAC;IACnD,MAAM,QAAQ,GAAG,KAAK,CAAC,cAAc,CAAC,cAAc,EAAE,aAAa,CAAC,CAAC;IAErE,MAAM,YAAY,GAAG,qBAAqB,CACxC,QAAQ,EACR,WAAW,EACX,cAAc,EACd,aAAa,CACd,CAAC;IAEF,IAAI,YAAY,EAAE,CAAC;QACjB,OAAO,YAAY,CAAC;IACtB,CAAC;IAED,MAAM,YAAY,GAAiB;QACjC,aAAa;QACb,MAAM;QACN,OAAO;KACR,CAAC;IAEF,QAAQ,CAAC,cAAc,EAAE,EAAE,GAAG,EAAE,aAAa,EAAE,OAAO,EAAE,CAAC,CAAC;IAE1D,MAAM,IAAI,GAAG,MAAM,iBAAiB,CAAC,aAAa,EAAE,YAAY,EAAE,OAAO,CAAC,CAAC;IAC3E,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,EAAE,aAAa,CAAC,CAAC;IAE5C,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,UAAU,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QACnC,KAAK,CAAC,GAAG,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAClC,CAAC;IAED,OAAO;QACL,IAAI;QACJ,SAAS,EAAE,KAAK;QAChB,GAAG,EAAE,aAAa;QAClB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;KACpC,CAAC;AACJ,CAAC"}
|
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
import type { ContentBlockUnion, MetadataBlock } from '../config/types.js';
|
|
2
|
-
export declare function toJsonl(blocks: ContentBlockUnion[], metadata?: MetadataBlock): string;
|
|
2
|
+
export declare function toJsonl(blocks: readonly ContentBlockUnion[], metadata?: MetadataBlock): string;
|
|
3
3
|
//# sourceMappingURL=jsonl.transformer.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"jsonl.transformer.d.ts","sourceRoot":"","sources":["../../src/transformers/jsonl.transformer.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;
|
|
1
|
+
{"version":3,"file":"jsonl.transformer.d.ts","sourceRoot":"","sources":["../../src/transformers/jsonl.transformer.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AAuC3E,wBAAgB,OAAO,CACrB,MAAM,EAAE,SAAS,iBAAiB,EAAE,EACpC,QAAQ,CAAC,EAAE,aAAa,GACvB,MAAM,CAwBR"}
|
|
@@ -12,35 +12,40 @@ function truncateBlock(block) {
|
|
|
12
12
|
}
|
|
13
13
|
case 'list': {
|
|
14
14
|
const truncatedItems = block.items.map((item) => truncateText(item, maxLength));
|
|
15
|
-
const hasChanges = truncatedItems.some((item,
|
|
15
|
+
const hasChanges = truncatedItems.some((item, index) => item !== block.items[index]);
|
|
16
16
|
return hasChanges ? { ...block, items: truncatedItems } : block;
|
|
17
17
|
}
|
|
18
18
|
default:
|
|
19
19
|
return block;
|
|
20
20
|
}
|
|
21
21
|
}
|
|
22
|
+
function serializeBlock(block) {
|
|
23
|
+
try {
|
|
24
|
+
return JSON.stringify(truncateBlock(block));
|
|
25
|
+
}
|
|
26
|
+
catch {
|
|
27
|
+
return null;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
22
30
|
export function toJsonl(blocks, metadata) {
|
|
23
31
|
const lines = [];
|
|
24
|
-
// Minimal metadata - just title and URL for context
|
|
25
32
|
if (metadata) {
|
|
26
33
|
try {
|
|
27
|
-
const
|
|
34
|
+
const minimalMetadata = {
|
|
28
35
|
type: metadata.type,
|
|
29
36
|
title: metadata.title,
|
|
30
37
|
url: metadata.url,
|
|
31
38
|
};
|
|
32
|
-
lines.push(JSON.stringify(
|
|
39
|
+
lines.push(JSON.stringify(minimalMetadata));
|
|
33
40
|
}
|
|
34
41
|
catch {
|
|
35
|
-
|
|
42
|
+
/* skip */
|
|
36
43
|
}
|
|
37
44
|
}
|
|
38
45
|
for (const block of blocks) {
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
catch {
|
|
43
|
-
// Skip blocks that fail to serialize
|
|
46
|
+
const serialized = serializeBlock(block);
|
|
47
|
+
if (serialized) {
|
|
48
|
+
lines.push(serialized);
|
|
44
49
|
}
|
|
45
50
|
}
|
|
46
51
|
return lines.join('\n');
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"jsonl.transformer.js","sourceRoot":"","sources":["../../src/transformers/jsonl.transformer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAG5C,OAAO,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAC;AAErD,SAAS,aAAa,CAAC,KAAwB;IAC7C,MAAM,SAAS,GAAG,MAAM,CAAC,UAAU,CAAC,cAAc,CAAC;IAEnD,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;QACnB,KAAK,WAAW,CAAC;QACjB,KAAK,SAAS,CAAC;QACf,KAAK,MAAM,CAAC;QACZ,KAAK,YAAY,CAAC,CAAC,CAAC;YAClB,MAAM,SAAS,GAAG,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;YACtD,OAAO,SAAS,KAAK,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,GAAG,KAAK,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC;QAC1E,CAAC;
|
|
1
|
+
{"version":3,"file":"jsonl.transformer.js","sourceRoot":"","sources":["../../src/transformers/jsonl.transformer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAG5C,OAAO,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAC;AAErD,SAAS,aAAa,CAAC,KAAwB;IAC7C,MAAM,SAAS,GAAG,MAAM,CAAC,UAAU,CAAC,cAAc,CAAC;IAEnD,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;QACnB,KAAK,WAAW,CAAC;QACjB,KAAK,SAAS,CAAC;QACf,KAAK,MAAM,CAAC;QACZ,KAAK,YAAY,CAAC,CAAC,CAAC;YAClB,MAAM,SAAS,GAAG,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;YACtD,OAAO,SAAS,KAAK,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,GAAG,KAAK,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC;QAC1E,CAAC;QAED,KAAK,MAAM,CAAC,CAAC,CAAC;YACZ,MAAM,cAAc,GAAG,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAC9C,YAAY,CAAC,IAAI,EAAE,SAAS,CAAC,CAC9B,CAAC;YACF,MAAM,UAAU,GAAG,cAAc,CAAC,IAAI,CACpC,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,CAC7C,CAAC;YACF,OAAO,UAAU,CAAC,CAAC,CAAC,EAAE,GAAG,KAAK,EAAE,KAAK,EAAE,cAAc,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;QAClE,CAAC;QAED;YACE,OAAO,KAAK,CAAC;IACjB,CAAC;AACH,CAAC;AAED,SAAS,cAAc,CAAC,KAAwB;IAC9C,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC,CAAC;IAC9C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,MAAM,UAAU,OAAO,CACrB,MAAoC,EACpC,QAAwB;IAExB,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,IAAI,QAAQ,EAAE,CAAC;QACb,IAAI,CAAC;YACH,MAAM,eAAe,GAAG;gBACtB,IAAI,EAAE,QAAQ,CAAC,IAAI;gBACnB,KAAK,EAAE,QAAQ,CAAC,KAAK;gBACrB,GAAG,EAAE,QAAQ,CAAC,GAAG;aAClB,CAAC;YACF,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC,CAAC;QAC9C,CAAC;QAAC,MAAM,CAAC;YACP,UAAU;QACZ,CAAC;IACH,CAAC;IAED,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,MAAM,UAAU,GAAG,cAAc,CAAC,KAAK,CAAC,CAAC;QACzC,IAAI,UAAU,EAAE,CAAC;YACf,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACzB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"markdown.transformer.d.ts","sourceRoot":"","sources":["../../src/transformers/markdown.transformer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;
|
|
1
|
+
{"version":3,"file":"markdown.transformer.d.ts","sourceRoot":"","sources":["../../src/transformers/markdown.transformer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AAwKxD,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,aAAa,GAAG,MAAM,CAiB7E"}
|
|
@@ -1,7 +1,28 @@
|
|
|
1
1
|
import TurndownService from 'turndown';
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
2
|
+
function detectLanguageFromCode(code) {
|
|
3
|
+
const patterns = [
|
|
4
|
+
[
|
|
5
|
+
/^\s*import\s+.*\s+from\s+['"]react['"]|<[A-Z][a-zA-Z]*[\s/>]|jsx\s*:|className=/m,
|
|
6
|
+
'jsx',
|
|
7
|
+
],
|
|
8
|
+
[
|
|
9
|
+
/:\s*(string|number|boolean|void|any|unknown|never)\b|interface\s+\w+|type\s+\w+\s*=/m,
|
|
10
|
+
'typescript',
|
|
11
|
+
],
|
|
12
|
+
[/^\s*(fn|let\s+mut|impl|struct|enum|use\s+\w+::)/m, 'rust'],
|
|
13
|
+
[
|
|
14
|
+
/^\s*(export|const|let|var|function|class|async|await)\b|^\s*import\s+.*['"]]/m,
|
|
15
|
+
'javascript',
|
|
16
|
+
],
|
|
17
|
+
[/^\s*(def|class|import|from|if __name__|print\()/m, 'python'],
|
|
18
|
+
[/^\s*(npm|yarn|pnpm|npx)\s+(install|add|run|build|start)/m, 'bash'],
|
|
19
|
+
[/^\s*[.#@]?[\w-]+\s*\{[^}]*\}|@media|@import/m, 'css'],
|
|
20
|
+
[/^\s*<(!DOCTYPE|html|head|body|div)\b/im, 'html'],
|
|
21
|
+
[/^\s*\{\s*"|^\s*\[\s*(")/m, 'json'],
|
|
22
|
+
[/^\s*(SELECT|INSERT|UPDATE|DELETE|CREATE)\s+/im, 'sql'],
|
|
23
|
+
];
|
|
24
|
+
return patterns.find(([pattern]) => pattern.test(code))?.[1];
|
|
25
|
+
}
|
|
5
26
|
const NOISE_LINE_PATTERNS = [
|
|
6
27
|
// Single letters or panel labels (common in code examples)
|
|
7
28
|
/^[A-Z]$/,
|
|
@@ -12,52 +33,32 @@ const NOISE_LINE_PATTERNS = [
|
|
|
12
33
|
/^\[\d+\]$/,
|
|
13
34
|
/^\(\d+\)$/,
|
|
14
35
|
];
|
|
15
|
-
|
|
16
|
-
* Check if a line is noise that should be removed
|
|
17
|
-
*/
|
|
36
|
+
const MULTIPLE_NEWLINES = /\n{3,}/g;
|
|
18
37
|
function isNoiseLine(line) {
|
|
19
38
|
const trimmed = line.trim();
|
|
20
|
-
// Empty lines are fine
|
|
21
39
|
if (!trimmed)
|
|
22
40
|
return false;
|
|
23
|
-
|
|
24
|
-
if (trimmed.startsWith(
|
|
25
|
-
trimmed.startsWith('-') ||
|
|
26
|
-
trimmed.startsWith('*') ||
|
|
27
|
-
trimmed.startsWith('`') ||
|
|
28
|
-
trimmed.startsWith('>') ||
|
|
29
|
-
trimmed.startsWith('|')) {
|
|
41
|
+
const markdownPrefixes = ['#', '-', '*', '`', '>', '|'];
|
|
42
|
+
if (markdownPrefixes.some((prefix) => trimmed.startsWith(prefix))) {
|
|
30
43
|
return false;
|
|
31
44
|
}
|
|
32
|
-
|
|
33
|
-
for (const pattern of NOISE_LINE_PATTERNS) {
|
|
34
|
-
if (pattern.test(trimmed)) {
|
|
35
|
-
return true;
|
|
36
|
-
}
|
|
37
|
-
}
|
|
38
|
-
return false;
|
|
45
|
+
return NOISE_LINE_PATTERNS.some((pattern) => pattern.test(trimmed));
|
|
39
46
|
}
|
|
40
|
-
|
|
41
|
-
* Post-process markdown to remove noise lines
|
|
42
|
-
*/
|
|
47
|
+
const CODE_FENCE = '```';
|
|
43
48
|
function cleanMarkdownContent(markdown) {
|
|
44
|
-
// Split by lines but preserve code blocks
|
|
45
49
|
const lines = markdown.split('\n');
|
|
46
50
|
const cleanedLines = [];
|
|
47
|
-
let
|
|
51
|
+
let insideCodeBlock = false;
|
|
48
52
|
for (const line of lines) {
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
inCodeBlock = !inCodeBlock;
|
|
53
|
+
if (line.trim().startsWith(CODE_FENCE)) {
|
|
54
|
+
insideCodeBlock = !insideCodeBlock;
|
|
52
55
|
cleanedLines.push(line);
|
|
53
56
|
continue;
|
|
54
57
|
}
|
|
55
|
-
|
|
56
|
-
if (inCodeBlock) {
|
|
58
|
+
if (insideCodeBlock) {
|
|
57
59
|
cleanedLines.push(line);
|
|
58
60
|
continue;
|
|
59
61
|
}
|
|
60
|
-
// Filter noise lines outside code blocks
|
|
61
62
|
if (!isNoiseLine(line)) {
|
|
62
63
|
cleanedLines.push(line);
|
|
63
64
|
}
|
|
@@ -70,12 +71,10 @@ const turndown = new TurndownService({
|
|
|
70
71
|
emDelimiter: '_',
|
|
71
72
|
bulletListMarker: '-',
|
|
72
73
|
});
|
|
73
|
-
// Remove noise elements
|
|
74
74
|
turndown.addRule('removeNoise', {
|
|
75
75
|
filter: ['script', 'style', 'noscript', 'nav', 'footer', 'aside', 'iframe'],
|
|
76
76
|
replacement: () => '',
|
|
77
77
|
});
|
|
78
|
-
// Enhanced code block handling with language detection
|
|
79
78
|
turndown.addRule('fencedCodeBlockWithLanguage', {
|
|
80
79
|
filter: (node, options) => {
|
|
81
80
|
return (options.codeBlockStyle === 'fenced' &&
|
|
@@ -86,70 +85,67 @@ turndown.addRule('fencedCodeBlockWithLanguage', {
|
|
|
86
85
|
replacement: (_content, node) => {
|
|
87
86
|
const codeNode = node.firstChild;
|
|
88
87
|
const code = codeNode.textContent || '';
|
|
89
|
-
// Try to get language from class
|
|
90
88
|
const className = codeNode.getAttribute('class') ?? '';
|
|
91
89
|
const dataLang = codeNode.getAttribute('data-language') ?? '';
|
|
92
90
|
const languageMatch = /language-(\w+)/.exec(className) ??
|
|
93
91
|
/lang-(\w+)/.exec(className) ??
|
|
94
92
|
/highlight-(\w+)/.exec(className) ??
|
|
95
93
|
/^(\w+)$/.exec(dataLang);
|
|
96
|
-
|
|
97
|
-
const language = languageMatch?.[1] ?? detectLanguage(code) ?? '';
|
|
94
|
+
const language = languageMatch?.[1] ?? detectLanguageFromCode(code) ?? '';
|
|
98
95
|
return `\n\n\`\`\`${language}\n${code.replace(/\n$/, '')}\n\`\`\`\n\n`;
|
|
99
96
|
},
|
|
100
97
|
});
|
|
101
|
-
// Pre-compiled regex patterns
|
|
102
98
|
const YAML_SPECIAL_CHARS = /[:[\]{}"\n\r\t'|>&*!?,#]/;
|
|
103
99
|
const YAML_NUMERIC = /^[\d.]+$/;
|
|
104
100
|
const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off)$/i;
|
|
105
|
-
const
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
101
|
+
const ESCAPE_PATTERNS = {
|
|
102
|
+
backslash: /\\/g,
|
|
103
|
+
quote: /"/g,
|
|
104
|
+
newline: /\n/g,
|
|
105
|
+
tab: /\t/g,
|
|
106
|
+
};
|
|
110
107
|
function escapeYamlValue(value) {
|
|
111
|
-
const
|
|
108
|
+
const requiresQuoting = YAML_SPECIAL_CHARS.test(value) ||
|
|
112
109
|
value.startsWith(' ') ||
|
|
113
110
|
value.endsWith(' ') ||
|
|
114
111
|
value === '' ||
|
|
115
112
|
YAML_NUMERIC.test(value) ||
|
|
116
113
|
YAML_RESERVED_WORDS.test(value);
|
|
117
|
-
if (!
|
|
114
|
+
if (!requiresQuoting) {
|
|
118
115
|
return value;
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
.replace(
|
|
122
|
-
.replace(
|
|
123
|
-
.replace(
|
|
116
|
+
}
|
|
117
|
+
const escaped = value
|
|
118
|
+
.replace(ESCAPE_PATTERNS.backslash, '\\\\')
|
|
119
|
+
.replace(ESCAPE_PATTERNS.quote, '\\"')
|
|
120
|
+
.replace(ESCAPE_PATTERNS.newline, '\\n')
|
|
121
|
+
.replace(ESCAPE_PATTERNS.tab, '\\t');
|
|
122
|
+
return `"${escaped}"`;
|
|
124
123
|
}
|
|
125
124
|
function createFrontmatter(metadata) {
|
|
126
125
|
const lines = ['---'];
|
|
127
|
-
if (metadata.title)
|
|
126
|
+
if (metadata.title) {
|
|
128
127
|
lines.push(`title: ${escapeYamlValue(metadata.title)}`);
|
|
129
|
-
|
|
128
|
+
}
|
|
129
|
+
if (metadata.url) {
|
|
130
130
|
lines.push(`source: ${escapeYamlValue(metadata.url)}`);
|
|
131
|
+
}
|
|
131
132
|
lines.push('---');
|
|
132
133
|
return lines.join('\n');
|
|
133
134
|
}
|
|
134
135
|
export function htmlToMarkdown(html, metadata) {
|
|
136
|
+
const frontmatter = metadata ? createFrontmatter(metadata) : '';
|
|
135
137
|
if (!html || typeof html !== 'string') {
|
|
136
|
-
return
|
|
138
|
+
return frontmatter ? `${frontmatter}\n\n` : '';
|
|
137
139
|
}
|
|
138
|
-
let content = '';
|
|
139
140
|
try {
|
|
140
|
-
content = turndown.turndown(html);
|
|
141
|
+
let content = turndown.turndown(html);
|
|
141
142
|
content = content.replace(MULTIPLE_NEWLINES, '\n\n').trim();
|
|
142
|
-
// Clean up noise lines from the markdown
|
|
143
143
|
content = cleanMarkdownContent(content);
|
|
144
|
-
// Final cleanup of multiple newlines after removing noise
|
|
145
144
|
content = content.replace(MULTIPLE_NEWLINES, '\n\n').trim();
|
|
145
|
+
return frontmatter ? `${frontmatter}\n\n${content}` : content;
|
|
146
146
|
}
|
|
147
147
|
catch {
|
|
148
|
-
return
|
|
149
|
-
}
|
|
150
|
-
if (metadata) {
|
|
151
|
-
return `${createFrontmatter(metadata)}\n\n${content}`;
|
|
148
|
+
return frontmatter ? `${frontmatter}\n\n` : '';
|
|
152
149
|
}
|
|
153
|
-
return content;
|
|
154
150
|
}
|
|
155
151
|
//# sourceMappingURL=markdown.transformer.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"markdown.transformer.js","sourceRoot":"","sources":["../../src/transformers/markdown.transformer.ts"],"names":[],"mappings":"AAAA,OAAO,eAAe,MAAM,UAAU,CAAC;AAIvC,
|
|
1
|
+
{"version":3,"file":"markdown.transformer.js","sourceRoot":"","sources":["../../src/transformers/markdown.transformer.ts"],"names":[],"mappings":"AAAA,OAAO,eAAe,MAAM,UAAU,CAAC;AAIvC,SAAS,sBAAsB,CAAC,IAAY;IAC1C,MAAM,QAAQ,GAAgC;QAC5C;YACE,kFAAkF;YAClF,KAAK;SACN;QACD;YACE,sFAAsF;YACtF,YAAY;SACb;QACD,CAAC,kDAAkD,EAAE,MAAM,CAAC;QAC5D;YACE,+EAA+E;YAC/E,YAAY;SACb;QACD,CAAC,kDAAkD,EAAE,QAAQ,CAAC;QAC9D,CAAC,0DAA0D,EAAE,MAAM,CAAC;QACpE,CAAC,8CAA8C,EAAE,KAAK,CAAC;QACvD,CAAC,wCAAwC,EAAE,MAAM,CAAC;QAClD,CAAC,0BAA0B,EAAE,MAAM,CAAC;QACpC,CAAC,+CAA+C,EAAE,KAAK,CAAC;KACzD,CAAC;IACF,OAAO,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;AAC/D,CAAC;AAED,MAAM,mBAAmB,GAAsB;IAC7C,2DAA2D;IAC3D,SAAS;IACT,kBAAkB;IAElB,mEAAmE;IACnE,8BAA8B;IAC9B,cAAc;IACd,WAAW;IACX,WAAW;CACH,CAAC;AAEX,MAAM,iBAAiB,GAAG,SAAS,CAAC;AAEpC,SAAS,WAAW,CAAC,IAAY;IAC/B,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAE5B,IAAI,CAAC,OAAO;QAAE,OAAO,KAAK,CAAC;IAE3B,MAAM,gBAAgB,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;IACxD,IAAI,gBAAgB,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,OAAO,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC;QAClE,OAAO,KAAK,CAAC;IACf,CAAC;IAED,OAAO,mBAAmB,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC;AACtE,CAAC;AAED,MAAM,UAAU,GAAG,KAAK,CAAC;AAEzB,SAAS,oBAAoB,CAAC,QAAgB;IAC5C,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,YAAY,GAAa,EAAE,CAAC;IAClC,IAAI,eAAe,GAAG,KAAK,CAAC;IAE5B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YACvC,eAAe,GAAG,CAAC,eAAe,CAAC;YACnC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACxB,SAAS;QACX,CAAC;QAED,IAAI,eAAe,EAAE,CAAC;YACpB,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACxB,SAAS;QACX,CAAC;QAED,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,EAAE,CAAC;YACvB,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC;IACH,CAAC;IAED,OAAO,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC;AAED,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC;IACnC,YAAY,EAAE,KAAK;IACnB,cAAc,EAAE,QAAQ;IACxB,WAAW,EAAE,GAAG;IAChB,gBAAgB,EAAE,GAAG;CACtB,CAAC,CAAC;AAEH,QAAQ,CAAC,OAAO,CAAC,aAAa,EAAE;IAC9B,MAAM,EAAE,CAAC,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,CAAC;IAC3E,WAAW,EAAE,GAAG,EAAE,CAAC,EAAE;CACtB,CAAC,CAAC;AAEH,QAAQ,CAAC,OAAO,CAAC,6BAA6B,EAAE;IAC9C,MAAM,EAAE,CAAC,IAAI,EAAE,OAAO,EAAE,EAAE;QACxB,OAAO,CACL,OAAO,CAAC,cAAc,KAAK,QAAQ;YACnC,IAAI,CAAC,QAAQ,KAAK,KAAK;YACvB,IAAI,CAAC,UAAU,KAAK,IAAI;YACxB,IAAI,CAAC,UAAU,CAAC,QAAQ,KAAK,MAAM,CACpC,CAAC;IACJ,CAAC;IACD,WAAW,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE;QAC9B,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAyB,CAAC;QAChD,MAAM,IAAI,GAAG,QAAQ,CAAC,WAAW,IAAI,EAAE,CAAC;QAExC,MAAM,SAAS,GAAG,QAAQ,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;QACvD,MAAM,QAAQ,GAAG,QAAQ,CAAC,YAAY,CAAC,eAAe,CAAC,IAAI,EAAE,CAAC;QAE9D,MAAM,aAAa,GACjB,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC;YAChC,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC;YAC5B,iBAAiB,CAAC,IAAI,CAAC,SAAS,CAAC;YACjC,SAAS,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAE3B,MAAM,QAAQ,GAAG,aAAa,EAAE,CAAC,CAAC,CAAC,IAAI,sBAAsB,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;QAE1E,OAAO,aAAa,QAAQ,KAAK,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,cAAc,CAAC;IACzE,CAAC;CACF,CAAC,CAAC;AAEH,MAAM,kBAAkB,GAAG,0BAA0B,CAAC;AACtD,MAAM,YAAY,GAAG,UAAU,CAAC;AAChC,MAAM,mBAAmB,GAAG,oCAAoC,CAAC;AAEjE,MAAM,eAAe,GAAG;IACtB,SAAS,EAAE,KAAK;IAChB,KAAK,EAAE,IAAI;IACX,OAAO,EAAE,KAAK;IACd,GAAG,EAAE,KAAK;CACF,CAAC;AAEX,SAAS,eAAe,CAAC,KAAa;IACpC,MAAM,eAAe,GACnB,kBAAkB,CAAC,IAAI,CAAC,KAAK,CAAC;QAC9B,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC;QACrB,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC;QACnB,KAAK,KAAK,EAAE;QACZ,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC;QACxB,mBAAmB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAElC,IAAI,CAAC,eAAe,EAAE,CAAC;QACrB,OAAO,KAAK,CAAC;IACf,CAAC;IAED,MAAM,OAAO,GAAG,KAAK;SAClB,OAAO,CAAC,eAAe,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1C,OAAO,CAAC,eAAe,CAAC,KAAK,EAAE,KAAK,CAAC;SACrC,OAAO,CAAC,eAAe,CAAC,OAAO,EAAE,KAAK,CAAC;SACvC,OAAO,CAAC,eAAe,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IAEvC,OAAO,IAAI,OAAO,GAAG,CAAC;AACxB,CAAC;AAED,SAAS,iBAAiB,CAAC,QAAuB;IAChD,MAAM,KAAK,GAAG,CAAC,KAAK,CAAC,CAAC;IAEtB,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QACnB,KAAK,CAAC,IAAI,CAAC,UAAU,eAAe,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;IAC1D,CAAC;IACD,IAAI,QAAQ,CAAC,GAAG,EAAE,CAAC;QACjB,KAAK,CAAC,IAAI,CAAC,WAAW,eAAe,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACzD,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAClB,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,IAAY,EAAE,QAAwB;IACnE,MAAM,WAAW,GAAG,QAAQ,CAAC,CAAC,CAAC,iBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAEhE,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE,CAAC;QACtC,OAAO,WAAW,CAAC,CAAC,CAAC,GAAG,WAAW,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;IACjD,CAAC;IAED,IAAI,CAAC;QACH,IAAI,OAAO,GAAG,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACtC,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,iBAAiB,EAAE,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5D,OAAO,GAAG,oBAAoB,CAAC,OAAO,CAAC,CAAC;QACxC,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,iBAAiB,EAAE,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;QAE5D,OAAO,WAAW,CAAC,CAAC,CAAC,GAAG,WAAW,OAAO,OAAO,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;IAChE,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,WAAW,CAAC,CAAC,CAAC,GAAG,WAAW,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;IACjD,CAAC;AACH,CAAC"}
|
|
@@ -1,6 +1,3 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
}
|
|
4
|
-
export declare function runWithConcurrency<T>(limit: number, tasks: (() => Promise<T>)[], options?: ConcurrencyOptions): Promise<PromiseSettledResult<T>[]>;
|
|
5
|
-
export {};
|
|
1
|
+
import type { ConcurrencyExecutionOptions } from '../config/types.js';
|
|
2
|
+
export declare function runWithConcurrency<T>(limit: number, tasks: readonly (() => Promise<T>)[], options?: ConcurrencyExecutionOptions): Promise<PromiseSettledResult<T>[]>;
|
|
6
3
|
//# sourceMappingURL=concurrency.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"concurrency.d.ts","sourceRoot":"","sources":["../../src/utils/concurrency.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"concurrency.d.ts","sourceRoot":"","sources":["../../src/utils/concurrency.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,2BAA2B,EAE5B,MAAM,oBAAoB,CAAC;AA8B5B,wBAAsB,kBAAkB,CAAC,CAAC,EACxC,KAAK,EAAE,MAAM,EACb,KAAK,EAAE,SAAS,CAAC,MAAM,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,EACpC,OAAO,CAAC,EAAE,2BAA2B,GACpC,OAAO,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,CAAC,CAepC"}
|
|
@@ -1,38 +1,38 @@
|
|
|
1
|
+
const MAX_CONCURRENCY_LIMIT = 10;
|
|
2
|
+
const MIN_CONCURRENCY = 1;
|
|
1
3
|
function createConcurrencyLimiter(limit) {
|
|
2
|
-
const maxConcurrency = Math.min(Math.max(
|
|
3
|
-
let
|
|
4
|
-
const
|
|
5
|
-
return async (
|
|
6
|
-
while (
|
|
7
|
-
await new Promise((resolve) =>
|
|
4
|
+
const maxConcurrency = Math.min(Math.max(MIN_CONCURRENCY, limit), MAX_CONCURRENCY_LIMIT);
|
|
5
|
+
let activeCount = 0;
|
|
6
|
+
const waitingQueue = [];
|
|
7
|
+
return async (task) => {
|
|
8
|
+
while (activeCount >= maxConcurrency) {
|
|
9
|
+
await new Promise((resolve) => waitingQueue.push(resolve));
|
|
8
10
|
}
|
|
9
|
-
|
|
11
|
+
activeCount++;
|
|
10
12
|
try {
|
|
11
|
-
return await
|
|
13
|
+
return await task();
|
|
12
14
|
}
|
|
13
15
|
finally {
|
|
14
|
-
|
|
15
|
-
const
|
|
16
|
-
if (
|
|
17
|
-
|
|
16
|
+
activeCount--;
|
|
17
|
+
const nextWaiting = waitingQueue.shift();
|
|
18
|
+
if (nextWaiting)
|
|
19
|
+
nextWaiting();
|
|
18
20
|
}
|
|
19
21
|
};
|
|
20
22
|
}
|
|
21
23
|
export async function runWithConcurrency(limit, tasks, options) {
|
|
22
24
|
const limiter = createConcurrencyLimiter(limit);
|
|
23
|
-
const
|
|
24
|
-
let
|
|
25
|
+
const totalTasks = tasks.length;
|
|
26
|
+
let completedCount = 0;
|
|
25
27
|
const wrappedTasks = tasks.map((task) => async () => {
|
|
26
28
|
try {
|
|
27
29
|
return await limiter(task);
|
|
28
30
|
}
|
|
29
31
|
finally {
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
options.onProgress(completed, total);
|
|
33
|
-
}
|
|
32
|
+
completedCount++;
|
|
33
|
+
options?.onProgress?.(completedCount, totalTasks);
|
|
34
34
|
}
|
|
35
35
|
});
|
|
36
|
-
return Promise.allSettled(wrappedTasks.map(
|
|
36
|
+
return Promise.allSettled(wrappedTasks.map((task) => task()));
|
|
37
37
|
}
|
|
38
38
|
//# sourceMappingURL=concurrency.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"concurrency.js","sourceRoot":"","sources":["../../src/utils/concurrency.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"concurrency.js","sourceRoot":"","sources":["../../src/utils/concurrency.ts"],"names":[],"mappings":"AAKA,MAAM,qBAAqB,GAAG,EAAE,CAAC;AACjC,MAAM,eAAe,GAAG,CAAC,CAAC;AAE1B,SAAS,wBAAwB,CAAC,KAAa;IAC7C,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAC7B,IAAI,CAAC,GAAG,CAAC,eAAe,EAAE,KAAK,CAAC,EAChC,qBAAqB,CACtB,CAAC;IAEF,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,MAAM,YAAY,GAAmB,EAAE,CAAC;IAExC,OAAO,KAAK,EAAK,IAAsB,EAAc,EAAE;QACrD,OAAO,WAAW,IAAI,cAAc,EAAE,CAAC;YACrC,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC;QACnE,CAAC;QAED,WAAW,EAAE,CAAC;QACd,IAAI,CAAC;YACH,OAAO,MAAM,IAAI,EAAE,CAAC;QACtB,CAAC;gBAAS,CAAC;YACT,WAAW,EAAE,CAAC;YACd,MAAM,WAAW,GAAG,YAAY,CAAC,KAAK,EAAE,CAAC;YACzC,IAAI,WAAW;gBAAE,WAAW,EAAE,CAAC;QACjC,CAAC;IACH,CAAC,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,KAAa,EACb,KAAoC,EACpC,OAAqC;IAErC,MAAM,OAAO,GAAG,wBAAwB,CAAC,KAAK,CAAC,CAAC;IAChD,MAAM,UAAU,GAAG,KAAK,CAAC,MAAM,CAAC;IAChC,IAAI,cAAc,GAAG,CAAC,CAAC;IAEvB,MAAM,YAAY,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,IAAgB,EAAE;QAC9D,IAAI,CAAC;YACH,OAAO,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;QAC7B,CAAC;gBAAS,CAAC;YACT,cAAc,EAAE,CAAC;YACjB,OAAO,EAAE,UAAU,EAAE,CAAC,cAAc,EAAE,UAAU,CAAC,CAAC;QACpD,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC,UAAU,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;AAChE,CAAC"}
|
|
@@ -1,32 +1,7 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Post-processing content cleaner for removing noise artifacts
|
|
3
|
-
* that slip through Readability extraction.
|
|
4
|
-
*/
|
|
5
|
-
/**
|
|
6
|
-
* Clean paragraph text by removing noise
|
|
7
|
-
*/
|
|
8
1
|
export declare function cleanParagraph(text: string): string | null;
|
|
9
|
-
/**
|
|
10
|
-
* Clean heading text by removing noise and markdown link syntax
|
|
11
|
-
*/
|
|
12
2
|
export declare function cleanHeading(text: string): string | null;
|
|
13
|
-
/**
|
|
14
|
-
* Clean list items by filtering out noise
|
|
15
|
-
*/
|
|
16
3
|
export declare function cleanListItems(items: string[]): string[];
|
|
17
|
-
/**
|
|
18
|
-
* Clean code block text - minimal cleaning to preserve code integrity
|
|
19
|
-
*/
|
|
20
4
|
export declare function cleanCodeBlock(code: string): string | null;
|
|
21
|
-
/**
|
|
22
|
-
* Strip markdown link syntax from text for cleaner slugs/display
|
|
23
|
-
* [Text](#anchor) -> Text
|
|
24
|
-
* [Text](url) -> Text
|
|
25
|
-
*/
|
|
26
5
|
export declare function stripMarkdownLinks(text: string): string;
|
|
27
|
-
/**
|
|
28
|
-
* Remove common timestamp patterns from text (inline removal)
|
|
29
|
-
* Use when you want to strip timestamps from within longer content
|
|
30
|
-
*/
|
|
31
6
|
export declare function removeInlineTimestamps(text: string): string;
|
|
32
7
|
//# sourceMappingURL=content-cleaner.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"content-cleaner.d.ts","sourceRoot":"","sources":["../../src/utils/content-cleaner.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"content-cleaner.d.ts","sourceRoot":"","sources":["../../src/utils/content-cleaner.ts"],"names":[],"mappings":"AA2BA,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAc1D;AAED,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAgBxD;AAED,wBAAgB,cAAc,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,MAAM,EAAE,CAQxD;AAED,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAY1D;AAED,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEvD;AAED,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAgB3D"}
|