@j0hanz/superfetch 1.0.2 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +345 -57
- package/dist/config/index.d.ts.map +1 -1
- package/dist/config/index.js +6 -10
- package/dist/config/index.js.map +1 -1
- package/dist/config/types.d.ts +256 -0
- package/dist/config/types.d.ts.map +1 -0
- package/dist/config/types.js +2 -0
- package/dist/config/types.js.map +1 -0
- package/dist/errors/app-error.d.ts +6 -20
- package/dist/errors/app-error.d.ts.map +1 -1
- package/dist/errors/app-error.js +7 -18
- package/dist/errors/app-error.js.map +1 -1
- package/dist/index.js +75 -62
- package/dist/index.js.map +1 -1
- package/dist/middleware/error-handler.d.ts +1 -5
- package/dist/middleware/error-handler.d.ts.map +1 -1
- package/dist/middleware/error-handler.js +4 -12
- package/dist/middleware/error-handler.js.map +1 -1
- package/dist/middleware/rate-limiter.d.ts +2 -20
- package/dist/middleware/rate-limiter.d.ts.map +1 -1
- package/dist/middleware/rate-limiter.js +22 -47
- package/dist/middleware/rate-limiter.js.map +1 -1
- package/dist/prompts/index.d.ts +0 -3
- package/dist/prompts/index.d.ts.map +1 -1
- package/dist/prompts/index.js +2 -10
- package/dist/prompts/index.js.map +1 -1
- package/dist/resources/cached-content.d.ts +5 -0
- package/dist/resources/cached-content.d.ts.map +1 -0
- package/dist/resources/cached-content.js +93 -0
- package/dist/resources/cached-content.js.map +1 -0
- package/dist/resources/index.d.ts +0 -3
- package/dist/resources/index.d.ts.map +1 -1
- package/dist/resources/index.js +40 -5
- package/dist/resources/index.js.map +1 -1
- package/dist/server.d.ts +0 -4
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +11 -6
- package/dist/server.js.map +1 -1
- package/dist/services/cache.d.ts +20 -6
- package/dist/services/cache.d.ts.map +1 -1
- package/dist/services/cache.js +128 -20
- package/dist/services/cache.js.map +1 -1
- package/dist/services/card-extractor.d.ts +10 -0
- package/dist/services/card-extractor.d.ts.map +1 -0
- package/dist/services/card-extractor.js +194 -0
- package/dist/services/card-extractor.js.map +1 -0
- package/dist/services/extractor.d.ts +12 -19
- package/dist/services/extractor.d.ts.map +1 -1
- package/dist/services/extractor.js +60 -46
- package/dist/services/extractor.js.map +1 -1
- package/dist/services/fetcher.d.ts +13 -11
- package/dist/services/fetcher.d.ts.map +1 -1
- package/dist/services/fetcher.js +143 -54
- package/dist/services/fetcher.js.map +1 -1
- package/dist/services/logger.d.ts.map +1 -1
- package/dist/services/logger.js +4 -6
- package/dist/services/logger.js.map +1 -1
- package/dist/services/parser.d.ts +1 -6
- package/dist/services/parser.d.ts.map +1 -1
- package/dist/services/parser.js +57 -27
- package/dist/services/parser.js.map +1 -1
- package/dist/tools/handlers/fetch-links.tool.d.ts +6 -18
- package/dist/tools/handlers/fetch-links.tool.d.ts.map +1 -1
- package/dist/tools/handlers/fetch-links.tool.js +104 -79
- package/dist/tools/handlers/fetch-links.tool.js.map +1 -1
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +6 -10
- package/dist/tools/handlers/fetch-markdown.tool.d.ts.map +1 -1
- package/dist/tools/handlers/fetch-markdown.tool.js +83 -84
- package/dist/tools/handlers/fetch-markdown.tool.js.map +1 -1
- package/dist/tools/handlers/fetch-url.tool.d.ts +6 -12
- package/dist/tools/handlers/fetch-url.tool.d.ts.map +1 -1
- package/dist/tools/handlers/fetch-url.tool.js +51 -93
- package/dist/tools/handlers/fetch-url.tool.js.map +1 -1
- package/dist/tools/handlers/fetch-urls.tool.d.ts +12 -0
- package/dist/tools/handlers/fetch-urls.tool.d.ts.map +1 -0
- package/dist/tools/handlers/fetch-urls.tool.js +184 -0
- package/dist/tools/handlers/fetch-urls.tool.js.map +1 -0
- package/dist/tools/index.d.ts +0 -4
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +145 -15
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/utils/common.d.ts +8 -0
- package/dist/tools/utils/common.d.ts.map +1 -0
- package/dist/tools/utils/common.js +35 -0
- package/dist/tools/utils/common.js.map +1 -0
- package/dist/tools/utils/fetch-pipeline.d.ts +3 -0
- package/dist/tools/utils/fetch-pipeline.d.ts.map +1 -0
- package/dist/tools/utils/fetch-pipeline.js +78 -0
- package/dist/tools/utils/fetch-pipeline.js.map +1 -0
- package/dist/tools/utils/index.d.ts +4 -0
- package/dist/tools/utils/index.d.ts.map +1 -0
- package/dist/tools/utils/index.js +3 -0
- package/dist/tools/utils/index.js.map +1 -0
- package/dist/tools/utils/response-builder.d.ts +3 -0
- package/dist/tools/utils/response-builder.d.ts.map +1 -0
- package/dist/tools/utils/response-builder.js +24 -0
- package/dist/tools/utils/response-builder.js.map +1 -0
- package/dist/transformers/jsonl.transformer.d.ts +1 -1
- package/dist/transformers/jsonl.transformer.d.ts.map +1 -1
- package/dist/transformers/jsonl.transformer.js +2 -1
- package/dist/transformers/jsonl.transformer.js.map +1 -1
- package/dist/transformers/markdown.transformer.d.ts +1 -1
- package/dist/transformers/markdown.transformer.d.ts.map +1 -1
- package/dist/transformers/markdown.transformer.js +99 -5
- package/dist/transformers/markdown.transformer.js.map +1 -1
- package/dist/types/content.types.d.ts +11 -11
- package/dist/types/content.types.d.ts.map +1 -1
- package/dist/types/index.d.ts +1 -2
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/index.js +1 -2
- package/dist/types/index.js.map +1 -1
- package/dist/types/schemas.d.ts +39 -12
- package/dist/types/schemas.d.ts.map +1 -1
- package/dist/utils/concurrency.d.ts +6 -0
- package/dist/utils/concurrency.d.ts.map +1 -0
- package/dist/utils/concurrency.js +38 -0
- package/dist/utils/concurrency.js.map +1 -0
- package/dist/utils/content-cleaner.d.ts +32 -0
- package/dist/utils/content-cleaner.d.ts.map +1 -0
- package/dist/utils/content-cleaner.js +238 -0
- package/dist/utils/content-cleaner.js.map +1 -0
- package/dist/utils/language-detector.d.ts +5 -0
- package/dist/utils/language-detector.d.ts.map +1 -0
- package/dist/utils/language-detector.js +50 -0
- package/dist/utils/language-detector.js.map +1 -0
- package/dist/utils/sanitizer.d.ts +0 -10
- package/dist/utils/sanitizer.d.ts.map +1 -1
- package/dist/utils/sanitizer.js +4 -12
- package/dist/utils/sanitizer.js.map +1 -1
- package/dist/utils/tool-error-handler.d.ts +1 -15
- package/dist/utils/tool-error-handler.d.ts.map +1 -1
- package/dist/utils/tool-error-handler.js +34 -6
- package/dist/utils/tool-error-handler.js.map +1 -1
- package/dist/utils/url-validator.d.ts +0 -8
- package/dist/utils/url-validator.d.ts.map +1 -1
- package/dist/utils/url-validator.js +17 -31
- package/dist/utils/url-validator.js.map +1 -1
- package/package.json +81 -79
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
 * Create a limiter that runs at most `limit` async functions at the same time.
 *
 * The requested limit is coerced to a number, rounded down to an integer and
 * clamped to the range 1..10. A value that is not a number (NaN, undefined)
 * falls back to 1; previously it produced a NaN ceiling, and because
 * `active >= NaN` is never true the limiter did not limit anything.
 *
 * @param {number} limit - Requested maximum number of concurrent calls.
 * @returns {(fn: () => Promise<any>) => Promise<any>} Function that schedules
 *   `fn` and settles with whatever `fn` settles with.
 */
function createConcurrencyLimiter(limit) {
    const requested = Number(limit);
    const maxConcurrency = Number.isNaN(requested)
        ? 1
        : Math.min(Math.max(1, Math.floor(requested)), 10);
    let active = 0;
    const queue = [];
    return async (fn) => {
        // Re-check after every wake-up: another caller may have taken the
        // freed slot before this waiter resumed.
        while (active >= maxConcurrency) {
            await new Promise((resolve) => queue.push(resolve));
        }
        active++;
        try {
            return await fn();
        }
        finally {
            // Release the slot even when fn rejects, then wake one waiter.
            active--;
            const next = queue.shift();
            if (next) {
                next();
            }
        }
    };
}
|
|
21
|
+
/**
 * Run every task through a shared concurrency limiter and collect all outcomes.
 *
 * @param {number} limit - Requested concurrency, passed to createConcurrencyLimiter.
 * @param {Array<() => Promise<any>>} tasks - Functions to invoke.
 * @param {{ onProgress?: (completed: number, total: number) => void }} [options]
 *   Optional progress callback, called after each task settles (fulfilled or
 *   rejected) with the running count and the total number of tasks.
 * @returns {Promise<Array>} The Promise.allSettled result, one entry per task
 *   in input order.
 */
export async function runWithConcurrency(limit, tasks, options) {
    const runLimited = createConcurrencyLimiter(limit);
    const total = tasks.length;
    let finished = 0;
    const runAndReport = async (task) => {
        try {
            return await runLimited(task);
        }
        finally {
            finished += 1;
            if (options?.onProgress) {
                options.onProgress(finished, total);
            }
        }
    };
    return Promise.allSettled(tasks.map((task) => runAndReport(task)));
}
|
|
38
|
+
//# sourceMappingURL=concurrency.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"concurrency.js","sourceRoot":"","sources":["../../src/utils/concurrency.ts"],"names":[],"mappings":"AAMA,SAAS,wBAAwB,CAAC,KAAa;IAC7C,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;IACxD,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,MAAM,KAAK,GAAmB,EAAE,CAAC;IAEjC,OAAO,KAAK,EAAK,EAAoB,EAAc,EAAE;QACnD,OAAO,MAAM,IAAI,cAAc,EAAE,CAAC;YAChC,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC;QAC5D,CAAC;QAED,MAAM,EAAE,CAAC;QACT,IAAI,CAAC;YACH,OAAO,MAAM,EAAE,EAAE,CAAC;QACpB,CAAC;gBAAS,CAAC;YACT,MAAM,EAAE,CAAC;YACT,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,IAAI;gBAAE,IAAI,EAAE,CAAC;QACnB,CAAC;IACH,CAAC,CAAC;AACJ,CAAC;AACD,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,KAAa,EACb,KAA2B,EAC3B,OAA4B;IAE5B,MAAM,OAAO,GAAG,wBAAwB,CAAC,KAAK,CAAC,CAAC;IAChD,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC;IAC3B,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,MAAM,YAAY,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,IAAI,EAAE;QAClD,IAAI,CAAC;YACH,OAAO,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;QAC7B,CAAC;gBAAS,CAAC;YACT,SAAS,EAAE,CAAC;YACZ,IAAI,OAAO,EAAE,UAAU,EAAE,CAAC;gBACxB,OAAO,CAAC,UAAU,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;YACvC,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC,UAAU,CAAC,YAAY,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;AACtE,CAAC"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Post-processing content cleaner for removing noise artifacts
|
|
3
|
+
* that slip through Readability extraction.
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Clean paragraph text by removing noise
|
|
7
|
+
*/
|
|
8
|
+
export declare function cleanParagraph(text: string): string | null;
|
|
9
|
+
/**
|
|
10
|
+
* Clean heading text by removing noise and markdown link syntax
|
|
11
|
+
*/
|
|
12
|
+
export declare function cleanHeading(text: string): string | null;
|
|
13
|
+
/**
|
|
14
|
+
* Clean list items by filtering out noise
|
|
15
|
+
*/
|
|
16
|
+
export declare function cleanListItems(items: string[]): string[];
|
|
17
|
+
/**
|
|
18
|
+
* Clean code block text - minimal cleaning to preserve code integrity
|
|
19
|
+
*/
|
|
20
|
+
export declare function cleanCodeBlock(code: string): string | null;
|
|
21
|
+
/**
|
|
22
|
+
* Strip markdown link syntax from text for cleaner slugs/display
|
|
23
|
+
* [Text](#anchor) -> Text
|
|
24
|
+
* [Text](url) -> Text
|
|
25
|
+
*/
|
|
26
|
+
export declare function stripMarkdownLinks(text: string): string;
|
|
27
|
+
/**
|
|
28
|
+
* Remove common timestamp patterns from text (inline removal)
|
|
29
|
+
* Use when you want to strip timestamps from within longer content
|
|
30
|
+
*/
|
|
31
|
+
export declare function removeInlineTimestamps(text: string): string;
|
|
32
|
+
//# sourceMappingURL=content-cleaner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content-cleaner.d.ts","sourceRoot":"","sources":["../../src/utils/content-cleaner.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAsKH;;GAEG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAsB1D;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAuBxD;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,MAAM,EAAE,CAQxD;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAc1D;AAED;;;;GAIG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEvD;AAED;;;GAGG;AACH,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAsB3D"}
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Post-processing content cleaner for removing noise artifacts
|
|
3
|
+
* that slip through Readability extraction.
|
|
4
|
+
*/
|
|
5
|
+
// Pre-compiled combined pattern for optimal performance
|
|
6
|
+
const NOISE_PATTERN_COMBINED = new RegExp([
    // Relative timestamps
    '^\\d+\\s*(seconds?|minutes?|hours?|days?|weeks?|months?|years?)\\s*ago$',
    '^(just now|recently|today|yesterday|last week|last month)$',
    '^(updated|modified|edited|created|published)\\s*:?\\s*\\d+\\s*(seconds?|minutes?|hours?|days?|weeks?|months?|years?)\\s*ago$',
    '^(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\\s+\\d{1,2},?\\s+\\d{4}$',
    '^\\d{1,2}\\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\\s+\\d{4}$',
    '^\\d{4}-\\d{2}-\\d{2}$',
    // Share/action buttons
    '^(share|copy|like|follow|subscribe|download|print|save|bookmark|tweet|pin it|email|export)$',
    '^(copy to clipboard|copied!?|copy code|copy link)$',
    '^(share on|share to|share via)\\s+(twitter|facebook|linkedin|reddit|x|email)$',
    // UI artifacts
    '^(click to copy|expand|collapse|show more|show less|load more|view more|read more|see more|see all|view all)$',
    '^(toggle|switch|enable|disable|on|off)$',
    '^(edit|delete|remove|add|new|create|update|cancel|confirm|submit|reset|clear)$',
    '^(open in|view in|edit in)\\s+\\w+$',
    '^(try it|run|execute|play|preview|demo|live demo|playground)$',
    '^(source|view source|edit this page|edit on github|improve this doc)$',
    // Empty/placeholder
    '^(loading\\.{0,3}|please wait\\.{0,3}|\\.{2,})$',
    '^(n\\/a|tbd|todo|coming soon|placeholder|untitled)$',
    // Navigation
    '^(next|previous|prev|back|forward|home|menu|close|open|skip to|jump to|go to)$',
    '^(table of contents|toc|contents|on this page|in this article|in this section)$',
    '^(scroll to top|back to top|top)$',
    // Cookie/consent
    '^(accept|reject|accept all|reject all|cookie settings|privacy settings|manage preferences)$',
    '^(accept cookies|decline cookies|cookie policy|privacy policy|terms of service|terms & conditions)$',
    // Counts
    '^\\d+\\s*(comments?|replies?|reactions?|responses?)$',
    '^\\d+\\s*(likes?|shares?|views?|followers?|retweets?|stars?|forks?|claps?|upvotes?|downvotes?)$',
    // Version badges
    '^v?\\d+\\.\\d+(\\.\\d+)?(-\\w+)?$',
    '^(stable|beta|alpha|rc|preview|experimental|deprecated|legacy|new|updated)$',
    // Structural
    '^(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)$',
    '^panel\\s*[a-z]?$',
    // API artifacts
    '^(required|optional|default|type|example|description|parameters?|returns?|response|request)$',
    '^(get|post|put|patch|delete|head|options)\\s*$',
    // Interactive
    '^(drag|drop|resize|zoom|scroll|swipe|tap|click|hover|focus)(\\s+to\\s+\\w+)?$',
    // Breadcrumbs
    '^[/\\\\>→»›]+$',
    // Ads
    '^(ad|advertisement|sponsored|promoted|partner content)$',
].join('|'), 'i');
// Patterns that only constrain how the text STARTS. They have no end anchor,
// so they are applied to short text only (see PREFIX_NOISE_MAX_LENGTH);
// otherwise a full paragraph beginning with "Move the ..." or "Last updated ..."
// would be thrown away as noise.
const NOISE_PREFIX_PATTERN = new RegExp([
    '^last\\s+updated\\s*:?',
    '^(liked by|shared by|followed by)\\s+\\d+',
    '^(drag the|move the|resize the|drag to|click to)\\s+\\w+',
].join('|'), 'i');
// Longest text (in characters) still treated as a label/badge for the
// prefix-only patterns above. Heuristic cut-off between a UI caption and prose.
const PREFIX_NOISE_MAX_LENGTH = 60;
// Pre-compiled pattern for short text noise
const SHORT_TEXT_NOISE_PATTERN = new RegExp([
    '^#\\w+$',
    '^@\\w+$',
    '^\\d+$',
    '^[•·→←↑↓►▼▲◄▶◀■□●○★☆✓✗✔✘×]+$',
    '^[,;:\\-–—]+$',
    '^\\[\\d+\\]$',
    '^\\(\\d+\\)$',
    '^fig\\.?\\s*\\d+$',
    '^table\\s*\\d+$',
    '^step\\s*\\d+$',
    '^note:?$',
    '^tip:?$',
    '^warning:?$',
    '^info:?$',
    '^caution:?$',
].join('|'), 'i');
// Pre-compiled pattern for UI chrome detection
const UI_CHROME_PATTERN = new RegExp([
    '^(sign in|sign up|log in|log out|register|create account)$',
    '^(search|search\\.\\.\\.|search docs|search documentation)$',
    '^(dark mode|light mode|theme|language|locale)$',
    '^(feedback|report issue|report a bug|file an issue|suggest edit)$',
    '^(documentation|docs|api|reference|guide|tutorial|examples?)$',
    "^(version|changelog|release notes|what's new)$",
].join('|'), 'i');
// Minimum lengths for different content types
const MIN_PARAGRAPH_LENGTH = 20;
const MIN_HEADING_LENGTH = 2;
const MIN_LIST_ITEM_LENGTH = 3;
const SHORT_TEXT_THRESHOLD = 25;
/**
 * Decide whether a piece of text is noise rather than content.
 *
 * The text is trimmed first. It is noise when it is empty, when it matches
 * one of the fully anchored noise patterns, when it is at most
 * PREFIX_NOISE_MAX_LENGTH characters long and starts with one of the
 * prefix-only patterns, or when it is shorter than SHORT_TEXT_THRESHOLD and
 * matches a short-text or UI-chrome pattern.
 *
 * @param {string} text - Candidate text.
 * @returns {boolean} true when the text should be discarded.
 */
function isNoiseText(text) {
    const trimmed = text.trim();
    // Empty or whitespace-only
    if (!trimmed) {
        return true;
    }
    // Whole-string noise patterns (single regex test)
    if (NOISE_PATTERN_COMBINED.test(trimmed)) {
        return true;
    }
    // Prefix-only patterns are trusted for short captions only
    if (trimmed.length <= PREFIX_NOISE_MAX_LENGTH && NOISE_PREFIX_PATTERN.test(trimmed)) {
        return true;
    }
    // Short-text and UI chrome patterns apply to brief content only
    if (trimmed.length < SHORT_TEXT_THRESHOLD) {
        if (SHORT_TEXT_NOISE_PATTERN.test(trimmed)) {
            return true;
        }
        if (UI_CHROME_PATTERN.test(trimmed)) {
            return true;
        }
    }
    return false;
}
|
|
114
|
+
// Pre-compiled placeholder pattern (combined for performance)
|
|
115
|
+
const PLACEHOLDER_PATTERN = /^(lorem ipsum|sample text|placeholder|example (text|content|data)|test (text|content|data)|your (text|content|name|email) here|enter (your|a) |type (your|a|something) )/i;
// Memoised results of the placeholder check, keyed by trimmed lower-case text.
// A Map iterates in insertion order, so the first key is the least recently
// used entry as long as hits are re-inserted (see isPlaceholderContent).
const PLACEHOLDER_CACHE = new Map();
const PLACEHOLDER_CACHE_MAX_SIZE = 1000;
/**
 * Check if text looks like placeholder/demo content.
 *
 * Results are memoised in a bounded least-recently-used cache: a hit moves
 * the entry to the most-recent position, and when the cache is full the
 * least recently used entry is evicted before the new one is stored.
 *
 * @param {string} text - Candidate text.
 * @returns {boolean} true when the trimmed text starts with a known
 *   placeholder phrase.
 */
function isPlaceholderContent(text) {
    const key = text.trim().toLowerCase();
    const cached = PLACEHOLDER_CACHE.get(key);
    if (cached !== undefined) {
        // Re-insert so this key becomes the most recently used one; without
        // this the eviction below would be FIFO rather than LRU.
        PLACEHOLDER_CACHE.delete(key);
        PLACEHOLDER_CACHE.set(key, cached);
        return cached;
    }
    const result = PLACEHOLDER_PATTERN.test(key);
    if (PLACEHOLDER_CACHE.size >= PLACEHOLDER_CACHE_MAX_SIZE) {
        const oldestKey = PLACEHOLDER_CACHE.keys().next().value;
        if (oldestKey !== undefined) {
            PLACEHOLDER_CACHE.delete(oldestKey);
        }
    }
    PLACEHOLDER_CACHE.set(key, result);
    return result;
}
|
|
142
|
+
/**
 * Normalise a paragraph and decide whether it is worth keeping.
 *
 * @param {string} text - Raw paragraph text.
 * @returns {string | null} The trimmed paragraph, or null when it is shorter
 *   than MIN_PARAGRAPH_LENGTH without ending in ".", "!" or "?", or when it
 *   is classified as noise or placeholder content.
 */
export function cleanParagraph(text) {
    const paragraph = text.trim();
    const isShort = paragraph.length < MIN_PARAGRAPH_LENGTH;
    // A short paragraph survives only if it reads like a finished sentence.
    const endsLikeSentence = /[.!?]$/.test(paragraph);
    if (isShort && !endsLikeSentence) {
        return null;
    }
    if (isNoiseText(paragraph) || isPlaceholderContent(paragraph)) {
        return null;
    }
    return paragraph;
}
|
|
164
|
+
/**
 * Clean heading text by removing markdown link syntax and anchor artifacts.
 *
 * Steps: trim, unwrap markdown links via stripMarkdownLinks, drop a trailing
 * "Link for ..." phrase, drop trailing "#" characters, trim again. The
 * minimum-length check runs on the cleaned text, so a heading that shrinks
 * below MIN_HEADING_LENGTH during cleanup is rejected as well.
 *
 * @param {string} text - Raw heading text.
 * @returns {string | null} The cleaned heading, or null when the result is
 *   too short or is classified as noise.
 */
export function cleanHeading(text) {
    const heading = stripMarkdownLinks(text.trim())
        // Trailing anchor-link captions such as "Link for this heading"
        .replace(/\s*Link for (this heading|[\w\s]+)\s*$/i, '')
        // Trailing hash symbols often used for anchor links
        .replace(/\s*#+\s*$/, '')
        .trim();
    if (heading.length < MIN_HEADING_LENGTH) {
        return null;
    }
    if (isNoiseText(heading)) {
        return null;
    }
    return heading;
}
|
|
185
|
+
/**
 * Trim list items and keep only those that carry content.
 *
 * @param {string[]} items - Raw list item texts.
 * @returns {string[]} Trimmed items that are at least MIN_LIST_ITEM_LENGTH
 *   characters long and are not classified as noise, in their original order.
 */
export function cleanListItems(items) {
    const isWorthKeeping = (item) => item.length >= MIN_LIST_ITEM_LENGTH && !isNoiseText(item);
    return items.reduce((kept, raw) => {
        const item = raw.trim();
        if (isWorthKeeping(item)) {
            kept.push(item);
        }
        return kept;
    }, []);
}
|
|
199
|
+
/**
 * Clean code block text - minimal cleaning to preserve code integrity.
 *
 * Surrounding whitespace is trimmed. Empty blocks are dropped. Blocks of one
 * or two characters are dropped as probable labels unless they consist only
 * of bracket/punctuation characters ("{", "{}", "[]", "()", ";", ...). The
 * previous check accepted only a single such character, so "{}" was dropped
 * while "{" was kept.
 *
 * @param {string} code - Raw code block content.
 * @returns {string | null} The trimmed code, or null when the block is
 *   discarded.
 */
export function cleanCodeBlock(code) {
    const trimmed = code.trim();
    if (trimmed.length === 0) {
        return null;
    }
    const isTiny = trimmed.length < 3;
    if (isTiny && !/^[{}[\]();<>]+$/.test(trimmed)) {
        return null;
    }
    return trimmed;
}
|
|
214
|
+
/**
 * Strip markdown link syntax from text for cleaner slugs/display
 * [Text](#anchor) -> Text
 * [Text](url) -> Text
 * [Text](https://host/wiki/Foo_(bar)) -> Text
 *
 * The link target may contain one level of balanced parentheses. When the
 * parentheses are not balanced, the target ends at the first ")".
 *
 * @param {string} text - Text that may contain markdown links.
 * @returns {string} The text with every link replaced by its label.
 */
export function stripMarkdownLinks(text) {
    // First alternative: target with optional balanced "(...)" groups.
    // Second alternative: fallback for unbalanced targets, up to the first ")".
    return text.replace(/\[([^\]]+)\]\((?:(?:[^()]|\([^()]*\))*|[^)]*)\)/g, '$1');
}
|
|
222
|
+
/**
 * Remove common timestamp patterns from text (inline removal)
 * Use when you want to strip timestamps from within longer content
 *
 * Order matters: the labelled form ("updated: 3 days ago") is removed before
 * the bare form ("3 days ago"). In the opposite order the bare replacement
 * consumes the "3 days ago" part first and the label is left behind.
 *
 * @param {string} text - Text that may contain timestamps.
 * @returns {string} The text without the matched timestamps, with runs of
 *   whitespace collapsed to a single space and the ends trimmed.
 */
export function removeInlineTimestamps(text) {
    return (text
        // Remove "Updated: X days ago" patterns, label included
        .replace(/\b(updated|modified|edited|created|published)\s*:?\s*\d+\s*(seconds?|minutes?|hours?|days?|weeks?|months?|years?)\s*ago\b/gi, '')
        // Remove remaining "X days/hours/etc ago" patterns
        .replace(/\b\d+\s*(seconds?|minutes?|hours?|days?|weeks?|months?|years?)\s*ago\b/gi, '')
        // Remove standalone dates
        .replace(/\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{1,2},?\s+\d{4}\b/gi, '')
        // Clean up extra whitespace
        .replace(/\s{2,}/g, ' ')
        .trim());
}
|
|
238
|
+
//# sourceMappingURL=content-cleaner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content-cleaner.js","sourceRoot":"","sources":["../../src/utils/content-cleaner.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,wDAAwD;AACxD,MAAM,sBAAsB,GAAG,IAAI,MAAM,CACvC;IACE,sBAAsB;IACtB,yEAAyE;IACzE,4DAA4D;IAC5D,8HAA8H;IAC9H,6EAA6E;IAC7E,2EAA2E;IAC3E,wBAAwB;IACxB,wBAAwB;IACxB,uBAAuB;IACvB,6FAA6F;IAC7F,oDAAoD;IACpD,+EAA+E;IAC/E,eAAe;IACf,+GAA+G;IAC/G,yCAAyC;IACzC,gFAAgF;IAChF,qCAAqC;IACrC,+DAA+D;IAC/D,uEAAuE;IACvE,oBAAoB;IACpB,iDAAiD;IACjD,qDAAqD;IACrD,aAAa;IACb,gFAAgF;IAChF,iFAAiF;IACjF,mCAAmC;IACnC,iBAAiB;IACjB,6FAA6F;IAC7F,qGAAqG;IACrG,SAAS;IACT,sDAAsD;IACtD,iGAAiG;IACjG,2CAA2C;IAC3C,iBAAiB;IACjB,mCAAmC;IACnC,6EAA6E;IAC7E,aAAa;IACb,yDAAyD;IACzD,mBAAmB;IACnB,gBAAgB;IAChB,8FAA8F;IAC9F,gDAAgD;IAChD,cAAc;IACd,+EAA+E;IAC/E,0DAA0D;IAC1D,cAAc;IACd,gBAAgB;IAChB,MAAM;IACN,yDAAyD;CAC1D,CAAC,IAAI,CAAC,GAAG,CAAC,EACX,GAAG,CACJ,CAAC;AAEF,4CAA4C;AAC5C,MAAM,wBAAwB,GAAG,IAAI,MAAM,CACzC;IACE,SAAS;IACT,SAAS;IACT,QAAQ;IACR,8BAA8B;IAC9B,eAAe;IACf,cAAc;IACd,cAAc;IACd,mBAAmB;IACnB,iBAAiB;IACjB,gBAAgB;IAChB,UAAU;IACV,SAAS;IACT,aAAa;IACb,UAAU;IACV,aAAa;CACd,CAAC,IAAI,CAAC,GAAG,CAAC,EACX,GAAG,CACJ,CAAC;AAEF,+CAA+C;AAC/C,MAAM,iBAAiB,GAAG,IAAI,MAAM,CAClC;IACE,4DAA4D;IAC5D,6DAA6D;IAC7D,gDAAgD;IAChD,mEAAmE;IACnE,+DAA+D;IAC/D,gDAAgD;CACjD,CAAC,IAAI,CAAC,GAAG,CAAC,EACX,GAAG,CACJ,CAAC;AAEF,8CAA8C;AAC9C,MAAM,oBAAoB,GAAG,EAAE,CAAC;AAChC,MAAM,kBAAkB,GAAG,CAAC,CAAC;AAC7B,MAAM,oBAAoB,GAAG,CAAC,CAAC;AAC/B,MAAM,oBAAoB,GAAG,EAAE,CAAC;AAEhC;;GAEG;AACH,SAAS,WAAW,CAAC,IAAY;IAC/B,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAE5B,2BAA2B;IAC3B,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,OAAO,IAAI,CAAC;IACd,CAAC;IAED,mDAAmD;IACnD,IAAI,sBAAsB,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QACzC,OAAO,IAAI,CAAC;IACd,CAAC;IAED,8CAA8C;IAC9C,IAAI,OAAO,CAAC,MAAM,GAAG,oBAAoB,EAAE,CAAC;QAC1C,IAAI,wBAAwB,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;YAC3C,OAAO,IAAI,CAAC;QACd,CAAC;QAED,+CAA+C;QAC/C,IAAI,iBAAiB,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;YACpC,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,8DAA8D;AAC9D,MAAM,
mBAAmB,GACvB,2KAA2K,CAAC;AAE9K,6DAA6D;AAC7D,MAAM,iBAAiB,GAAG,IAAI,GAAG,EAAmB,CAAC;AACrD,MAAM,0BAA0B,GAAG,IAAI,CAAC;AAExC;;;GAGG;AACH,SAAS,oBAAoB,CAAC,IAAY;IACxC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAE1C,oBAAoB;IACpB,MAAM,MAAM,GAAG,iBAAiB,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IAC9C,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;QACzB,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,kDAAkD;IAClD,MAAM,MAAM,GAAG,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAEjD,iCAAiC;IACjC,IAAI,iBAAiB,CAAC,IAAI,IAAI,0BAA0B,EAAE,CAAC;QACzD,MAAM,QAAQ,GAAG,iBAAiB,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC;QACvD,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;YAC3B,iBAAiB,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QACrC,CAAC;IACH,CAAC;IACD,iBAAiB,CAAC,GAAG,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;IAEvC,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAE5B,6BAA6B;IAC7B,IAAI,OAAO,CAAC,MAAM,GAAG,oBAAoB,EAAE,CAAC;QAC1C,iFAAiF;QACjF,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;YAC5B,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED,mBAAmB;IACnB,IAAI,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;QACzB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,0DAA0D;IAC1D,IAAI,oBAAoB,CAAC,OAAO,CAAC,EAAE,CAAC;QAClC,OAAO,IAAI,CAAC;IACd,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,IAAY;IACvC,IAAI,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAE1B,YAAY;IACZ,IAAI,OAAO,CAAC,MAAM,GAAG,kBAAkB,EAAE,CAAC;QACxC,OAAO,IAAI,CAAC;IACd,CAAC;IAED,uDAAuD;IACvD,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC,CAAC;IAE1D,4DAA4D;IAC5D,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,yCAAyC,EAAE,EAAE,CAAC,CAAC;IAEzE,2DAA2D;IAC3D,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IAE3C,mBAAmB;IACnB,IAAI,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;QACzB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,OAAO,OAAO,CAAC,IAAI,EAAE,CAAC;AACxB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,KAAe;IAC5C,OAAO,KAAK;SACT,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;SAC1B,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE;QACf,IAAI,IAAI,CAAC,MAAM,GAAG,oBAAoB;YAAE,OAAO,KAAK,CAAC;QACrD,IAAI,WAAW,CAAC,IAAI,CAAC;YAAE,OAAO,KAAK,CAA
C;QACpC,OAAO,IAAI,CAAC;IACd,CAAC,CAAC,CAAC;AACP,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAE5B,mBAAmB;IACnB,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,qDAAqD;IACrD,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1D,OAAO,IAAI,CAAC;IACd,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,kBAAkB,CAAC,IAAY;IAC7C,OAAO,IAAI,CAAC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC,CAAC;AACtD,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,sBAAsB,CAAC,IAAY;IACjD,OAAO,CACL,IAAI;QACF,yCAAyC;SACxC,OAAO,CACN,0EAA0E,EAC1E,EAAE,CACH;QACD,kCAAkC;SACjC,OAAO,CACN,6HAA6H,EAC7H,EAAE,CACH;QACD,0BAA0B;SACzB,OAAO,CACN,6EAA6E,EAC7E,EAAE,CACH;QACD,4BAA4B;SAC3B,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,IAAI,EAAE,CACV,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"language-detector.d.ts","sourceRoot":"","sources":["../../src/utils/language-detector.ts"],"names":[],"mappings":"AA4CA;;GAEG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAE/D"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Language detection patterns for code blocks
|
|
3
|
+
* Shared between parser and markdown transformer
|
|
4
|
+
*/
|
|
5
|
+
// Ordered [pattern, language] pairs. detectLanguage returns the language of
// the FIRST pattern that matches, so more specific entries must stay above
// more generic ones.
const LANGUAGE_PATTERNS = [
    // JSX/TSX patterns
    [
        /^\s*import\s+.*\s+from\s+['"]react['"]|<[A-Z][a-zA-Z]*[\s/>]|jsx\s*:|className=/m,
        'jsx',
    ],
    // TypeScript patterns
    [
        /:\s*(string|number|boolean|void|any|unknown|never)\b|interface\s+\w+|type\s+\w+\s*=/m,
        'typescript',
    ],
    // Rust patterns
    [/^\s*(fn|let\s+mut|impl|struct|enum|use\s+\w+::)/m, 'rust'],
    // JavaScript patterns (generic)
    [
        /^\s*(export|const|let|var|function|class|async|await)\b|^\s*import\s+.*['"]/m,
        'javascript',
    ],
    // Python patterns
    [/^\s*(def|class|import|from|if __name__|print\()/m, 'python'],
    // Bash/Shell patterns.
    // "go" carries no trailing space: the following \s+ already consumes the
    // separator, and "go " + \s+ required two spaces, so "go build" never matched.
    [
        /^\s*(npm|yarn|pnpm|npx|brew|apt|pip|cargo|go)\s+(install|add|run|build|start)/m,
        'bash',
    ],
    [/^\s*[$#]\s+\w+|^\s*#!|^\s*(sudo|chmod|mkdir|cd|ls|cat|echo)\s+/m, 'bash'],
    // CSS patterns
    [/^\s*[.#@]?[\w-]+\s*\{[^}]*\}|@media|@import|@keyframes/m, 'css'],
    // HTML patterns
    [/^\s*<(!DOCTYPE|html|head|body|div|span|p|a|script|style)\b/im, 'html'],
    // JSON patterns
    [/^\s*\{\s*"|^\s*\[\s*("|\d|true|false|null)/m, 'json'],
    // YAML patterns
    [/^\s*[\w-]+:\s*.+$/m, 'yaml'],
    // SQL patterns
    [/^\s*(SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|DROP)\s+/im, 'sql'],
    // Go patterns
    [/^\s*(func|package|import\s+")/m, 'go'],
];
/**
 * Detect programming language from code content
 *
 * @param {string} code - Code block content.
 * @returns {string | undefined} Language of the first matching entry in
 *   LANGUAGE_PATTERNS, or undefined when no pattern matches.
 */
export function detectLanguage(code) {
    return LANGUAGE_PATTERNS.find(([pattern]) => pattern.test(code))?.[1];
}
|
|
50
|
+
//# sourceMappingURL=language-detector.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"language-detector.js","sourceRoot":"","sources":["../../src/utils/language-detector.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,MAAM,iBAAiB,GAAG;IACxB,mBAAmB;IACnB;QACE,kFAAkF;QAClF,KAAK;KACN;IACD,sBAAsB;IACtB;QACE,sFAAsF;QACtF,YAAY;KACb;IACD,gBAAgB;IAChB,CAAC,kDAAkD,EAAE,MAAM,CAAC;IAC5D,gCAAgC;IAChC;QACE,8EAA8E;QAC9E,YAAY;KACb;IACD,kBAAkB;IAClB,CAAC,kDAAkD,EAAE,QAAQ,CAAC;IAC9D,sBAAsB;IACtB;QACE,iFAAiF;QACjF,MAAM;KACP;IACD,CAAC,iEAAiE,EAAE,MAAM,CAAC;IAC3E,eAAe;IACf,CAAC,yDAAyD,EAAE,KAAK,CAAC;IAClE,gBAAgB;IAChB,CAAC,8DAA8D,EAAE,MAAM,CAAC;IACxE,gBAAgB;IAChB,CAAC,6CAA6C,EAAE,MAAM,CAAC;IACvD,gBAAgB;IAChB,CAAC,oBAAoB,EAAE,MAAM,CAAC;IAC9B,eAAe;IACf,CAAC,0DAA0D,EAAE,KAAK,CAAC;IACnE,cAAc;IACd,CAAC,gCAAgC,EAAE,IAAI,CAAC;CAChC,CAAC;AAEX;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,OAAO,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;AACxE,CAAC"}
|
|
@@ -1,13 +1,3 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Sanitizes text content by collapsing whitespace and trimming
|
|
3
|
-
* Returns empty string for null/undefined input
|
|
4
|
-
*/
|
|
5
1
|
export declare function sanitizeText(text: string | null | undefined): string;
|
|
6
|
-
/**
|
|
7
|
-
* Truncates text to a maximum length with ellipsis
|
|
8
|
-
* @param text - Text to truncate
|
|
9
|
-
* @param maxLength - Maximum length (must be > 3 to accommodate ellipsis)
|
|
10
|
-
* @returns Truncated text with ellipsis if needed
|
|
11
|
-
*/
|
|
12
2
|
export declare function truncateText(text: string, maxLength: number): string;
|
|
13
3
|
//# sourceMappingURL=sanitizer.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"sanitizer.d.ts","sourceRoot":"","sources":["../../src/utils/sanitizer.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"sanitizer.d.ts","sourceRoot":"","sources":["../../src/utils/sanitizer.ts"],"names":[],"mappings":"AAGA,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS,GAAG,MAAM,CAIpE;AAED,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM,CAQpE"}
|
package/dist/utils/sanitizer.js
CHANGED
|
@@ -1,20 +1,12 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
* Returns empty string for null/undefined input
|
|
4
|
-
*/
|
|
1
|
+
// Pre-compiled regex patterns for hot path optimization
|
|
2
|
+
const WHITESPACE_REGEX = /\s+/g;
|
|
5
3
|
export function sanitizeText(text) {
|
|
6
4
|
if (text == null)
|
|
7
5
|
return '';
|
|
8
6
|
if (typeof text !== 'string')
|
|
9
7
|
return String(text);
|
|
10
|
-
return text.replace(
|
|
8
|
+
return text.replace(WHITESPACE_REGEX, ' ').trim();
|
|
11
9
|
}
|
|
12
|
-
/**
|
|
13
|
-
* Truncates text to a maximum length with ellipsis
|
|
14
|
-
* @param text - Text to truncate
|
|
15
|
-
* @param maxLength - Maximum length (must be > 3 to accommodate ellipsis)
|
|
16
|
-
* @returns Truncated text with ellipsis if needed
|
|
17
|
-
*/
|
|
18
10
|
export function truncateText(text, maxLength) {
|
|
19
11
|
if (maxLength < 4) {
|
|
20
12
|
return text.length > 0 ? text.charAt(0) : '';
|
|
@@ -22,6 +14,6 @@ export function truncateText(text, maxLength) {
|
|
|
22
14
|
if (text.length <= maxLength) {
|
|
23
15
|
return text;
|
|
24
16
|
}
|
|
25
|
-
return text.substring(0, maxLength - 3)
|
|
17
|
+
return `${text.substring(0, maxLength - 3)}...`;
|
|
26
18
|
}
|
|
27
19
|
//# sourceMappingURL=sanitizer.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"sanitizer.js","sourceRoot":"","sources":["../../src/utils/sanitizer.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"sanitizer.js","sourceRoot":"","sources":["../../src/utils/sanitizer.ts"],"names":[],"mappings":"AAAA,wDAAwD;AACxD,MAAM,gBAAgB,GAAG,MAAM,CAAC;AAEhC,MAAM,UAAU,YAAY,CAAC,IAA+B;IAC1D,IAAI,IAAI,IAAI,IAAI;QAAE,OAAO,EAAE,CAAC;IAC5B,IAAI,OAAO,IAAI,KAAK,QAAQ;QAAE,OAAO,MAAM,CAAC,IAAI,CAAC,CAAC;IAClD,OAAO,IAAI,CAAC,OAAO,CAAC,gBAAgB,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;AACpD,CAAC;AAED,MAAM,UAAU,YAAY,CAAC,IAAY,EAAE,SAAiB;IAC1D,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;QAClB,OAAO,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAC/C,CAAC;IACD,IAAI,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC7B,OAAO,IAAI,CAAC;IACd,CAAC;IACD,OAAO,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC;AAClD,CAAC"}
|
|
@@ -1,18 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
export type ToolErrorResponse = {
|
|
3
|
-
[x: string]: unknown;
|
|
4
|
-
content: {
|
|
5
|
-
type: 'text';
|
|
6
|
-
text: string;
|
|
7
|
-
}[];
|
|
8
|
-
structuredContent: {
|
|
9
|
-
[x: string]: unknown;
|
|
10
|
-
error: string;
|
|
11
|
-
url: string;
|
|
12
|
-
errorCode: string;
|
|
13
|
-
};
|
|
14
|
-
isError: true;
|
|
15
|
-
};
|
|
1
|
+
import type { ToolErrorResponse } from '../config/types.js';
|
|
16
2
|
export declare function createToolErrorResponse(message: string, url: string, code: string): ToolErrorResponse;
|
|
17
3
|
export declare function handleToolError(error: unknown, url: string, fallbackMessage?: string): ToolErrorResponse;
|
|
18
4
|
//# sourceMappingURL=tool-error-handler.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tool-error-handler.d.ts","sourceRoot":"","sources":["../../src/utils/tool-error-handler.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"tool-error-handler.d.ts","sourceRoot":"","sources":["../../src/utils/tool-error-handler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AAa5D,wBAAgB,uBAAuB,CACrC,OAAO,EAAE,MAAM,EACf,GAAG,EAAE,MAAM,EACX,IAAI,EAAE,MAAM,GACX,iBAAiB,CAOnB;AAED,wBAAgB,eAAe,CAC7B,KAAK,EAAE,OAAO,EACd,GAAG,EAAE,MAAM,EACX,eAAe,SAAqB,GACnC,iBAAiB,CA8CnB"}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { AbortError, AppError, FetchError, RateLimitError, TimeoutError, UrlValidationError, } from '../errors/index.js';
|
|
2
|
+
const isDevelopment = process.env.NODE_ENV === 'development';
|
|
2
3
|
export function createToolErrorResponse(message, url, code) {
|
|
3
4
|
const structuredContent = { error: message, url, errorCode: code };
|
|
4
5
|
return {
|
|
@@ -9,19 +10,46 @@ export function createToolErrorResponse(message, url, code) {
|
|
|
9
10
|
}
|
|
10
11
|
export function handleToolError(error, url, fallbackMessage = 'Operation failed') {
|
|
11
12
|
if (error instanceof UrlValidationError) {
|
|
12
|
-
|
|
13
|
+
const message = isDevelopment
|
|
14
|
+
? `${error.message}\nURL: ${error.url}\nStack: ${error.stack ?? ''}`
|
|
15
|
+
: error.message;
|
|
16
|
+
return createToolErrorResponse(message, url, 'INVALID_URL');
|
|
17
|
+
}
|
|
18
|
+
if (error instanceof AbortError) {
|
|
19
|
+
const message = isDevelopment
|
|
20
|
+
? `Request aborted${error.reason ? `: ${error.reason}` : ''}\n${error.stack ?? ''}`
|
|
21
|
+
: `Request aborted${error.reason ? `: ${error.reason}` : ''}`;
|
|
22
|
+
return createToolErrorResponse(message, url, 'ABORTED');
|
|
13
23
|
}
|
|
14
24
|
if (error instanceof TimeoutError) {
|
|
15
|
-
|
|
25
|
+
const message = isDevelopment
|
|
26
|
+
? `Request timed out after ${error.timeoutMs}ms\n${error.stack ?? ''}`
|
|
27
|
+
: `Request timed out after ${error.timeoutMs}ms`;
|
|
28
|
+
return createToolErrorResponse(message, url, 'TIMEOUT');
|
|
29
|
+
}
|
|
30
|
+
if (error instanceof RateLimitError) {
|
|
31
|
+
const message = isDevelopment
|
|
32
|
+
? `Rate limited. Retry after ${error.retryAfter}s\n${error.stack ?? ''}`
|
|
33
|
+
: `Rate limited. Retry after ${error.retryAfter}s`;
|
|
34
|
+
return createToolErrorResponse(message, url, 'RATE_LIMITED');
|
|
16
35
|
}
|
|
17
36
|
if (error instanceof FetchError) {
|
|
18
37
|
const code = error.httpStatus ? `HTTP_${error.httpStatus}` : 'FETCH_ERROR';
|
|
19
|
-
|
|
38
|
+
const message = isDevelopment
|
|
39
|
+
? `${error.message}\n${error.stack ?? ''}`
|
|
40
|
+
: error.message;
|
|
41
|
+
return createToolErrorResponse(message, url, code);
|
|
20
42
|
}
|
|
21
43
|
if (error instanceof AppError) {
|
|
22
|
-
|
|
44
|
+
const message = isDevelopment
|
|
45
|
+
? `${error.message}\n${error.stack ?? ''}`
|
|
46
|
+
: error.message;
|
|
47
|
+
return createToolErrorResponse(message, url, error.code);
|
|
23
48
|
}
|
|
24
49
|
const message = error instanceof Error ? error.message : 'Unknown error';
|
|
25
|
-
|
|
50
|
+
const fullMessage = isDevelopment && error instanceof Error
|
|
51
|
+
? `${fallbackMessage}: ${message}\n${error.stack ?? ''}`
|
|
52
|
+
: `${fallbackMessage}: ${message}`;
|
|
53
|
+
return createToolErrorResponse(fullMessage, url, 'UNKNOWN_ERROR');
|
|
26
54
|
}
|
|
27
55
|
//# sourceMappingURL=tool-error-handler.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tool-error-handler.js","sourceRoot":"","sources":["../../src/utils/tool-error-handler.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"tool-error-handler.js","sourceRoot":"","sources":["../../src/utils/tool-error-handler.ts"],"names":[],"mappings":"AAEA,OAAO,EACL,UAAU,EACV,QAAQ,EACR,UAAU,EACV,cAAc,EACd,YAAY,EACZ,kBAAkB,GACnB,MAAM,oBAAoB,CAAC;AAE5B,MAAM,aAAa,GAAG,OAAO,CAAC,GAAG,CAAC,QAAQ,KAAK,aAAa,CAAC;AAE7D,MAAM,UAAU,uBAAuB,CACrC,OAAe,EACf,GAAW,EACX,IAAY;IAEZ,MAAM,iBAAiB,GAAG,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;IACnE,OAAO;QACL,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,iBAAiB,CAAC,EAAE,CAAC;QACpE,iBAAiB;QACjB,OAAO,EAAE,IAAI;KACd,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,eAAe,CAC7B,KAAc,EACd,GAAW,EACX,eAAe,GAAG,kBAAkB;IAEpC,IAAI,KAAK,YAAY,kBAAkB,EAAE,CAAC;QACxC,MAAM,OAAO,GAAG,aAAa;YAC3B,CAAC,CAAC,GAAG,KAAK,CAAC,OAAO,UAAU,KAAK,CAAC,GAAG,YAAY,KAAK,CAAC,KAAK,IAAI,EAAE,EAAE;YACpE,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;QAClB,OAAO,uBAAuB,CAAC,OAAO,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;IAC9D,CAAC;IACD,IAAI,KAAK,YAAY,UAAU,EAAE,CAAC;QAChC,MAAM,OAAO,GAAG,aAAa;YAC3B,CAAC,CAAC,kBAAkB,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,KAAK,CAAC,KAAK,IAAI,EAAE,EAAE;YACnF,CAAC,CAAC,kBAAkB,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;QAChE,OAAO,uBAAuB,CAAC,OAAO,EAAE,GAAG,EAAE,SAAS,CAAC,CAAC;IAC1D,CAAC;IACD,IAAI,KAAK,YAAY,YAAY,EAAE,CAAC;QAClC,MAAM,OAAO,GAAG,aAAa;YAC3B,CAAC,CAAC,2BAA2B,KAAK,CAAC,SAAS,OAAO,KAAK,CAAC,KAAK,IAAI,EAAE,EAAE;YACtE,CAAC,CAAC,2BAA2B,KAAK,CAAC,SAAS,IAAI,CAAC;QACnD,OAAO,uBAAuB,CAAC,OAAO,EAAE,GAAG,EAAE,SAAS,CAAC,CAAC;IAC1D,CAAC;IACD,IAAI,KAAK,YAAY,cAAc,EAAE,CAAC;QACpC,MAAM,OAAO,GAAG,aAAa;YAC3B,CAAC,CAAC,6BAA6B,KAAK,CAAC,UAAU,MAAM,KAAK,CAAC,KAAK,IAAI,EAAE,EAAE;YACxE,CAAC,CAAC,6BAA6B,KAAK,CAAC,UAAU,GAAG,CAAC;QACrD,OAAO,uBAAuB,CAAC,OAAO,EAAE,GAAG,EAAE,cAAc,CAAC,CAAC;IAC/D,CAAC;IACD,IAAI,KAAK,YAAY,UAAU,EAAE,CAAC;QAChC,MAAM,IAAI,GAAG,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,QAAQ,KAAK,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC,aAAa,CAAC;QAC3E,MAAM,OAAO,GAAG,aAAa;YAC3B,CAAC,CAAC,GAAG,KAAK,CAAC,OAAO,KAAK,KAAK,CAAC,KA
AK,IAAI,EAAE,EAAE;YAC1C,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;QAClB,OAAO,uBAAuB,CAAC,OAAO,EAAE,GAAG,EAAE,IAAI,CAAC,CAAC;IACrD,CAAC;IACD,IAAI,KAAK,YAAY,QAAQ,EAAE,CAAC;QAC9B,MAAM,OAAO,GAAG,aAAa;YAC3B,CAAC,CAAC,GAAG,KAAK,CAAC,OAAO,KAAK,KAAK,CAAC,KAAK,IAAI,EAAE,EAAE;YAC1C,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;QAClB,OAAO,uBAAuB,CAAC,OAAO,EAAE,GAAG,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;IAC3D,CAAC;IAED,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;IACzE,MAAM,WAAW,GACf,aAAa,IAAI,KAAK,YAAY,KAAK;QACrC,CAAC,CAAC,GAAG,eAAe,KAAK,OAAO,KAAK,KAAK,CAAC,KAAK,IAAI,EAAE,EAAE;QACxD,CAAC,CAAC,GAAG,eAAe,KAAK,OAAO,EAAE,CAAC;IAEvC,OAAO,uBAAuB,CAAC,WAAW,EAAE,GAAG,EAAE,eAAe,CAAC,CAAC;AACpE,CAAC"}
|
|
@@ -1,11 +1,3 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Validates and normalizes a URL, blocking SSRF attack vectors
|
|
3
|
-
* @throws {ValidationError} if URL is empty or too long
|
|
4
|
-
* @throws {UrlValidationError} if URL is invalid or blocked
|
|
5
|
-
*/
|
|
6
1
|
export declare function validateAndNormalizeUrl(urlString: string): string;
|
|
7
|
-
/**
|
|
8
|
-
* Checks if a URL is internal (same domain)
|
|
9
|
-
*/
|
|
10
2
|
export declare function isInternalUrl(url: string, baseUrl: string): boolean;
|
|
11
3
|
//# sourceMappingURL=url-validator.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"url-validator.d.ts","sourceRoot":"","sources":["../../src/utils/url-validator.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"url-validator.d.ts","sourceRoot":"","sources":["../../src/utils/url-validator.ts"],"names":[],"mappings":"AAmCA,wBAAgB,uBAAuB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CA2EjE;AAED,wBAAgB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,OAAO,CAQnE"}
|
|
@@ -1,44 +1,33 @@
|
|
|
1
1
|
import { UrlValidationError, ValidationError } from '../errors/app-error.js';
|
|
2
|
-
// Maximum URL length to prevent DoS attacks
|
|
3
2
|
const MAX_URL_LENGTH = 2048;
|
|
4
|
-
// Blocked hosts to prevent SSRF attacks
|
|
5
3
|
const BLOCKED_HOSTS = new Set([
|
|
6
4
|
'localhost',
|
|
7
5
|
'127.0.0.1',
|
|
8
6
|
'0.0.0.0',
|
|
9
7
|
'::1',
|
|
10
|
-
'169.254.169.254',
|
|
11
|
-
'metadata.google.internal',
|
|
12
|
-
'metadata.azure.com',
|
|
13
|
-
'100.100.100.200',
|
|
14
|
-
'instance-data',
|
|
8
|
+
'169.254.169.254',
|
|
9
|
+
'metadata.google.internal',
|
|
10
|
+
'metadata.azure.com',
|
|
11
|
+
'100.100.100.200',
|
|
12
|
+
'instance-data',
|
|
15
13
|
]);
|
|
16
|
-
// Blocked IP patterns (private networks)
|
|
17
14
|
const BLOCKED_IP_PATTERNS = [
|
|
18
|
-
/^10\./,
|
|
19
|
-
/^172\.(1[6-9]|2\d|3[01])\./,
|
|
20
|
-
/^192\.168\./,
|
|
21
|
-
/^127\./,
|
|
22
|
-
/^0\./,
|
|
23
|
-
/^169\.254\./,
|
|
24
|
-
/^fc00:/i,
|
|
25
|
-
/^fe80:/i,
|
|
26
|
-
/^::ffff:127\./,
|
|
27
|
-
/^::ffff:10\./,
|
|
28
|
-
/^::ffff:172\.(1[6-9]|2\d|3[01])\./,
|
|
29
|
-
/^::ffff:192\.168\./,
|
|
15
|
+
/^10\./,
|
|
16
|
+
/^172\.(1[6-9]|2\d|3[01])\./,
|
|
17
|
+
/^192\.168\./,
|
|
18
|
+
/^127\./,
|
|
19
|
+
/^0\./,
|
|
20
|
+
/^169\.254\./,
|
|
21
|
+
/^fc00:/i,
|
|
22
|
+
/^fe80:/i,
|
|
23
|
+
/^::ffff:127\./,
|
|
24
|
+
/^::ffff:10\./,
|
|
25
|
+
/^::ffff:172\.(1[6-9]|2\d|3[01])\./,
|
|
26
|
+
/^::ffff:192\.168\./,
|
|
30
27
|
];
|
|
31
|
-
/**
|
|
32
|
-
* Checks if a hostname matches blocked IP patterns
|
|
33
|
-
*/
|
|
34
28
|
function isBlockedIp(hostname) {
|
|
35
29
|
return BLOCKED_IP_PATTERNS.some((pattern) => pattern.test(hostname));
|
|
36
30
|
}
|
|
37
|
-
/**
|
|
38
|
-
* Validates and normalizes a URL, blocking SSRF attack vectors
|
|
39
|
-
* @throws {ValidationError} if URL is empty or too long
|
|
40
|
-
* @throws {UrlValidationError} if URL is invalid or blocked
|
|
41
|
-
*/
|
|
42
31
|
export function validateAndNormalizeUrl(urlString) {
|
|
43
32
|
// Check for empty or whitespace-only input
|
|
44
33
|
if (!urlString || typeof urlString !== 'string') {
|
|
@@ -86,9 +75,6 @@ export function validateAndNormalizeUrl(urlString) {
|
|
|
86
75
|
}
|
|
87
76
|
return url.href;
|
|
88
77
|
}
|
|
89
|
-
/**
|
|
90
|
-
* Checks if a URL is internal (same domain)
|
|
91
|
-
*/
|
|
92
78
|
export function isInternalUrl(url, baseUrl) {
|
|
93
79
|
try {
|
|
94
80
|
const urlObj = new URL(url, baseUrl);
|