mcp-docs-scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +357 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/server.d.ts +6 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +231 -0
- package/dist/server.js.map +1 -0
- package/dist/services/cache-manager.d.ts +100 -0
- package/dist/services/cache-manager.d.ts.map +1 -0
- package/dist/services/cache-manager.js +212 -0
- package/dist/services/cache-manager.js.map +1 -0
- package/dist/services/content-cleaner.d.ts +48 -0
- package/dist/services/content-cleaner.d.ts.map +1 -0
- package/dist/services/content-cleaner.js +295 -0
- package/dist/services/content-cleaner.js.map +1 -0
- package/dist/services/github-detector.d.ts +49 -0
- package/dist/services/github-detector.d.ts.map +1 -0
- package/dist/services/github-detector.js +276 -0
- package/dist/services/github-detector.js.map +1 -0
- package/dist/services/github-fetcher.d.ts +94 -0
- package/dist/services/github-fetcher.d.ts.map +1 -0
- package/dist/services/github-fetcher.js +393 -0
- package/dist/services/github-fetcher.js.map +1 -0
- package/dist/services/search-index.d.ts +106 -0
- package/dist/services/search-index.d.ts.map +1 -0
- package/dist/services/search-index.js +210 -0
- package/dist/services/search-index.js.map +1 -0
- package/dist/services/web-scraper.d.ts +88 -0
- package/dist/services/web-scraper.d.ts.map +1 -0
- package/dist/services/web-scraper.js +244 -0
- package/dist/services/web-scraper.js.map +1 -0
- package/dist/tools/clear-cache.d.ts +24 -0
- package/dist/tools/clear-cache.d.ts.map +1 -0
- package/dist/tools/clear-cache.js +29 -0
- package/dist/tools/clear-cache.js.map +1 -0
- package/dist/tools/detect-github.d.ts +21 -0
- package/dist/tools/detect-github.d.ts.map +1 -0
- package/dist/tools/detect-github.js +18 -0
- package/dist/tools/detect-github.js.map +1 -0
- package/dist/tools/get-content.d.ts +43 -0
- package/dist/tools/get-content.d.ts.map +1 -0
- package/dist/tools/get-content.js +84 -0
- package/dist/tools/get-content.js.map +1 -0
- package/dist/tools/get-tree.d.ts +31 -0
- package/dist/tools/get-tree.d.ts.map +1 -0
- package/dist/tools/get-tree.js +102 -0
- package/dist/tools/get-tree.js.map +1 -0
- package/dist/tools/index-docs.d.ts +63 -0
- package/dist/tools/index-docs.d.ts.map +1 -0
- package/dist/tools/index-docs.js +371 -0
- package/dist/tools/index-docs.js.map +1 -0
- package/dist/tools/index.d.ts +11 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +11 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/tools/list-cached.d.ts +19 -0
- package/dist/tools/list-cached.d.ts.map +1 -0
- package/dist/tools/list-cached.js +20 -0
- package/dist/tools/list-cached.js.map +1 -0
- package/dist/tools/search-docs.d.ts +31 -0
- package/dist/tools/search-docs.d.ts.map +1 -0
- package/dist/tools/search-docs.js +64 -0
- package/dist/tools/search-docs.js.map +1 -0
- package/dist/types/cache.d.ts +53 -0
- package/dist/types/cache.d.ts.map +1 -0
- package/dist/types/cache.js +2 -0
- package/dist/types/cache.js.map +1 -0
- package/dist/types/errors.d.ts +102 -0
- package/dist/types/errors.d.ts.map +1 -0
- package/dist/types/errors.js +216 -0
- package/dist/types/errors.js.map +1 -0
- package/dist/types/index.d.ts +6 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +5 -0
- package/dist/types/index.js.map +1 -0
- package/dist/utils/fs.d.ts +45 -0
- package/dist/utils/fs.d.ts.map +1 -0
- package/dist/utils/fs.js +113 -0
- package/dist/utils/fs.js.map +1 -0
- package/dist/utils/rate-limit.d.ts +55 -0
- package/dist/utils/rate-limit.d.ts.map +1 -0
- package/dist/utils/rate-limit.js +89 -0
- package/dist/utils/rate-limit.js.map +1 -0
- package/dist/utils/url.d.ts +69 -0
- package/dist/utils/url.d.ts.map +1 -0
- package/dist/utils/url.js +251 -0
- package/dist/utils/url.js.map +1 -0
- package/package.json +58 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rate-limit.js","sourceRoot":"","sources":["../../src/utils/rate-limit.ts"],"names":[],"mappings":"AAAA;;GAEG;AAaH;;GAEG;AACH,MAAM,OAAO,gBAAgB;IACnB,IAAI,GAAyB,IAAI,CAAC;IAE1C;;OAEG;IACH,iBAAiB,CAAC,OAAgB;QAChC,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;QACvD,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;QAE/C,IAAI,KAAK,IAAI,SAAS,IAAI,KAAK,EAAE,CAAC;YAChC,IAAI,CAAC,IAAI,GAAG;gBACV,KAAK,EAAE,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC;gBAC1B,SAAS,EAAE,QAAQ,CAAC,SAAS,EAAE,EAAE,CAAC;gBAClC,KAAK,EAAE,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC;gBAC1B,SAAS,EAAE,IAAI,IAAI,EAAE;aACtB,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,OAAO;QACL,OAAO,IAAI,CAAC,IAAI,CAAC;IACnB,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,YAAoB,CAAC;QACzB,IAAI,CAAC,IAAI,CAAC,IAAI;YAAE,OAAO,KAAK,CAAC;QAC7B,OAAO,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;IACzC,CAAC;IAED;;OAEG;IACH,WAAW;QACT,IAAI,CAAC,IAAI,CAAC,IAAI;YAAE,OAAO,KAAK,CAAC;QAC7B,OAAO,IAAI,CAAC,IAAI,CAAC,SAAS,IAAI,CAAC,CAAC;IAClC,CAAC;IAED;;;OAGG;IACH,iBAAiB;QACf,IAAI,CAAC,IAAI,CAAC,IAAI;YAAE,OAAO,CAAC,CAAC;QACzB,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC,gBAAgB;QAC1D,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,GAAG,GAAG,CAAC,CAAC;IACtC,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YACf,OAAO,+BAA+B,CAAC;QACzC,CAAC;QAED,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,iBAAiB,EAAE,GAAG,KAAK,CAAC,CAAC;QAEtE,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;YACvB,OAAO,mCAAmC,iBAAiB,WAAW,CAAC;QACzE,CAAC;QAED,IAAI,IAAI,CAAC,KAAK,EAAE,EAAE,CAAC;YACjB,OAAO,mBAAmB,IAAI,CAAC,IAAI,CAAC,SAAS,IAAI,IAAI,CAAC,IAAI,CAAC,KAAK,yBAAyB,iBAAiB,WAAW,CAAC;QACxH,CAAC;QAED,OAAO,eAAe,IAAI,CAAC,IAAI,CAAC,SAAS,IAAI,IAAI,CAAC,IAAI,CAAC,KAAK,YAAY,CAAC;IAC3E,CAAC;IAED;;;OAGG;IACH,YAAY;QACV,IAAI,CAAC,IAAI,CAAC,IAAI;YAAE,OAAO,SAAS,CAAC;QACjC,OAAO,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC;IAC1C,CAAC;CACF;AAED;;GAEG;AACH,MAAM,CAAC,MAAM,eAAe,GAAG,IAAI
,gBAAgB,EAAE,CAAC"}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL utilities for web scraping.
|
|
3
|
+
*
|
|
4
|
+
* Provides URL normalization, domain extraction, and link filtering.
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Normalizes a URL by removing fragments, tracking params, and normalizing format.
|
|
8
|
+
*
|
|
9
|
+
* @param url The URL to normalize
|
|
10
|
+
* @returns Normalized URL string
|
|
11
|
+
*/
|
|
12
|
+
export declare function normalizeUrl(url: string): string;
|
|
13
|
+
/**
|
|
14
|
+
* Extracts the domain (host) from a URL.
|
|
15
|
+
*
|
|
16
|
+
* @param url The URL to extract domain from
|
|
17
|
+
* @returns Domain string or null if invalid
|
|
18
|
+
*/
|
|
19
|
+
export declare function extractDomain(url: string): string | null;
|
|
20
|
+
/**
|
|
21
|
+
* Checks if two URLs are on the same domain.
|
|
22
|
+
*
|
|
23
|
+
* @param url1 First URL
|
|
24
|
+
* @param url2 Second URL
|
|
25
|
+
* @returns True if same domain
|
|
26
|
+
*/
|
|
27
|
+
export declare function isSameDomain(url1: string, url2: string): boolean;
|
|
28
|
+
/**
|
|
29
|
+
* Resolves a potentially relative URL against a base URL.
|
|
30
|
+
*
|
|
31
|
+
* @param href The href to resolve (may be relative or absolute)
|
|
32
|
+
* @param baseUrl The base URL to resolve against
|
|
33
|
+
* @returns Absolute URL string or null if invalid
|
|
34
|
+
*/
|
|
35
|
+
export declare function resolveUrl(href: string, baseUrl: string): string | null;
|
|
36
|
+
/**
|
|
37
|
+
* Checks if a URL should be crawled based on various criteria.
|
|
38
|
+
*
|
|
39
|
+
* @param url The URL to check
|
|
40
|
+
* @param baseUrl The base URL of the crawl
|
|
41
|
+
* @returns Object with isValid boolean and reason if invalid
|
|
42
|
+
*/
|
|
43
|
+
export declare function shouldCrawl(url: string, baseUrl: string): {
|
|
44
|
+
isValid: boolean;
|
|
45
|
+
reason?: string;
|
|
46
|
+
};
|
|
47
|
+
/**
|
|
48
|
+
* Extracts all links from an HTML document.
|
|
49
|
+
*
|
|
50
|
+
* @param html The HTML content
|
|
51
|
+
* @param baseUrl The base URL for resolving relative links
|
|
52
|
+
* @returns Array of absolute URLs found in the document
|
|
53
|
+
*/
|
|
54
|
+
export declare function extractLinks(html: string, baseUrl: string): string[];
|
|
55
|
+
/**
|
|
56
|
+
* Gets the path depth of a URL (number of path segments).
|
|
57
|
+
*
|
|
58
|
+
* @param url The URL to analyze
|
|
59
|
+
* @returns Number of path segments
|
|
60
|
+
*/
|
|
61
|
+
export declare function getPathDepth(url: string): number;
|
|
62
|
+
/**
|
|
63
|
+
* Converts a URL to a safe filename for caching.
|
|
64
|
+
*
|
|
65
|
+
* @param url The URL to convert
|
|
66
|
+
* @returns Safe filename string
|
|
67
|
+
*/
|
|
68
|
+
export declare function urlToFilename(url: string): string;
|
|
69
|
+
//# sourceMappingURL=url.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"url.d.ts","sourceRoot":"","sources":["../../src/utils/url.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAmBH;;;;;GAKG;AACH,wBAAgB,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAgChD;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAOxD;AAED;;;;;;GAMG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CAWhE;AAED;;;;;;GAMG;AACH,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAYvE;AAED;;;;;;GAMG;AACH,wBAAgB,WAAW,CACzB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,MAAM,GACd;IAAE,OAAO,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,CAwCvC;AAED;;;;;;GAMG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAoCpE;AAED;;;;;GAKG;AACH,wBAAgB,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAQhD;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CA+BjD"}
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL utilities for web scraping.
|
|
3
|
+
*
|
|
4
|
+
* Provides URL normalization, domain extraction, and link filtering.
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Query-string parameters that carry tracking/analytics state only and can
 * be stripped without changing which resource a URL identifies.
 */
const TRACKING_PARAMS = [
    "utm_source",
    "utm_medium",
    "utm_campaign",
    "utm_term",
    "utm_content",
    "ref",
    "source",
    "fbclid",
    "gclid",
    "msclkid",
    "_ga",
];
/**
 * Normalizes a URL by removing fragments, tracking params, and normalizing format.
 *
 * @param url The URL to normalize
 * @returns Normalized URL string (returned unchanged if `url` is not parseable)
 */
export function normalizeUrl(url) {
    try {
        const parsed = new URL(url);
        // Remove fragment — it never changes the fetched resource.
        parsed.hash = "";
        // Remove tracking parameters
        for (const param of TRACKING_PARAMS) {
            parsed.searchParams.delete(param);
        }
        // Sort remaining search params so equivalent URLs compare equal.
        parsed.searchParams.sort();
        // Normalize path - remove trailing slash except for root
        if (parsed.pathname.length > 1 && parsed.pathname.endsWith("/")) {
            parsed.pathname = parsed.pathname.slice(0, -1);
        }
        // Decode URL-encoded characters in pathname for readability.
        // BUG FIX: skip the decode when the path contains "%2F" (encoded
        // slash) or "%25" (encoded percent). Decoding those is not undone by
        // the URL serializer, so e.g. "a%2Fb" would silently become the
        // two-segment path "a/b" — a different URL.
        if (!/%2f|%25/i.test(parsed.pathname)) {
            try {
                parsed.pathname = decodeURIComponent(parsed.pathname);
            }
            catch {
                // Keep as-is if decoding fails (malformed escape sequence).
            }
        }
        return parsed.href;
    }
    catch {
        // Return as-is if URL is invalid
        return url;
    }
}
|
|
57
|
+
/**
 * Extracts the domain (host) from a URL.
 *
 * @param url The URL to extract domain from
 * @returns Domain string (hostname, without port) or null if invalid
 */
export function extractDomain(url) {
    try {
        return new URL(url).hostname;
    }
    catch {
        return null;
    }
}
|
|
72
|
+
/**
 * Checks if two URLs are on the same domain, ignoring any leading "www."
 * prefix on either host.
 *
 * @param url1 First URL
 * @param url2 Second URL
 * @returns True if same domain; false if either URL is invalid
 */
export function isSameDomain(url1, url2) {
    let host1 = null;
    let host2 = null;
    try {
        host1 = new URL(url1).hostname;
    }
    catch {
        // left null — handled by the guard below
    }
    try {
        host2 = new URL(url2).hostname;
    }
    catch {
        // left null — handled by the guard below
    }
    if (!host1 || !host2) {
        return false;
    }
    // "www.example.com" and "example.com" count as the same domain.
    const stripWww = (host) => host.replace(/^www\./, "");
    return stripWww(host1) === stripWww(host2);
}
|
|
89
|
+
/**
 * Resolves a potentially relative URL against a base URL.
 *
 * @param href The href to resolve (may be relative or absolute)
 * @param baseUrl The base URL to resolve against
 * @returns Absolute URL string or null if invalid
 */
export function resolveUrl(href, baseUrl) {
    try {
        const isProtocolRelative = href.startsWith("//");
        // Protocol-relative links ("//host/path") adopt the base URL's scheme;
        // everything else resolves directly against the base.
        const target = isProtocolRelative
            ? new URL(new URL(baseUrl).protocol + href)
            : new URL(href, baseUrl);
        return target.href;
    }
    catch {
        return null;
    }
}
|
|
109
|
+
/**
 * Path patterns that never lead to crawlable documentation content
 * (auth pages, admin areas, binary/media/asset files).
 *
 * Hoisted to module scope so the twelve RegExp objects are built once
 * instead of on every shouldCrawl() call. None of them use the /g flag,
 * so sharing them is safe: test() is stateless without /g.
 */
const SKIP_PATTERNS = [
    /\/api\//i,
    /\/auth\//i,
    /\/login/i,
    /\/logout/i,
    /\/signup/i,
    /\/register/i,
    /\/admin/i,
    /\/cdn-cgi\//i,
    /\.(pdf|zip|tar|gz|exe|dmg|pkg|deb|rpm)$/i,
    /\.(png|jpg|jpeg|gif|svg|ico|webp)$/i,
    /\.(css|js|json|xml|rss|atom)$/i,
    /\.(mp3|mp4|avi|mov|wmv|flv|webm)$/i,
];
/**
 * Checks if a URL should be crawled based on various criteria.
 *
 * @param url The URL to check
 * @param baseUrl The base URL of the crawl
 * @returns Object with isValid boolean and reason if invalid
 */
export function shouldCrawl(url, baseUrl) {
    try {
        const parsed = new URL(url);
        // Only HTTP/HTTPS
        if (!["http:", "https:"].includes(parsed.protocol)) {
            return { isValid: false, reason: "non-http protocol" };
        }
        // Must be same domain as the crawl root.
        if (!isSameDomain(url, baseUrl)) {
            return { isValid: false, reason: "external domain" };
        }
        // Skip common non-content paths
        for (const pattern of SKIP_PATTERNS) {
            if (pattern.test(parsed.pathname)) {
                return { isValid: false, reason: "non-content path" };
            }
        }
        return { isValid: true };
    }
    catch {
        return { isValid: false, reason: "invalid URL" };
    }
}
|
|
153
|
+
/**
 * Extracts all links from an HTML document.
 *
 * @param html The HTML content
 * @param baseUrl The base URL for resolving relative links
 * @returns Deduplicated array of absolute, normalized, crawlable URLs,
 *          in order of first appearance
 */
export function extractLinks(html, baseUrl) {
    const seen = new Set();
    // Pull href attributes with a regex; faster than building a full DOM
    // when only the links are needed.
    for (const [, href] of html.matchAll(/href=["']([^"']+)["']/gi)) {
        // Ignore in-page anchors and non-navigable schemes.
        if (href.startsWith("#") ||
            href.startsWith("javascript:") ||
            href.startsWith("mailto:") ||
            href.startsWith("tel:")) {
            continue;
        }
        const absolute = resolveUrl(href, baseUrl);
        if (absolute === null) {
            continue;
        }
        const canonical = normalizeUrl(absolute);
        if (!seen.has(canonical) && shouldCrawl(canonical, baseUrl).isValid) {
            seen.add(canonical);
        }
    }
    // Set iteration preserves insertion order, so this matches the order
    // in which links first appear in the document.
    return [...seen];
}
|
|
189
|
+
/**
 * Gets the path depth of a URL (number of path segments).
 *
 * @param url The URL to analyze
 * @returns Number of path segments (0 for the root path or an invalid URL)
 */
export function getPathDepth(url) {
    try {
        const { pathname } = new URL(url);
        // Splitting on "/" produces empty strings around slashes; drop them
        // so only real segments are counted.
        return pathname.split("/").filter((segment) => segment.length > 0).length;
    }
    catch {
        return 0;
    }
}
|
|
205
|
+
/**
 * Converts a URL to a safe filename for caching.
 *
 * @param url The URL to convert
 * @returns Safe filename string (always ends in ".md"; "page.md" for invalid URLs)
 */
export function urlToFilename(url) {
    try {
        const parsed = new URL(url);
        // Base the name on the path; append a short hash of the query string
        // so distinct queries map to distinct files.
        const querySuffix = parsed.search ? `_${simpleHash(parsed.search)}` : "";
        const sanitized = `${parsed.pathname}${querySuffix}`
            .replace(/^\//, "") // drop leading slash
            .replace(/\//g, "_") // path separators -> underscores
            .replace(/[^a-zA-Z0-9_.-]/g, "_") // anything unsafe -> underscore
            .replace(/_+/g, "_") // collapse runs of underscores
            .slice(0, 200); // keep filenames bounded
        if (sanitized.endsWith(".md")) {
            return sanitized;
        }
        return `${sanitized || "index"}.md`;
    }
    catch {
        return "page.md";
    }
}
|
|
239
|
+
/**
 * Simple string hash for deduplication.
 *
 * Multiply-by-31 rolling hash over UTF-16 code units, truncated to a
 * signed 32-bit integer and rendered in base 36.
 */
function simpleHash(str) {
    let acc = 0;
    for (let i = 0; i < str.length; i += 1) {
        // (acc << 5) - acc === acc * 31; "| 0" truncates to a signed
        // 32-bit integer, exactly like the "hash & hash" idiom.
        acc = ((acc << 5) - acc + str.charCodeAt(i)) | 0;
    }
    return Math.abs(acc).toString(36);
}
|
|
251
|
+
//# sourceMappingURL=url.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"url.js","sourceRoot":"","sources":["../../src/utils/url.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;GAEG;AACH,MAAM,eAAe,GAAG;IACtB,YAAY;IACZ,YAAY;IACZ,cAAc;IACd,UAAU;IACV,aAAa;IACb,KAAK;IACL,QAAQ;IACR,QAAQ;IACR,OAAO;IACP,SAAS;IACT,KAAK;CACN,CAAC;AAEF;;;;;GAKG;AACH,MAAM,UAAU,YAAY,CAAC,GAAW;IACtC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAE5B,kBAAkB;QAClB,MAAM,CAAC,IAAI,GAAG,EAAE,CAAC;QAEjB,6BAA6B;QAC7B,KAAK,MAAM,KAAK,IAAI,eAAe,EAAE,CAAC;YACpC,MAAM,CAAC,YAAY,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACpC,CAAC;QAED,+CAA+C;QAC/C,MAAM,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC;QAE3B,yDAAyD;QACzD,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAChE,MAAM,CAAC,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QACjD,CAAC;QAED,4DAA4D;QAC5D,IAAI,CAAC;YACH,MAAM,CAAC,QAAQ,GAAG,kBAAkB,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QACxD,CAAC;QAAC,MAAM,CAAC;YACP,+BAA+B;QACjC,CAAC;QAED,OAAO,MAAM,CAAC,IAAI,CAAC;IACrB,CAAC;IAAC,MAAM,CAAC;QACP,iCAAiC;QACjC,OAAO,GAAG,CAAC;IACb,CAAC;AACH,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAAC,GAAW;IACvC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,OAAO,MAAM,CAAC,QAAQ,CAAC;IACzB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,YAAY,CAAC,IAAY,EAAE,IAAY;IACrD,MAAM,OAAO,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;IACpC,MAAM,OAAO,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;IAEpC,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;QACzB,OAAO,KAAK,CAAC;IACf,CAAC;IAED,+BAA+B;IAC/B,MAAM,SAAS,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IACzD,OAAO,SAAS,CAAC,OAAO,CAAC,KAAK,SAAS,CAAC,OAAO,CAAC,CAAC;AACnD,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,UAAU,CAAC,IAAY,EAAE,OAAe;IACtD,IAAI,CAAC;QACH,gCAAgC;QAChC,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,CAAC;YAC9B,OAAO,IAAI,GAAG,CAAC,GAAG,IAAI,CAAC,QAAQ,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC;QACjD,CAAC;QAED,OAAO,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;
IACrC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,WAAW,CACzB,GAAW,EACX,OAAe;IAEf,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAE5B,kBAAkB;QAClB,IAAI,CAAC,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC;YACnD,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,mBAAmB,EAAE,CAAC;QACzD,CAAC;QAED,sBAAsB;QACtB,IAAI,CAAC,YAAY,CAAC,GAAG,EAAE,OAAO,CAAC,EAAE,CAAC;YAChC,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,iBAAiB,EAAE,CAAC;QACvD,CAAC;QAED,gCAAgC;QAChC,MAAM,YAAY,GAAG;YACnB,UAAU;YACV,WAAW;YACX,UAAU;YACV,WAAW;YACX,WAAW;YACX,aAAa;YACb,UAAU;YACV,cAAc;YACd,0CAA0C;YAC1C,qCAAqC;YACrC,gCAAgC;YAChC,oCAAoC;SACrC,CAAC;QAEF,KAAK,MAAM,OAAO,IAAI,YAAY,EAAE,CAAC;YACnC,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAClC,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,kBAAkB,EAAE,CAAC;YACxD,CAAC;QACH,CAAC;QAED,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;IAC3B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,aAAa,EAAE,CAAC;IACnD,CAAC;AACH,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,YAAY,CAAC,IAAY,EAAE,OAAe;IACxD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAE/B,0CAA0C;IAC1C,2DAA2D;IAC3D,MAAM,SAAS,GAAG,yBAAyB,CAAC;IAC5C,IAAI,KAAK,CAAC;IAEV,OAAO,CAAC,KAAK,GAAG,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC/C,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAEtB,wCAAwC;QACxC,IACE,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YACpB,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;YAC9B,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;YAC1B,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,EACvB,CAAC;YACD,SAAS;QACX,CAAC;QAED,MAAM,QAAQ,GAAG,UAAU,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QAC3C,IAAI,QAAQ,EAAE,CAAC;YACb,MAAM,UAAU,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;YAE1C,qCAAqC;YACrC,MAAM,EAAE,OAAO,EAAE,GAAG,WAAW,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;YACrD,IAAI,OAAO,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;gBACrC,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;gBACrB,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YACzB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,YAAY,CAAC,GAAW;IAC
tC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC5D,OAAO,QAAQ,CAAC,MAAM,CAAC;IACzB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,CAAC,CAAC;IACX,CAAC;AACH,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAAC,GAAW;IACvC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAE5B,uBAAuB;QACvB,IAAI,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;QAE/B,oCAAoC;QACpC,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;YAClB,MAAM,IAAI,GAAG,UAAU,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;YACvC,QAAQ,IAAI,IAAI,IAAI,EAAE,CAAC;QACzB,CAAC;QAED,wBAAwB;QACxB,QAAQ,GAAG,QAAQ;aAChB,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,uBAAuB;aAC1C,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,mCAAmC;aACvD,OAAO,CAAC,kBAAkB,EAAE,GAAG,CAAC,CAAC,wBAAwB;aACzD,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,gCAAgC;aACpD,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,eAAe;QAEjC,6CAA6C;QAC7C,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;YAC9B,QAAQ,GAAG,QAAQ,IAAI,OAAO,CAAC;YAC/B,QAAQ,IAAI,KAAK,CAAC;QACpB,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,UAAU,CAAC,GAAW;IAC7B,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,MAAM,IAAI,GAAG,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAC/B,IAAI,GAAG,CAAC,IAAI,IAAI,CAAC,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC;QACjC,IAAI,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC,2BAA2B;IACjD,CAAC;IACD,OAAO,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;AACrC,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "mcp-docs-scraper",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "An MCP server that gives coding agents fast, efficient access to library documentation",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/index.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"mcp-docs-scraper": "dist/index.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"dist",
|
|
12
|
+
"README.md",
|
|
13
|
+
"LICENSE"
|
|
14
|
+
],
|
|
15
|
+
"keywords": [
|
|
16
|
+
"mcp",
|
|
17
|
+
"model-context-protocol",
|
|
18
|
+
"documentation",
|
|
19
|
+
"scraper",
|
|
20
|
+
"ai",
|
|
21
|
+
"llm",
|
|
22
|
+
"claude",
|
|
23
|
+
"cursor",
|
|
24
|
+
"github",
|
|
25
|
+
"web-scraper"
|
|
26
|
+
],
|
|
27
|
+
"author": "Kuba Kwiecien <kwiscion@gmail.com>",
|
|
28
|
+
"license": "MIT",
|
|
29
|
+
"repository": {
|
|
30
|
+
"type": "git",
|
|
31
|
+
"url": "https://github.com/kwiscion/mcp-docs-scraper.git"
|
|
32
|
+
},
|
|
33
|
+
"homepage": "https://github.com/kwiscion/mcp-docs-scraper#readme",
|
|
34
|
+
"bugs": {
|
|
35
|
+
"url": "https://github.com/kwiscion/mcp-docs-scraper/issues"
|
|
36
|
+
},
|
|
37
|
+
"engines": {
|
|
38
|
+
"node": ">=22.0.0"
|
|
39
|
+
},
|
|
40
|
+
"dependencies": {
|
|
41
|
+
"@modelcontextprotocol/sdk": "^1.25.0",
|
|
42
|
+
"cheerio": "^1.1.2",
|
|
43
|
+
"minisearch": "^7.2.0",
|
|
44
|
+
"turndown": "^7.2.2",
|
|
45
|
+
"zod": "^4.3.5"
|
|
46
|
+
},
|
|
47
|
+
"devDependencies": {
|
|
48
|
+
"@types/node": "^22.0.0",
|
|
49
|
+
"@types/turndown": "^5.0.6",
|
|
50
|
+
"tsx": "^4.21.0",
|
|
51
|
+
"typescript": "^5.9.0"
|
|
52
|
+
},
|
|
53
|
+
"scripts": {
|
|
54
|
+
"build": "tsc",
|
|
55
|
+
"start": "node dist/index.js",
|
|
56
|
+
"dev": "tsx src/index.ts"
|
|
57
|
+
}
|
|
58
|
+
}
|