mcp-docs-scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +357 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/server.d.ts +6 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +231 -0
- package/dist/server.js.map +1 -0
- package/dist/services/cache-manager.d.ts +100 -0
- package/dist/services/cache-manager.d.ts.map +1 -0
- package/dist/services/cache-manager.js +212 -0
- package/dist/services/cache-manager.js.map +1 -0
- package/dist/services/content-cleaner.d.ts +48 -0
- package/dist/services/content-cleaner.d.ts.map +1 -0
- package/dist/services/content-cleaner.js +295 -0
- package/dist/services/content-cleaner.js.map +1 -0
- package/dist/services/github-detector.d.ts +49 -0
- package/dist/services/github-detector.d.ts.map +1 -0
- package/dist/services/github-detector.js +276 -0
- package/dist/services/github-detector.js.map +1 -0
- package/dist/services/github-fetcher.d.ts +94 -0
- package/dist/services/github-fetcher.d.ts.map +1 -0
- package/dist/services/github-fetcher.js +393 -0
- package/dist/services/github-fetcher.js.map +1 -0
- package/dist/services/search-index.d.ts +106 -0
- package/dist/services/search-index.d.ts.map +1 -0
- package/dist/services/search-index.js +210 -0
- package/dist/services/search-index.js.map +1 -0
- package/dist/services/web-scraper.d.ts +88 -0
- package/dist/services/web-scraper.d.ts.map +1 -0
- package/dist/services/web-scraper.js +244 -0
- package/dist/services/web-scraper.js.map +1 -0
- package/dist/tools/clear-cache.d.ts +24 -0
- package/dist/tools/clear-cache.d.ts.map +1 -0
- package/dist/tools/clear-cache.js +29 -0
- package/dist/tools/clear-cache.js.map +1 -0
- package/dist/tools/detect-github.d.ts +21 -0
- package/dist/tools/detect-github.d.ts.map +1 -0
- package/dist/tools/detect-github.js +18 -0
- package/dist/tools/detect-github.js.map +1 -0
- package/dist/tools/get-content.d.ts +43 -0
- package/dist/tools/get-content.d.ts.map +1 -0
- package/dist/tools/get-content.js +84 -0
- package/dist/tools/get-content.js.map +1 -0
- package/dist/tools/get-tree.d.ts +31 -0
- package/dist/tools/get-tree.d.ts.map +1 -0
- package/dist/tools/get-tree.js +102 -0
- package/dist/tools/get-tree.js.map +1 -0
- package/dist/tools/index-docs.d.ts +63 -0
- package/dist/tools/index-docs.d.ts.map +1 -0
- package/dist/tools/index-docs.js +371 -0
- package/dist/tools/index-docs.js.map +1 -0
- package/dist/tools/index.d.ts +11 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +11 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/tools/list-cached.d.ts +19 -0
- package/dist/tools/list-cached.d.ts.map +1 -0
- package/dist/tools/list-cached.js +20 -0
- package/dist/tools/list-cached.js.map +1 -0
- package/dist/tools/search-docs.d.ts +31 -0
- package/dist/tools/search-docs.d.ts.map +1 -0
- package/dist/tools/search-docs.js +64 -0
- package/dist/tools/search-docs.js.map +1 -0
- package/dist/types/cache.d.ts +53 -0
- package/dist/types/cache.d.ts.map +1 -0
- package/dist/types/cache.js +2 -0
- package/dist/types/cache.js.map +1 -0
- package/dist/types/errors.d.ts +102 -0
- package/dist/types/errors.d.ts.map +1 -0
- package/dist/types/errors.js +216 -0
- package/dist/types/errors.js.map +1 -0
- package/dist/types/index.d.ts +6 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +5 -0
- package/dist/types/index.js.map +1 -0
- package/dist/utils/fs.d.ts +45 -0
- package/dist/utils/fs.d.ts.map +1 -0
- package/dist/utils/fs.js +113 -0
- package/dist/utils/fs.js.map +1 -0
- package/dist/utils/rate-limit.d.ts +55 -0
- package/dist/utils/rate-limit.d.ts.map +1 -0
- package/dist/utils/rate-limit.js +89 -0
- package/dist/utils/rate-limit.js.map +1 -0
- package/dist/utils/url.d.ts +69 -0
- package/dist/utils/url.d.ts.map +1 -0
- package/dist/utils/url.js +251 -0
- package/dist/utils/url.js.map +1 -0
- package/package.json +58 -0
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
/**
 * Search index service - Full-text search using MiniSearch.
 *
 * Indexes documentation content for fast searching.
 * Index is built during docs indexing and stored in cache.
 */
import MiniSearch from "minisearch";
// Serialization format version; SearchIndex.fromJSON rejects payloads whose
// stored version differs from this value.
const INDEX_VERSION = 1;
// Snippet configuration: total excerpt length in characters, and how many
// characters of leading context to keep before the first matched term.
const SNIPPET_LENGTH = 150;
const SNIPPET_CONTEXT = 50;
|
|
13
|
+
/**
 * Full-text search index over documentation pages, backed by MiniSearch.
 *
 * Pages are indexed by title, headings, and body content, with title matches
 * boosted highest. The raw text of each document is kept in a side Map so
 * that contextual snippets can be generated at query time.
 */
export class SearchIndex {
    miniSearch;
    documents;
    constructor() {
        this.documents = new Map();
        this.miniSearch = new MiniSearch({
            fields: ["title", "headings", "content"],
            storeFields: ["title"],
            searchOptions: {
                boost: { title: 3, headings: 2, content: 1 },
                fuzzy: 0.2,
                prefix: true,
            },
        });
    }
    /**
     * Adds a single document to the index.
     */
    addDocument(doc) {
        // Keep the raw text around so search() can build snippets later.
        this.documents.set(doc.id, { title: doc.title, content: doc.content });
        this.miniSearch.add(doc);
    }
    /**
     * Adds a batch of documents to the index.
     */
    addDocuments(docs) {
        docs.forEach((doc) => this.addDocument(doc));
    }
    /**
     * Runs a query and returns up to `limit` results, each with a snippet.
     * A blank or whitespace-only query yields no results.
     */
    search(query, limit = 10) {
        if (query.trim() === "") {
            return [];
        }
        // MiniSearch returns every hit; trim the list down to the limit.
        const hits = this.miniSearch.search(query);
        return hits.slice(0, limit).map((hit) => this.toSearchResult(hit, query));
    }
    /**
     * Maps a raw MiniSearch hit onto our SearchResult shape.
     */
    toSearchResult(result, query) {
        const stored = this.documents.get(result.id);
        return {
            path: result.id,
            title: stored?.title || result.id,
            snippet: this.generateSnippet(stored?.content || "", query),
            score: result.score,
        };
    }
    /**
     * Builds a short plain-text excerpt of `content` anchored near the
     * earliest occurrence of a query term, with ellipses when truncated.
     */
    generateSnippet(content, query) {
        if (!content) {
            return "";
        }
        const haystack = content.toLowerCase();
        // Pick the earliest-occurring query term as the snippet anchor
        // (earlier position => higher score). Terms under 2 chars are ignored.
        let anchor = 0;
        let anchorScore = 0;
        for (const term of query.toLowerCase().split(/\s+/)) {
            if (term.length < 2) {
                continue;
            }
            const at = haystack.indexOf(term);
            if (at === -1) {
                continue;
            }
            const score = 1 / (at + 1);
            if (score > anchorScore) {
                anchorScore = score;
                anchor = at;
            }
        }
        // Window around the anchor, clamped to the content bounds.
        const from = Math.max(0, anchor - SNIPPET_CONTEXT);
        const to = Math.min(content.length, from + SNIPPET_LENGTH);
        // Collapse newlines and runs of whitespace into single spaces.
        const body = content
            .slice(from, to)
            .replace(/\n+/g, " ")
            .replace(/\s+/g, " ")
            .trim();
        const prefix = from > 0 ? "..." : "";
        const suffix = to < content.length ? "..." : "";
        return prefix + body + suffix;
    }
    /**
     * Number of documents currently held in the index.
     */
    get documentCount() {
        return this.documents.size;
    }
    /**
     * Serializes the full index (engine state + raw documents) to a JSON
     * string. The documents Map is emitted as an entries array, since Maps
     * are not directly JSON-serializable.
     */
    toJSON() {
        return JSON.stringify({
            index: this.miniSearch.toJSON(),
            documents: [...this.documents.entries()],
            version: INDEX_VERSION,
        });
    }
    /**
     * Reconstructs a SearchIndex from a string produced by toJSON().
     * Throws when the payload was written under a different INDEX_VERSION.
     */
    static fromJSON(json) {
        const parsed = JSON.parse(json);
        if (parsed.version !== INDEX_VERSION) {
            throw new Error(`Search index version mismatch: expected ${INDEX_VERSION}, got ${parsed.version}`);
        }
        const instance = new SearchIndex();
        // Rehydrate the engine with the same options it was built with.
        instance.miniSearch = MiniSearch.loadJSON(JSON.stringify(parsed.index), {
            fields: ["title", "headings", "content"],
            storeFields: ["title"],
            searchOptions: {
                boost: { title: 3, headings: 2, content: 1 },
                fuzzy: 0.2,
                prefix: true,
            },
        });
        // Restore the snippet-source documents from the entries array.
        instance.documents = new Map(parsed.documents);
        return instance;
    }
    /**
     * Removes every document from the index.
     */
    clear() {
        this.miniSearch.removeAll();
        this.documents.clear();
    }
}
|
|
176
|
+
/**
 * Pulls a document title out of markdown content.
 * Returns the trimmed text of the first level-1 heading ("# ..."), or
 * undefined when the content has no H1.
 */
export function extractTitle(content) {
    const h1 = /^#\s+(.+)$/m.exec(content);
    if (h1 === null) {
        return undefined;
    }
    return h1[1].trim();
}
|
|
184
|
+
/**
 * Collects every markdown heading (H1 through H6) from content.
 * Returns the trimmed heading texts joined by single spaces; an empty
 * string when there are no headings.
 */
export function extractHeadings(content) {
    const matches = content.matchAll(/^#{1,6}\s+(.+)$/gm);
    return Array.from(matches, (m) => m[1].trim()).join(" ");
}
|
|
197
|
+
/**
 * Builds a document suitable for SearchIndex.addDocument from a file's
 * path and raw markdown content. The title falls back to the path's last
 * segment, then to the full path, when the content has no H1 heading.
 */
export function createIndexableDocument(path, content) {
    const basename = path.split("/").pop();
    return {
        id: path,
        title: extractTitle(content) || basename || path,
        headings: extractHeadings(content),
        content,
    };
}
|
|
210
|
+
//# sourceMappingURL=search-index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search-index.js","sourceRoot":"","sources":["../../src/services/search-index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,UAAgD,MAAM,YAAY,CAAC;AA0C1E,wBAAwB;AACxB,MAAM,aAAa,GAAG,CAAC,CAAC;AAExB,wBAAwB;AACxB,MAAM,cAAc,GAAG,GAAG,CAAC;AAC3B,MAAM,eAAe,GAAG,EAAE,CAAC;AAE3B;;GAEG;AACH,MAAM,OAAO,WAAW;IACd,UAAU,CAAgC;IAC1C,SAAS,CAAkD;IAEnE;QACE,IAAI,CAAC,UAAU,GAAG,IAAI,UAAU,CAAoB;YAClD,MAAM,EAAE,CAAC,OAAO,EAAE,UAAU,EAAE,SAAS,CAAC;YACxC,WAAW,EAAE,CAAC,OAAO,CAAC;YACtB,aAAa,EAAE;gBACb,KAAK,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE;gBAC5C,KAAK,EAAE,GAAG;gBACV,MAAM,EAAE,IAAI;aACb;SACF,CAAC,CAAC;QACH,IAAI,CAAC,SAAS,GAAG,IAAI,GAAG,EAAE,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,WAAW,CAAC,GAAsB;QAChC,wCAAwC;QACxC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE;YACzB,KAAK,EAAE,GAAG,CAAC,KAAK;YAChB,OAAO,EAAE,GAAG,CAAC,OAAO;SACrB,CAAC,CAAC;QAEH,sBAAsB;QACtB,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,YAAY,CAAC,IAAyB;QACpC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,KAAa,EAAE,QAAgB,EAAE;QACtC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;YAClB,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,6DAA6D;QAC7D,MAAM,OAAO,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAE9C,OAAO,OAAO;aACX,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC;aACf,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,IAAI,CAAC,cAAc,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC;IACzD,CAAC;IAED;;OAEG;IACK,cAAc,CACpB,MAAwB,EACxB,KAAa;QAEb,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAC1C,MAAM,KAAK,GAAG,GAAG,EAAE,KAAK,IAAI,MAAM,CAAC,EAAE,CAAC;QACtC,MAAM,OAAO,GAAG,GAAG,EAAE,OAAO,IAAI,EAAE,CAAC;QAEnC,OAAO;YACL,IAAI,EAAE,MAAM,CAAC,EAAE;YACf,KAAK;YACL,OAAO,EAAE,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,KAAK,CAAC;YAC7C,KAAK,EAAE,MAAM,CAAC,KAAK;SACpB,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,eAAe,CAAC,OAAe,EAAE,KAAa;QACpD,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,8CAA8C;QAC9C,MAAM,UAAU,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC,KAAK,CAA
C,KAAK,CAAC,CAAC;QACpD,MAAM,YAAY,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;QAE3C,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,iEAAiE;QACjE,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;YAC9B,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;gBAAE,SAAS;YAE9B,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YACvC,IAAI,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;gBACf,4CAA4C;gBAC5C,MAAM,KAAK,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC;gBAC5B,IAAI,KAAK,GAAG,SAAS,EAAE,CAAC;oBACtB,SAAS,GAAG,KAAK,CAAC;oBAClB,YAAY,GAAG,GAAG,CAAC;gBACrB,CAAC;YACH,CAAC;QACH,CAAC;QAED,+BAA+B;QAC/B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,YAAY,GAAG,eAAe,CAAC,CAAC;QAC1D,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,EAAE,KAAK,GAAG,cAAc,CAAC,CAAC;QAE7D,kBAAkB;QAClB,IAAI,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;QAExC,mBAAmB;QACnB,OAAO,GAAG,OAAO;aACd,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,+BAA+B;aACpD,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,uBAAuB;aAC5C,IAAI,EAAE,CAAC;QAEV,yBAAyB;QACzB,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACd,OAAO,GAAG,KAAK,GAAG,OAAO,CAAC;QAC5B,CAAC;QACD,IAAI,GAAG,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;YACzB,OAAO,GAAG,OAAO,GAAG,KAAK,CAAC;QAC5B,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACH,IAAI,aAAa;QACf,OAAO,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,MAAM;QACJ,MAAM,UAAU,GAA0B;YACxC,KAAK,EAAE,IAAI,CAAC,UAAU,CAAC,MAAM,EAAE;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,OAAO,EAAE,aAAa;SACvB,CAAC;QAEF,8CAA8C;QAC9C,OAAO,IAAI,CAAC,SAAS,CAAC;YACpB,GAAG,UAAU;YACb,SAAS,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,CAAC;SAChD,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,QAAQ,CAAC,IAAY;QAC1B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAEhC,gBAAgB;QAChB,IAAI,MAAM,CAAC,OAAO,KAAK,aAAa,EAAE,CAAC;YACrC,MAAM,IAAI,KAAK,CACb,2CAA2C,aAAa,SAAS,MAAM,CAAC,OAAO,EAAE,CAClF,CAAC;QACJ,CAAC;QAED,MAAM,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;QAEtC,2BAA2B;QAC3B,WAAW,CAAC,UAAU,GAAG,UAAU,CAAC,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE;YACzE,MAAM,EAAE,CAAC,OAAO,EAAE,UAAU,EAAE,SAAS,CAAC;YACxC,WAAW,EAAE,CAAC,OAAO,CAAC;YACtB,aAAa
,EAAE;gBACb,KAAK,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE;gBAC5C,KAAK,EAAE,GAAG;gBACV,MAAM,EAAE,IAAI;aACb;SACF,CAAC,CAAC;QAEH,wBAAwB;QACxB,WAAW,CAAC,SAAS,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAElD,OAAO,WAAW,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,KAAK;QACH,IAAI,CAAC,UAAU,CAAC,SAAS,EAAE,CAAC;QAC5B,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;IACzB,CAAC;CACF;AAED;;;GAGG;AACH,MAAM,UAAU,YAAY,CAAC,OAAe;IAC1C,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IAC3C,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;AAC7C,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,eAAe,CAAC,OAAe;IAC7C,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,KAAK,GAAG,mBAAmB,CAAC;IAClC,IAAI,KAAK,CAAC;IAEV,OAAO,CAAC,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC9C,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;IACjC,CAAC;IAED,OAAO,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC5B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACrC,IAAY,EACZ,OAAe;IAEf,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,IAAI,IAAI,CAAC;IACrE,MAAM,QAAQ,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAE1C,OAAO;QACL,EAAE,EAAE,IAAI;QACR,KAAK;QACL,QAAQ;QACR,OAAO;KACR,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
/**
 * Web Scraper Service - Crawls documentation websites.
 *
 * Responsibilities:
 * - Crawl docs websites with depth limit
 * - Respect robots.txt
 * - Extract and normalize links
 * - Rate limiting between requests
 */
/**
 * Options for the web scraper. All fields are optional; unset fields fall
 * back to the implementation defaults noted below.
 */
export interface ScraperOptions {
    /** Maximum crawl depth (default: 2, hard-capped at 5) */
    maxDepth?: number;
    /** Delay between requests in ms (default: 500; a robots.txt Crawl-delay may raise it) */
    requestDelay?: number;
    /** Maximum pages to crawl (default: 100) */
    maxPages?: number;
    /** Whether to respect robots.txt (default: true) */
    respectRobotsTxt?: boolean;
    /** Custom User-Agent header to send with every request */
    userAgent?: string;
}
/**
 * Result of scraping a single page.
 */
export interface ScrapedPage {
    /** Original URL as it was queued for crawling */
    url: string;
    /** Normalized form of the URL */
    normalizedUrl: string;
    /** Safe filename for caching this page on disk */
    filename: string;
    /** Raw HTML content of the response body */
    html: string;
    /** HTTP status code of the response */
    status: number;
    /** Content-Type header of the response */
    contentType: string;
    /** Crawl depth this page was found at (0 = start URL) */
    depth: number;
    /** Links found on this page (empty when the page was at max depth) */
    links: string[];
}
/**
 * Result of a complete crawl operation.
 */
export interface CrawlResult {
    /** Base URL that was crawled (normalized start URL) */
    baseUrl: string;
    /** All successfully scraped pages */
    pages: ScrapedPage[];
    /** URLs that failed to fetch, with the failure reason */
    failed: Array<{
        url: string;
        reason: string;
    }>;
    /** URLs that were skipped (robots.txt, external domain, etc.) */
    skipped: Array<{
        url: string;
        reason: string;
    }>;
    /** Crawl statistics */
    stats: {
        /** Total URLs discovered, including the start URL */
        totalDiscovered: number;
        /** Pages successfully fetched */
        totalCrawled: number;
        /** Fetches that failed or returned non-HTML */
        totalFailed: number;
        /** URLs skipped by robots.txt or domain filtering */
        totalSkipped: number;
        /** Deepest level actually reached during the crawl */
        maxDepthReached: number;
        /** Wall-clock duration of the crawl in milliseconds */
        durationMs: number;
    };
}
/**
 * Crawls a documentation website starting from a base URL.
 *
 * @param startUrl The URL to start crawling from
 * @param options Scraper options
 * @returns CrawlResult with all scraped pages and statistics
 */
export declare function crawlWebsite(startUrl: string, options?: ScraperOptions): Promise<CrawlResult>;
/**
 * Web scraper singleton for convenience.
 */
export declare const webScraper: {
    crawl: typeof crawlWebsite;
};
|
|
88
|
+
//# sourceMappingURL=web-scraper.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"web-scraper.d.ts","sourceRoot":"","sources":["../../src/services/web-scraper.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AASH;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,+CAA+C;IAC/C,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,kDAAkD;IAClD,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,4CAA4C;IAC5C,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,oDAAoD;IACpD,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,wBAAwB;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,mBAAmB;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,qBAAqB;IACrB,aAAa,EAAE,MAAM,CAAC;IACtB,gCAAgC;IAChC,QAAQ,EAAE,MAAM,CAAC;IACjB,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,uBAAuB;IACvB,MAAM,EAAE,MAAM,CAAC;IACf,0BAA0B;IAC1B,WAAW,EAAE,MAAM,CAAC;IACpB,yCAAyC;IACzC,KAAK,EAAE,MAAM,CAAC;IACd,+BAA+B;IAC/B,KAAK,EAAE,MAAM,EAAE,CAAC;CACjB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,gCAAgC;IAChC,OAAO,EAAE,MAAM,CAAC;IAChB,qCAAqC;IACrC,KAAK,EAAE,WAAW,EAAE,CAAC;IACrB,gCAAgC;IAChC,MAAM,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC/C,0DAA0D;IAC1D,OAAO,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAChD,uBAAuB;IACvB,KAAK,EAAE;QACL,eAAe,EAAE,MAAM,CAAC;QACxB,YAAY,EAAE,MAAM,CAAC;QACrB,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,eAAe,EAAE,MAAM,CAAC;QACxB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;CACH;AA4JD;;;;;;GAMG;AACH,wBAAsB,YAAY,CAChC,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE,cAAmB,GAC3B,OAAO,CAAC,WAAW,CAAC,CAqHtB;AAED;;GAEG;AACH,eAAO,MAAM,UAAU;;CAEtB,CAAC"}
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
/**
 * Web Scraper Service - Crawls documentation websites.
 *
 * Responsibilities:
 * - Crawl docs websites with depth limit
 * - Respect robots.txt
 * - Extract and normalize links
 * - Rate limiting between requests
 */
import { normalizeUrl, extractLinks, isSameDomain, urlToFilename, } from "../utils/url.js";
/**
 * Default scraper options, used for any field the caller leaves unset.
 * Note: requestDelay may be raised at runtime if robots.txt specifies a
 * larger Crawl-delay.
 */
const DEFAULT_OPTIONS = {
    maxDepth: 2,
    requestDelay: 500,
    maxPages: 100,
    respectRobotsTxt: true,
    userAgent: "mcp-docs-scraper/1.0 (documentation indexer)",
};
/**
 * Hard upper bound on crawl depth; a caller-supplied maxDepth is clamped
 * to this value in crawlWebsite.
 */
const MAX_DEPTH = 5;
|
|
25
|
+
/**
 * Downloads and parses /robots.txt for the host of `baseUrl`.
 *
 * Only rule groups whose User-agent matches `userAgent` (substring match)
 * or "*" are honoured. Returns the collected Disallow paths plus an
 * optional crawl delay converted to milliseconds. Any network error,
 * timeout, or missing robots.txt yields an empty rule set (allow all).
 */
async function fetchRobotsTxt(baseUrl, userAgent) {
    const rules = { disallowedPaths: [] };
    try {
        const response = await fetch(new URL("/robots.txt", baseUrl).href, {
            headers: { "User-Agent": userAgent },
            signal: AbortSignal.timeout(5000),
        });
        if (!response.ok) {
            // No robots.txt (or server error): everything is allowed.
            return rules;
        }
        const body = await response.text();
        // Tracks whether the current rule group applies to our user agent.
        let applies = false;
        for (const rawLine of body.split("\n")) {
            const line = rawLine.trim();
            const lower = line.toLowerCase();
            if (lower.startsWith("user-agent:")) {
                const agent = lower.slice("user-agent:".length).trim();
                applies = agent === "*" || userAgent.toLowerCase().includes(agent);
            }
            if (!applies) {
                continue;
            }
            if (lower.startsWith("disallow:")) {
                const path = line.slice("disallow:".length).trim();
                // An empty Disallow value means "allow all" - record nothing.
                if (path) {
                    rules.disallowedPaths.push(path);
                }
            }
            else if (lower.startsWith("crawl-delay:")) {
                const seconds = parseInt(line.slice("crawl-delay:".length).trim(), 10);
                if (!isNaN(seconds)) {
                    rules.crawlDelay = seconds * 1000; // robots.txt value is in seconds
                }
            }
        }
    }
    catch {
        // Fetch/parse failure: fail open and assume everything is allowed.
    }
    return rules;
}
|
|
72
|
+
/**
 * Checks whether a URL's path is blocked by the parsed robots.txt rules.
 *
 * Follows standard robots.txt matching semantics (RFC 9309): plain rules
 * are path-prefix matches, "*" matches any run of characters, and a
 * trailing "$" anchors the rule to the end of the path. Unparseable URLs
 * are treated as allowed.
 *
 * Fixes over the previous version: regex metacharacters in rules are now
 * escaped (previously only "?" was, so "." acted as a wildcard), and
 * wildcard rules are no longer wrongly anchored to the full path - they
 * match as prefixes unless the rule itself ends with "$".
 *
 * @param url Absolute URL to test.
 * @param rules Parsed rules object containing `disallowedPaths`.
 * @returns true when some Disallow rule matches the URL's pathname.
 */
function isDisallowed(url, rules) {
    try {
        const pathname = new URL(url).pathname;
        for (const rule of rules.disallowedPaths) {
            if (rule.includes("*") || rule.endsWith("$")) {
                // Wildcard/anchored rule: build a regex. Escape metacharacters
                // in the literal parts so e.g. "." only matches itself, then
                // expand each "*" to ".*". A trailing "$" anchors the match to
                // the end of the path; otherwise it is a prefix match.
                const anchored = rule.endsWith("$");
                const pattern = (anchored ? rule.slice(0, -1) : rule)
                    .split("*")
                    .map((part) => part.replace(/[.+?^${}()|[\]\\]/g, "\\$&"))
                    .join(".*");
                if (new RegExp("^" + pattern + (anchored ? "$" : "")).test(pathname)) {
                    return true;
                }
            }
            else if (pathname.startsWith(rule)) {
                // Plain rules are simple path-prefix matches.
                return true;
            }
        }
        return false;
    }
    catch {
        // Invalid URL - allow rather than block.
        return false;
    }
}
|
|
97
|
+
/**
 * Fetches one page over HTTP and returns its HTML, status code, and
 * content type. Returns null for network failures, timeouts (10s), or
 * responses that are not HTML documents.
 */
async function fetchPage(url, userAgent) {
    try {
        const response = await fetch(url, {
            headers: {
                "User-Agent": userAgent,
                Accept: "text/html,application/xhtml+xml",
            },
            redirect: "follow",
            signal: AbortSignal.timeout(10000),
        });
        const contentType = response.headers.get("content-type") || "";
        // Skip anything that isn't an HTML document (images, JSON, PDFs, ...).
        const isHtml = contentType.includes("text/html") ||
            contentType.includes("application/xhtml");
        if (!isHtml) {
            return null;
        }
        return {
            html: await response.text(),
            status: response.status,
            contentType,
        };
    }
    catch {
        // Any fetch error (DNS, timeout, abort) marks the page unfetchable.
        return null;
    }
}
|
|
126
|
+
/**
 * Resolves after `ms` milliseconds; used to pace requests between fetches.
 */
function delay(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
132
|
+
/**
 * Crawls a documentation website starting from a base URL.
 *
 * Performs a breadth-first crawl (FIFO queue): same-domain pages are
 * fetched until `maxDepth` levels deep or `maxPages` pages collected,
 * honouring robots.txt Disallow rules and Crawl-delay when
 * `respectRobotsTxt` is set. Progress is logged to stderr.
 *
 * @param startUrl The URL to start crawling from
 * @param options Scraper options (merged over DEFAULT_OPTIONS)
 * @returns CrawlResult with scraped pages, failures, skips, and statistics
 */
export async function crawlWebsite(startUrl, options = {}) {
    const startTime = Date.now();
    // Merge options with defaults. NOTE(review): `options.maxDepth || ...`
    // means a caller-supplied maxDepth of 0 falls back to the default of 2;
    // the result is clamped to MAX_DEPTH.
    const opts = {
        ...DEFAULT_OPTIONS,
        ...options,
        maxDepth: Math.min(options.maxDepth || DEFAULT_OPTIONS.maxDepth, MAX_DEPTH),
    };
    // Normalize starting URL
    const baseUrl = normalizeUrl(startUrl);
    // Initialize result (totalDiscovered starts at 1 for the start URL)
    const result = {
        baseUrl,
        pages: [],
        failed: [],
        skipped: [],
        stats: {
            totalDiscovered: 1,
            totalCrawled: 0,
            totalFailed: 0,
            totalSkipped: 0,
            maxDepthReached: 0,
            durationMs: 0,
        },
    };
    // Fetch robots.txt if needed; a Crawl-delay can only raise, never
    // lower, the configured request delay.
    let robotsRules = { disallowedPaths: [] };
    if (opts.respectRobotsTxt) {
        robotsRules = await fetchRobotsTxt(baseUrl, opts.userAgent);
        if (robotsRules.crawlDelay) {
            opts.requestDelay = Math.max(opts.requestDelay, robotsRules.crawlDelay);
        }
    }
    // Track visited URLs and queue. NOTE(review): dedup is keyed by the raw
    // queued URL string, not its normalized form, so two link spellings of
    // the same page could be fetched twice - verify against extractLinks /
    // normalizeUrl behavior.
    const visited = new Set();
    const queue = [{ url: baseUrl, depth: 0 }];
    while (queue.length > 0 && result.pages.length < opts.maxPages) {
        const current = queue.shift();
        const { url, depth } = current;
        // Skip if already visited
        if (visited.has(url)) {
            continue;
        }
        visited.add(url);
        // Check robots.txt (skipped URLs still count as visited)
        if (opts.respectRobotsTxt && isDisallowed(url, robotsRules)) {
            result.skipped.push({ url, reason: "disallowed by robots.txt" });
            result.stats.totalSkipped++;
            continue;
        }
        // Check same domain - external links are recorded but never fetched
        if (!isSameDomain(url, baseUrl)) {
            result.skipped.push({ url, reason: "external domain" });
            result.stats.totalSkipped++;
            continue;
        }
        // Add delay between requests (except before the first fetch)
        if (result.stats.totalCrawled > 0) {
            await delay(opts.requestDelay);
        }
        // Fetch the page; null covers both network failure and non-HTML
        const pageResult = await fetchPage(url, opts.userAgent);
        if (!pageResult) {
            result.failed.push({ url, reason: "fetch failed or non-HTML content" });
            result.stats.totalFailed++;
            continue;
        }
        // Extract links for further crawling; pages already at max depth
        // skip link extraction entirely
        const links = depth < opts.maxDepth ? extractLinks(pageResult.html, url) : [];
        // Create scraped page
        const page = {
            url,
            normalizedUrl: normalizeUrl(url),
            filename: urlToFilename(url),
            html: pageResult.html,
            status: pageResult.status,
            contentType: pageResult.contentType,
            depth,
            links,
        };
        result.pages.push(page);
        result.stats.totalCrawled++;
        result.stats.maxDepthReached = Math.max(result.stats.maxDepthReached, depth);
        // Enqueue unvisited links one level deeper (FIFO => breadth-first)
        if (depth < opts.maxDepth) {
            for (const link of links) {
                if (!visited.has(link)) {
                    queue.push({ url: link, depth: depth + 1 });
                    result.stats.totalDiscovered++;
                }
            }
        }
        // Log progress to stderr (stdout is reserved for MCP traffic)
        console.error(`[scraper] Crawled ${result.stats.totalCrawled}/${opts.maxPages}: ${url} (depth ${depth})`);
    }
    // Update final stats
    result.stats.durationMs = Date.now() - startTime;
    return result;
}
|
|
238
|
+
/**
 * Web scraper singleton for convenience: exposes crawlWebsite as
 * `webScraper.crawl` for callers that prefer an object-style API.
 */
export const webScraper = {
    crawl: crawlWebsite,
};
//# sourceMappingURL=web-scraper.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"web-scraper.js","sourceRoot":"","sources":["../../src/services/web-scraper.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EACL,YAAY,EACZ,YAAY,EACZ,YAAY,EACZ,aAAa,GACd,MAAM,iBAAiB,CAAC;AAuEzB;;GAEG;AACH,MAAM,eAAe,GAA6B;IAChD,QAAQ,EAAE,CAAC;IACX,YAAY,EAAE,GAAG;IACjB,QAAQ,EAAE,GAAG;IACb,gBAAgB,EAAE,IAAI;IACtB,SAAS,EAAE,8CAA8C;CAC1D,CAAC;AAEF;;GAEG;AACH,MAAM,SAAS,GAAG,CAAC,CAAC;AAEpB;;GAEG;AACH,KAAK,UAAU,cAAc,CAC3B,OAAe,EACf,SAAiB;IAEjB,MAAM,KAAK,GAAgB;QACzB,eAAe,EAAE,EAAE;KACpB,CAAC;IAEF,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;QACvD,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,SAAS,EAAE;YACtC,OAAO,EAAE,EAAE,YAAY,EAAE,SAAS,EAAE;YACpC,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC;SAClC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,OAAO,KAAK,CAAC,CAAC,qCAAqC;QACrD,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAE/B,IAAI,mBAAmB,GAAG,KAAK,CAAC;QAEhC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;YAE1C,6BAA6B;YAC7B,IAAI,OAAO,CAAC,UAAU,CAAC,aAAa,CAAC,EAAE,CAAC;gBACtC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;gBACvC,mBAAmB,GAAG,KAAK,KAAK,GAAG,IAAI,SAAS,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;YACjF,CAAC;YAED,2BAA2B;YAC3B,IAAI,mBAAmB,IAAI,OAAO,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;gBAC3D,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACzC,IAAI,IAAI,EAAE,CAAC;oBACT,KAAK,CAAC,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACnC,CAAC;YACH,CAAC;YAED,8BAA8B;YAC9B,IAAI,mBAAmB,IAAI,OAAO,CAAC,UAAU,CAAC,cAAc,CAAC,EAAE,CAAC;gBAC9D,MAAM,KAAK,GAAG,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;gBACzD,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;oBAClB,KAAK,CAAC,UAAU,GAAG,KAAK,GAAG,IAAI,CAAC,CAAC,gBAAgB;gBACnD,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,iCAAiC;IACnC,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,SAAS,YAAY,
CAAC,GAAW,EAAE,KAAkB;IACnD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;QAEjC,KAAK,MAAM,UAAU,IAAI,KAAK,CAAC,eAAe,EAAE,CAAC;YAC/C,2BAA2B;YAC3B,IAAI,UAAU,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC7B,MAAM,KAAK,GAAG,IAAI,MAAM,CACtB,GAAG,GAAG,UAAU,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,KAAK,CAAC,GAAG,GAAG,CAClE,CAAC;gBACF,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;oBACzB,OAAO,IAAI,CAAC;gBACd,CAAC;YACH,CAAC;iBAAM,IAAI,QAAQ,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;gBAC3C,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,SAAS,CACtB,GAAW,EACX,SAAiB;IAEjB,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAChC,OAAO,EAAE;gBACP,YAAY,EAAE,SAAS;gBACvB,MAAM,EAAE,iCAAiC;aAC1C;YACD,QAAQ,EAAE,QAAQ;YAClB,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,KAAK,CAAC;SACnC,CAAC,CAAC;QAEH,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;QAE/D,4BAA4B;QAC5B,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,mBAAmB,CAAC,EAAE,CAAC;YACrF,OAAO,IAAI,CAAC;QACd,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QAEnC,OAAO;YACL,IAAI;YACJ,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,WAAW;SACZ,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,KAAK,CAAC,EAAU;IACvB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,CAAC;AAC3D,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,QAAgB,EAChB,UAA0B,EAAE;IAE5B,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,8BAA8B;IAC9B,MAAM,IAAI,GAA6B;QACrC,GAAG,eAAe;QAClB,GAAG,OAAO;QACV,QAAQ,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,QAAQ,IAAI,eAAe,CAAC,QAAQ,EAAE,SAAS,CAAC;KAC5E,CAAC;IAEF,yBAAyB;IACzB,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IAEvC,oBAAoB;IACpB,MAAM,MAAM,GAAgB;QAC1B,OAAO;QACP,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,OAAO,EAAE,EAAE;QACX,KAAK,EAAE;YACL,eAAe,EAAE,CAAC;YAClB,YAAY,EAAE,CAAC;YACf,WAAW,EAAE,CAAC;YACd,YAAY,EAA
E,CAAC;YACf,eAAe,EAAE,CAAC;YAClB,UAAU,EAAE,CAAC;SACd;KACF,CAAC;IAEF,6BAA6B;IAC7B,IAAI,WAAW,GAAgB,EAAE,eAAe,EAAE,EAAE,EAAE,CAAC;IACvD,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAC1B,WAAW,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;QAC5D,IAAI,WAAW,CAAC,UAAU,EAAE,CAAC;YAC3B,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,YAAY,EAAE,WAAW,CAAC,UAAU,CAAC,CAAC;QAC1E,CAAC;IACH,CAAC;IAED,+BAA+B;IAC/B,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAClC,MAAM,KAAK,GAA0C,CAAC,EAAE,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC;IAElF,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;QAC/D,MAAM,OAAO,GAAG,KAAK,CAAC,KAAK,EAAG,CAAC;QAC/B,MAAM,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,OAAO,CAAC;QAE/B,0BAA0B;QAC1B,IAAI,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;YACrB,SAAS;QACX,CAAC;QACD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAEjB,mBAAmB;QACnB,IAAI,IAAI,CAAC,gBAAgB,IAAI,YAAY,CAAC,GAAG,EAAE,WAAW,CAAC,EAAE,CAAC;YAC5D,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,0BAA0B,EAAE,CAAC,CAAC;YACjE,MAAM,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC;YAC5B,SAAS;QACX,CAAC;QAED,oBAAoB;QACpB,IAAI,CAAC,YAAY,CAAC,GAAG,EAAE,OAAO,CAAC,EAAE,CAAC;YAChC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,iBAAiB,EAAE,CAAC,CAAC;YACxD,MAAM,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC;YAC5B,SAAS;QACX,CAAC;QAED,4CAA4C;QAC5C,IAAI,MAAM,CAAC,KAAK,CAAC,YAAY,GAAG,CAAC,EAAE,CAAC;YAClC,MAAM,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QACjC,CAAC;QAED,iBAAiB;QACjB,MAAM,UAAU,GAAG,MAAM,SAAS,CAAC,GAAG,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;QAExD,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,kCAAkC,EAAE,CAAC,CAAC;YACxE,MAAM,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC;YAC3B,SAAS;QACX,CAAC;QAED,qCAAqC;QACrC,MAAM,KAAK,GAAG,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,YAAY,CAAC,UAAU,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAE9E,sBAAsB;QACtB,MAAM,IAAI,GAAgB;YACxB,GAAG;YACH,aAAa,EAAE,YAAY,CAAC,GAAG,CAAC;YAChC,QAAQ,EAAE,aAAa,CAAC,GAAG,CAAC;YAC5B,IAAI,EAAE,UAAU,CAAC,IAAI;YACrB,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,WAAW,EAAE,UAAU,CA
AC,WAAW;YACnC,KAAK;YACL,KAAK;SACN,CAAC;QAEF,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxB,MAAM,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC;QAC5B,MAAM,CAAC,KAAK,CAAC,eAAe,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;QAE7E,yBAAyB;QACzB,IAAI,KAAK,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC1B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;gBACzB,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;oBACvB,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;oBAC5C,MAAM,CAAC,KAAK,CAAC,eAAe,EAAE,CAAC;gBACjC,CAAC;YACH,CAAC;QACH,CAAC;QAED,eAAe;QACf,OAAO,CAAC,KAAK,CAAC,qBAAqB,MAAM,CAAC,KAAK,CAAC,YAAY,IAAI,IAAI,CAAC,QAAQ,KAAK,GAAG,WAAW,KAAK,GAAG,CAAC,CAAC;IAC5G,CAAC;IAED,qBAAqB;IACrB,MAAM,CAAC,KAAK,CAAC,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;IAEjD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,MAAM,UAAU,GAAG;IACxB,KAAK,EAAE,YAAY;CACpB,CAAC"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
 * Input type for clear_cache tool.
 *
 * Provide either `docs_id` (remove one entry) or `all` (remove everything).
 * When `all` is true it takes precedence over `docs_id`; when neither is
 * set, nothing is cleared and the call only reports the remaining count.
 */
export interface ClearCacheInput {
    /** Specific docs ID to clear (optional; ignored when `all` is true) */
    docs_id?: string;
    /** Clear all cached docs (default: false) */
    all?: boolean;
}
/**
 * Output type for clear_cache tool.
 */
export interface ClearCacheOutput {
    /** IDs that were cleared (empty when nothing matched) */
    cleared: string[];
    /** Count of cached docs remaining after the operation */
    remaining: number;
}
/**
 * Removes cached documentation.
 * Either clears a specific docs entry by ID, or all entries if `all` is true.
 * An unknown `docs_id` is not an error; it simply yields an empty `cleared`
 * list alongside the unchanged `remaining` count.
 */
export declare function clearCache(input: ClearCacheInput): Promise<ClearCacheOutput>;
//# sourceMappingURL=clear-cache.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"clear-cache.d.ts","sourceRoot":"","sources":["../../src/tools/clear-cache.ts"],"names":[],"mappings":"AAEA;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,2CAA2C;IAC3C,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,6CAA6C;IAC7C,GAAG,CAAC,EAAE,OAAO,CAAC;CACf;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4BAA4B;IAC5B,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,qCAAqC;IACrC,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;;GAGG;AACH,wBAAsB,UAAU,CAAC,KAAK,EAAE,eAAe,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAyBlF"}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { cacheManager } from "../services/cache-manager.js";
/**
 * Removes cached documentation from the cache store.
 * Clears every entry when `all` is set; otherwise clears the single entry
 * matching `docs_id` (if one exists). Returns the removed IDs together with
 * a count of the entries still cached afterwards.
 */
export async function clearCache(input) {
    await cacheManager.initialize();
    const removedIds = [];
    if (input.all) {
        // Wipe the whole cache; clearAll reports which IDs it removed.
        removedIds.push(...(await cacheManager.clearAll()));
    }
    else if (input.docs_id) {
        // Targeted removal: resolve the ID first so we know which source
        // store the entry lives in. A miss is not an error — we just
        // report nothing cleared.
        const entry = await cacheManager.findById(input.docs_id);
        if (entry) {
            await cacheManager.clearEntry(entry.source, entry.id);
            removedIds.push(entry.id);
        }
    }
    // Report how many cached docs survive the operation.
    const entriesLeft = await cacheManager.listEntries();
    return { cleared: removedIds, remaining: entriesLeft.length };
}
//# sourceMappingURL=clear-cache.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"clear-cache.js","sourceRoot":"","sources":["../../src/tools/clear-cache.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,8BAA8B,CAAC;AAsB5D;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,KAAsB;IACrD,MAAM,YAAY,CAAC,UAAU,EAAE,CAAC;IAEhC,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,IAAI,KAAK,CAAC,GAAG,EAAE,CAAC;QACd,mBAAmB;QACnB,MAAM,UAAU,GAAG,MAAM,YAAY,CAAC,QAAQ,EAAE,CAAC;QACjD,OAAO,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;IAC9B,CAAC;SAAM,IAAI,KAAK,CAAC,OAAO,EAAE,CAAC;QACzB,0CAA0C;QAC1C,MAAM,IAAI,GAAG,MAAM,YAAY,CAAC,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACxD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,YAAY,CAAC,UAAU,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,EAAE,CAAC,CAAC;YACpD,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;IAED,0BAA0B;IAC1B,MAAM,SAAS,GAAG,CAAC,MAAM,YAAY,CAAC,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;IAE5D,OAAO;QACL,OAAO;QACP,SAAS;KACV,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * detect_github_repo tool - Finds GitHub repository from a docs website URL.
 */
import { type GitHubDetectionResult } from "../services/github-detector.js";
/**
 * Input for the detect_github_repo tool.
 */
export interface DetectGitHubInput {
    /** Docs website URL to analyze (required; must be non-empty) */
    url: string;
}
/**
 * Output from the detect_github_repo tool.
 * Same as GitHubDetectionResult.
 */
export type DetectGitHubOutput = GitHubDetectionResult;
/**
 * Detects GitHub repository from a documentation website URL.
 *
 * @throws ValidationError (as a rejected promise) when `url` is missing
 *   or empty.
 */
export declare function detectGitHub(input: DetectGitHubInput): Promise<DetectGitHubOutput>;
//# sourceMappingURL=detect-github.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detect-github.d.ts","sourceRoot":"","sources":["../../src/tools/detect-github.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAEL,KAAK,qBAAqB,EAC3B,MAAM,gCAAgC,CAAC;AAGxC;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,kCAAkC;IAClC,GAAG,EAAE,MAAM,CAAC;CACb;AAED;;;GAGG;AACH,MAAM,MAAM,kBAAkB,GAAG,qBAAqB,CAAC;AAEvD;;GAEG;AACH,wBAAsB,YAAY,CAChC,KAAK,EAAE,iBAAiB,GACvB,OAAO,CAAC,kBAAkB,CAAC,CAU7B"}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
 * detect_github_repo tool - Finds GitHub repository from a docs website URL.
 */
import { detectGitHubRepo, } from "../services/github-detector.js";
import { ValidationError } from "../types/errors.js";
/**
 * Resolves the GitHub repository behind a documentation site.
 * Rejects with a ValidationError when the `url` parameter is absent or
 * empty, then delegates the actual lookup to the github-detector service.
 */
export async function detectGitHub(input) {
    // Guard clause: a missing/empty URL is a caller error, not a detection miss.
    if (!input.url) {
        throw new ValidationError("Missing required parameter: url", "url");
    }
    return detectGitHubRepo(input.url);
}
//# sourceMappingURL=detect-github.js.map
|