mcp-docs-scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +357 -0
  3. package/dist/index.d.ts +3 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +20 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/server.d.ts +6 -0
  8. package/dist/server.d.ts.map +1 -0
  9. package/dist/server.js +231 -0
  10. package/dist/server.js.map +1 -0
  11. package/dist/services/cache-manager.d.ts +100 -0
  12. package/dist/services/cache-manager.d.ts.map +1 -0
  13. package/dist/services/cache-manager.js +212 -0
  14. package/dist/services/cache-manager.js.map +1 -0
  15. package/dist/services/content-cleaner.d.ts +48 -0
  16. package/dist/services/content-cleaner.d.ts.map +1 -0
  17. package/dist/services/content-cleaner.js +295 -0
  18. package/dist/services/content-cleaner.js.map +1 -0
  19. package/dist/services/github-detector.d.ts +49 -0
  20. package/dist/services/github-detector.d.ts.map +1 -0
  21. package/dist/services/github-detector.js +276 -0
  22. package/dist/services/github-detector.js.map +1 -0
  23. package/dist/services/github-fetcher.d.ts +94 -0
  24. package/dist/services/github-fetcher.d.ts.map +1 -0
  25. package/dist/services/github-fetcher.js +393 -0
  26. package/dist/services/github-fetcher.js.map +1 -0
  27. package/dist/services/search-index.d.ts +106 -0
  28. package/dist/services/search-index.d.ts.map +1 -0
  29. package/dist/services/search-index.js +210 -0
  30. package/dist/services/search-index.js.map +1 -0
  31. package/dist/services/web-scraper.d.ts +88 -0
  32. package/dist/services/web-scraper.d.ts.map +1 -0
  33. package/dist/services/web-scraper.js +244 -0
  34. package/dist/services/web-scraper.js.map +1 -0
  35. package/dist/tools/clear-cache.d.ts +24 -0
  36. package/dist/tools/clear-cache.d.ts.map +1 -0
  37. package/dist/tools/clear-cache.js +29 -0
  38. package/dist/tools/clear-cache.js.map +1 -0
  39. package/dist/tools/detect-github.d.ts +21 -0
  40. package/dist/tools/detect-github.d.ts.map +1 -0
  41. package/dist/tools/detect-github.js +18 -0
  42. package/dist/tools/detect-github.js.map +1 -0
  43. package/dist/tools/get-content.d.ts +43 -0
  44. package/dist/tools/get-content.d.ts.map +1 -0
  45. package/dist/tools/get-content.js +84 -0
  46. package/dist/tools/get-content.js.map +1 -0
  47. package/dist/tools/get-tree.d.ts +31 -0
  48. package/dist/tools/get-tree.d.ts.map +1 -0
  49. package/dist/tools/get-tree.js +102 -0
  50. package/dist/tools/get-tree.js.map +1 -0
  51. package/dist/tools/index-docs.d.ts +63 -0
  52. package/dist/tools/index-docs.d.ts.map +1 -0
  53. package/dist/tools/index-docs.js +371 -0
  54. package/dist/tools/index-docs.js.map +1 -0
  55. package/dist/tools/index.d.ts +11 -0
  56. package/dist/tools/index.d.ts.map +1 -0
  57. package/dist/tools/index.js +11 -0
  58. package/dist/tools/index.js.map +1 -0
  59. package/dist/tools/list-cached.d.ts +19 -0
  60. package/dist/tools/list-cached.d.ts.map +1 -0
  61. package/dist/tools/list-cached.js +20 -0
  62. package/dist/tools/list-cached.js.map +1 -0
  63. package/dist/tools/search-docs.d.ts +31 -0
  64. package/dist/tools/search-docs.d.ts.map +1 -0
  65. package/dist/tools/search-docs.js +64 -0
  66. package/dist/tools/search-docs.js.map +1 -0
  67. package/dist/types/cache.d.ts +53 -0
  68. package/dist/types/cache.d.ts.map +1 -0
  69. package/dist/types/cache.js +2 -0
  70. package/dist/types/cache.js.map +1 -0
  71. package/dist/types/errors.d.ts +102 -0
  72. package/dist/types/errors.d.ts.map +1 -0
  73. package/dist/types/errors.js +216 -0
  74. package/dist/types/errors.js.map +1 -0
  75. package/dist/types/index.d.ts +6 -0
  76. package/dist/types/index.d.ts.map +1 -0
  77. package/dist/types/index.js +5 -0
  78. package/dist/types/index.js.map +1 -0
  79. package/dist/utils/fs.d.ts +45 -0
  80. package/dist/utils/fs.d.ts.map +1 -0
  81. package/dist/utils/fs.js +113 -0
  82. package/dist/utils/fs.js.map +1 -0
  83. package/dist/utils/rate-limit.d.ts +55 -0
  84. package/dist/utils/rate-limit.d.ts.map +1 -0
  85. package/dist/utils/rate-limit.js +89 -0
  86. package/dist/utils/rate-limit.js.map +1 -0
  87. package/dist/utils/url.d.ts +69 -0
  88. package/dist/utils/url.d.ts.map +1 -0
  89. package/dist/utils/url.js +251 -0
  90. package/dist/utils/url.js.map +1 -0
  91. package/package.json +58 -0
@@ -0,0 +1,210 @@
1
+ /**
2
+ * Search index service - Full-text search using MiniSearch.
3
+ *
4
+ * Indexes documentation content for fast searching.
5
+ * Index is built during docs indexing and stored in cache.
6
+ */
7
+ import MiniSearch from "minisearch";
8
// Serialized index format version; bump whenever the toJSON/fromJSON shape changes
const INDEX_VERSION = 1;
// Snippet configuration
const SNIPPET_LENGTH = 150; // maximum characters in a search-result snippet
const SNIPPET_CONTEXT = 50; // characters of context shown before the best match
13
/**
 * Creates and manages a full-text search index for documentation.
 *
 * Wraps a MiniSearch instance for ranked matching and keeps the raw
 * document text in a Map so result snippets can be generated at query time.
 */
export class SearchIndex {
    miniSearch;
    documents;
    constructor() {
        this.miniSearch = SearchIndex.createMiniSearch();
        this.documents = new Map();
    }
    /**
     * Shared MiniSearch configuration.
     * Centralized so the constructor and fromJSON cannot drift apart.
     */
    static miniSearchOptions() {
        return {
            fields: ["title", "headings", "content"],
            storeFields: ["title"],
            searchOptions: {
                boost: { title: 3, headings: 2, content: 1 },
                fuzzy: 0.2,
                prefix: true,
            },
        };
    }
    /**
     * Builds a fresh MiniSearch instance with the shared configuration.
     */
    static createMiniSearch() {
        return new MiniSearch(SearchIndex.miniSearchOptions());
    }
    /**
     * Adds a document to the index.
     * `doc` must carry `id`, `title`, `headings` and `content` fields
     * (see createIndexableDocument).
     */
    addDocument(doc) {
        // Store document for snippet generation
        this.documents.set(doc.id, {
            title: doc.title,
            content: doc.content,
        });
        // Add to search index
        this.miniSearch.add(doc);
    }
    /**
     * Adds multiple documents to the index.
     */
    addDocuments(docs) {
        for (const doc of docs) {
            this.addDocument(doc);
        }
    }
    /**
     * Searches the index and returns up to `limit` results with snippets.
     * A blank/whitespace-only query returns no results.
     */
    search(query, limit = 10) {
        if (!query.trim()) {
            return [];
        }
        // MiniSearch returns all results, we slice them to the limit
        const results = this.miniSearch.search(query);
        return results
            .slice(0, limit)
            .map((result) => this.toSearchResult(result, query));
    }
    /**
     * Converts a MiniSearch result to our SearchResult format.
     */
    toSearchResult(result, query) {
        const doc = this.documents.get(result.id);
        const title = doc?.title || result.id;
        const content = doc?.content || "";
        return {
            path: result.id,
            title,
            snippet: this.generateSnippet(content, query),
            score: result.score,
        };
    }
    /**
     * Generates a snippet around the earliest occurrence of any query term.
     */
    generateSnippet(content, query) {
        if (!content) {
            return "";
        }
        // Find the best position to start the snippet
        const queryTerms = query.toLowerCase().split(/\s+/);
        const contentLower = content.toLowerCase();
        let bestPosition = 0;
        let bestScore = 0;
        // Search for query terms; the earliest occurrence scores highest
        for (const term of queryTerms) {
            if (term.length < 2)
                continue;
            const pos = contentLower.indexOf(term);
            if (pos !== -1) {
                // Score based on how early the term appears
                const score = 1 / (pos + 1);
                if (score > bestScore) {
                    bestScore = score;
                    bestPosition = pos;
                }
            }
        }
        // Calculate snippet boundaries
        const start = Math.max(0, bestPosition - SNIPPET_CONTEXT);
        const end = Math.min(content.length, start + SNIPPET_LENGTH);
        // Extract snippet and collapse whitespace
        let snippet = content
            .slice(start, end)
            .replace(/\n+/g, " ") // Replace newlines with spaces
            .replace(/\s+/g, " ") // Normalize whitespace
            .trim();
        // Add ellipsis if needed
        if (start > 0) {
            snippet = "..." + snippet;
        }
        if (end < content.length) {
            snippet = snippet + "...";
        }
        return snippet;
    }
    /**
     * Returns the number of indexed documents.
     */
    get documentCount() {
        return this.documents.size;
    }
    /**
     * Serializes the index for storage as a JSON string.
     */
    toJSON() {
        // Maps are not JSON-serializable, so documents are stored as an
        // array of [id, {title, content}] pairs.
        return JSON.stringify({
            index: this.miniSearch.toJSON(),
            documents: Array.from(this.documents.entries()),
            version: INDEX_VERSION,
        });
    }
    /**
     * Creates a SearchIndex from serialized JSON.
     * @throws Error when the serialized version does not match INDEX_VERSION
     */
    static fromJSON(json) {
        const parsed = JSON.parse(json);
        // Check version
        if (parsed.version !== INDEX_VERSION) {
            throw new Error(`Search index version mismatch: expected ${INDEX_VERSION}, got ${parsed.version}`);
        }
        const searchIndex = new SearchIndex();
        // Restore MiniSearch index with the exact configuration it was built with
        searchIndex.miniSearch = MiniSearch.loadJSON(JSON.stringify(parsed.index), SearchIndex.miniSearchOptions());
        // Restore documents map
        searchIndex.documents = new Map(parsed.documents);
        return searchIndex;
    }
    /**
     * Clears the index.
     */
    clear() {
        this.miniSearch.removeAll();
        this.documents.clear();
    }
}
176
/**
 * Extracts a title from markdown content.
 * Returns the first H1 heading, or undefined if none found.
 */
export function extractTitle(content) {
    const h1 = /^#\s+(.+)$/m.exec(content);
    if (h1 === null) {
        return undefined;
    }
    return h1[1].trim();
}
184
/**
 * Extracts all headings from markdown content.
 * Returns a concatenated string of all headings.
 */
export function extractHeadings(content) {
    const matches = content.matchAll(/^#{1,6}\s+(.+)$/gm);
    return Array.from(matches, (m) => m[1].trim()).join(" ");
}
197
/**
 * Creates an indexable document from file content.
 * Falls back to the last path segment (or the full path) when the
 * content has no H1 title.
 */
export function createIndexableDocument(path, content) {
    const fallbackTitle = path.split("/").pop() || path;
    return {
        id: path,
        title: extractTitle(content) || fallbackTitle,
        headings: extractHeadings(content),
        content,
    };
}
210
+ //# sourceMappingURL=search-index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"search-index.js","sourceRoot":"","sources":["../../src/services/search-index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,UAAgD,MAAM,YAAY,CAAC;AA0C1E,wBAAwB;AACxB,MAAM,aAAa,GAAG,CAAC,CAAC;AAExB,wBAAwB;AACxB,MAAM,cAAc,GAAG,GAAG,CAAC;AAC3B,MAAM,eAAe,GAAG,EAAE,CAAC;AAE3B;;GAEG;AACH,MAAM,OAAO,WAAW;IACd,UAAU,CAAgC;IAC1C,SAAS,CAAkD;IAEnE;QACE,IAAI,CAAC,UAAU,GAAG,IAAI,UAAU,CAAoB;YAClD,MAAM,EAAE,CAAC,OAAO,EAAE,UAAU,EAAE,SAAS,CAAC;YACxC,WAAW,EAAE,CAAC,OAAO,CAAC;YACtB,aAAa,EAAE;gBACb,KAAK,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE;gBAC5C,KAAK,EAAE,GAAG;gBACV,MAAM,EAAE,IAAI;aACb;SACF,CAAC,CAAC;QACH,IAAI,CAAC,SAAS,GAAG,IAAI,GAAG,EAAE,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,WAAW,CAAC,GAAsB;QAChC,wCAAwC;QACxC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE;YACzB,KAAK,EAAE,GAAG,CAAC,KAAK;YAChB,OAAO,EAAE,GAAG,CAAC,OAAO;SACrB,CAAC,CAAC;QAEH,sBAAsB;QACtB,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,YAAY,CAAC,IAAyB;QACpC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,KAAa,EAAE,QAAgB,EAAE;QACtC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;YAClB,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,6DAA6D;QAC7D,MAAM,OAAO,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAE9C,OAAO,OAAO;aACX,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC;aACf,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,IAAI,CAAC,cAAc,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC;IACzD,CAAC;IAED;;OAEG;IACK,cAAc,CACpB,MAAwB,EACxB,KAAa;QAEb,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAC1C,MAAM,KAAK,GAAG,GAAG,EAAE,KAAK,IAAI,MAAM,CAAC,EAAE,CAAC;QACtC,MAAM,OAAO,GAAG,GAAG,EAAE,OAAO,IAAI,EAAE,CAAC;QAEnC,OAAO;YACL,IAAI,EAAE,MAAM,CAAC,EAAE;YACf,KAAK;YACL,OAAO,EAAE,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,KAAK,CAAC;YAC7C,KAAK,EAAE,MAAM,CAAC,KAAK;SACpB,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,eAAe,CAAC,OAAe,EAAE,KAAa;QACpD,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,8CAA8C;QAC9C,MAAM,UAAU,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC,KAAK,C
AAC,KAAK,CAAC,CAAC;QACpD,MAAM,YAAY,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;QAE3C,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,iEAAiE;QACjE,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;YAC9B,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;gBAAE,SAAS;YAE9B,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YACvC,IAAI,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;gBACf,4CAA4C;gBAC5C,MAAM,KAAK,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC;gBAC5B,IAAI,KAAK,GAAG,SAAS,EAAE,CAAC;oBACtB,SAAS,GAAG,KAAK,CAAC;oBAClB,YAAY,GAAG,GAAG,CAAC;gBACrB,CAAC;YACH,CAAC;QACH,CAAC;QAED,+BAA+B;QAC/B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,YAAY,GAAG,eAAe,CAAC,CAAC;QAC1D,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,EAAE,KAAK,GAAG,cAAc,CAAC,CAAC;QAE7D,kBAAkB;QAClB,IAAI,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;QAExC,mBAAmB;QACnB,OAAO,GAAG,OAAO;aACd,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,+BAA+B;aACpD,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,uBAAuB;aAC5C,IAAI,EAAE,CAAC;QAEV,yBAAyB;QACzB,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACd,OAAO,GAAG,KAAK,GAAG,OAAO,CAAC;QAC5B,CAAC;QACD,IAAI,GAAG,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;YACzB,OAAO,GAAG,OAAO,GAAG,KAAK,CAAC;QAC5B,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACH,IAAI,aAAa;QACf,OAAO,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,MAAM;QACJ,MAAM,UAAU,GAA0B;YACxC,KAAK,EAAE,IAAI,CAAC,UAAU,CAAC,MAAM,EAAE;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,OAAO,EAAE,aAAa;SACvB,CAAC;QAEF,8CAA8C;QAC9C,OAAO,IAAI,CAAC,SAAS,CAAC;YACpB,GAAG,UAAU;YACb,SAAS,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,CAAC;SAChD,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,QAAQ,CAAC,IAAY;QAC1B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAEhC,gBAAgB;QAChB,IAAI,MAAM,CAAC,OAAO,KAAK,aAAa,EAAE,CAAC;YACrC,MAAM,IAAI,KAAK,CACb,2CAA2C,aAAa,SAAS,MAAM,CAAC,OAAO,EAAE,CAClF,CAAC;QACJ,CAAC;QAED,MAAM,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;QAEtC,2BAA2B;QAC3B,WAAW,CAAC,UAAU,GAAG,UAAU,CAAC,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE;YACzE,MAAM,EAAE,CAAC,OAAO,EAAE,UAAU,EAAE,SAAS,CAAC;YACxC,WAAW,EAAE,CAAC,OAAO,CAAC;YACtB,aA
Aa,EAAE;gBACb,KAAK,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE;gBAC5C,KAAK,EAAE,GAAG;gBACV,MAAM,EAAE,IAAI;aACb;SACF,CAAC,CAAC;QAEH,wBAAwB;QACxB,WAAW,CAAC,SAAS,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAElD,OAAO,WAAW,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,KAAK;QACH,IAAI,CAAC,UAAU,CAAC,SAAS,EAAE,CAAC;QAC5B,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;IACzB,CAAC;CACF;AAED;;;GAGG;AACH,MAAM,UAAU,YAAY,CAAC,OAAe;IAC1C,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IAC3C,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;AAC7C,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,eAAe,CAAC,OAAe;IAC7C,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,KAAK,GAAG,mBAAmB,CAAC;IAClC,IAAI,KAAK,CAAC;IAEV,OAAO,CAAC,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC9C,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;IACjC,CAAC;IAED,OAAO,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC5B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACrC,IAAY,EACZ,OAAe;IAEf,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,IAAI,IAAI,CAAC;IACrE,MAAM,QAAQ,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAE1C,OAAO;QACL,EAAE,EAAE,IAAI;QACR,KAAK;QACL,QAAQ;QACR,OAAO;KACR,CAAC;AACJ,CAAC"}
@@ -0,0 +1,88 @@
1
/**
 * Web Scraper Service - Crawls documentation websites.
 *
 * Responsibilities:
 * - Crawl docs websites with depth limit
 * - Respect robots.txt
 * - Extract and normalize links
 * - Rate limiting between requests
 */
/**
 * Options for the web scraper.
 */
export interface ScraperOptions {
    /** Maximum crawl depth (default: 2, max: 5) */
    maxDepth?: number;
    /** Delay between requests in ms (default: 500); may be raised by a robots.txt Crawl-delay */
    requestDelay?: number;
    /** Maximum pages to crawl (default: 100) */
    maxPages?: number;
    /** Whether to respect robots.txt (default: true) */
    respectRobotsTxt?: boolean;
    /** Custom user agent sent with every request */
    userAgent?: string;
}
/**
 * Result of scraping a single page.
 */
export interface ScrapedPage {
    /** Original URL as it was queued for crawling */
    url: string;
    /** Normalized URL */
    normalizedUrl: string;
    /** Safe filename for caching */
    filename: string;
    /** Raw HTML content */
    html: string;
    /** HTTP status code */
    status: number;
    /** Content type header */
    contentType: string;
    /** Crawl depth this page was found at (0 = start URL) */
    depth: number;
    /** Links found on this page (empty when at max depth) */
    links: string[];
}
/**
 * Result of a complete crawl operation.
 */
export interface CrawlResult {
    /** Base URL that was crawled */
    baseUrl: string;
    /** All successfully scraped pages */
    pages: ScrapedPage[];
    /** URLs that failed to fetch */
    failed: Array<{
        url: string;
        reason: string;
    }>;
    /** URLs that were skipped (robots.txt, external, etc.) */
    skipped: Array<{
        url: string;
        reason: string;
    }>;
    /** Crawl statistics */
    stats: {
        totalDiscovered: number;
        totalCrawled: number;
        totalFailed: number;
        totalSkipped: number;
        maxDepthReached: number;
        durationMs: number;
    };
}
/**
 * Crawls a documentation website starting from a base URL.
 *
 * @param startUrl The URL to start crawling from
 * @param options Scraper options
 * @returns CrawlResult with all scraped pages and statistics
 */
export declare function crawlWebsite(startUrl: string, options?: ScraperOptions): Promise<CrawlResult>;
/**
 * Web scraper singleton for convenience.
 */
export declare const webScraper: {
    crawl: typeof crawlWebsite;
};
//# sourceMappingURL=web-scraper.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"web-scraper.d.ts","sourceRoot":"","sources":["../../src/services/web-scraper.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AASH;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,+CAA+C;IAC/C,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,kDAAkD;IAClD,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,4CAA4C;IAC5C,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,oDAAoD;IACpD,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,wBAAwB;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,mBAAmB;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,qBAAqB;IACrB,aAAa,EAAE,MAAM,CAAC;IACtB,gCAAgC;IAChC,QAAQ,EAAE,MAAM,CAAC;IACjB,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,uBAAuB;IACvB,MAAM,EAAE,MAAM,CAAC;IACf,0BAA0B;IAC1B,WAAW,EAAE,MAAM,CAAC;IACpB,yCAAyC;IACzC,KAAK,EAAE,MAAM,CAAC;IACd,+BAA+B;IAC/B,KAAK,EAAE,MAAM,EAAE,CAAC;CACjB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,gCAAgC;IAChC,OAAO,EAAE,MAAM,CAAC;IAChB,qCAAqC;IACrC,KAAK,EAAE,WAAW,EAAE,CAAC;IACrB,gCAAgC;IAChC,MAAM,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC/C,0DAA0D;IAC1D,OAAO,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAChD,uBAAuB;IACvB,KAAK,EAAE;QACL,eAAe,EAAE,MAAM,CAAC;QACxB,YAAY,EAAE,MAAM,CAAC;QACrB,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,eAAe,EAAE,MAAM,CAAC;QACxB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;CACH;AA4JD;;;;;;GAMG;AACH,wBAAsB,YAAY,CAChC,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE,cAAmB,GAC3B,OAAO,CAAC,WAAW,CAAC,CAqHtB;AAED;;GAEG;AACH,eAAO,MAAM,UAAU;;CAEtB,CAAC"}
@@ -0,0 +1,244 @@
1
+ /**
2
+ * Web Scraper Service - Crawls documentation websites.
3
+ *
4
+ * Responsibilities:
5
+ * - Crawl docs websites with depth limit
6
+ * - Respect robots.txt
7
+ * - Extract and normalize links
8
+ * - Rate limiting between requests
9
+ */
10
+ import { normalizeUrl, extractLinks, isSameDomain, urlToFilename, } from "../utils/url.js";
11
/**
 * Default scraper options.
 * Each value can be overridden per-call via ScraperOptions; requestDelay
 * can additionally be raised by a robots.txt Crawl-delay directive.
 */
const DEFAULT_OPTIONS = {
    maxDepth: 2,
    requestDelay: 500,
    maxPages: 100,
    respectRobotsTxt: true,
    userAgent: "mcp-docs-scraper/1.0 (documentation indexer)",
};
/**
 * Maximum allowed depth.
 * A caller-supplied maxDepth is clamped to this hard ceiling.
 */
const MAX_DEPTH = 5;
25
/**
 * Fetches and parses robots.txt for a domain.
 *
 * Supports User-agent, Disallow and Crawl-delay directives. Per RFC 9309,
 * anything after a "#" on a line is a comment and is stripped before
 * parsing. Any fetch or parse failure yields an empty rule set, i.e.
 * everything is allowed.
 */
async function fetchRobotsTxt(baseUrl, userAgent) {
    const rules = {
        disallowedPaths: [],
    };
    try {
        const robotsUrl = new URL("/robots.txt", baseUrl).href;
        const response = await fetch(robotsUrl, {
            headers: { "User-Agent": userAgent },
            signal: AbortSignal.timeout(5000),
        });
        if (!response.ok) {
            return rules; // No robots.txt or error - allow all
        }
        const text = await response.text();
        const lines = text.split("\n");
        let isRelevantUserAgent = false;
        for (const rawLine of lines) {
            // Strip trailing comments (RFC 9309) before parsing
            const line = rawLine.split("#")[0];
            const trimmed = line.trim().toLowerCase();
            // Check user-agent directive
            if (trimmed.startsWith("user-agent:")) {
                const agent = trimmed.slice(11).trim();
                // An empty agent token is malformed; don't let it match everything
                // (userAgent.includes("") is always true).
                isRelevantUserAgent =
                    agent.length > 0 &&
                        (agent === "*" || userAgent.toLowerCase().includes(agent));
            }
            // Parse disallow directive (use `line`, not `trimmed`, to keep path case)
            if (isRelevantUserAgent && trimmed.startsWith("disallow:")) {
                const path = line.trim().slice(9).trim();
                if (path) {
                    rules.disallowedPaths.push(path);
                }
            }
            // Parse crawl-delay directive (value is in seconds)
            if (isRelevantUserAgent && trimmed.startsWith("crawl-delay:")) {
                const delay = parseInt(line.trim().slice(12).trim(), 10);
                if (!isNaN(delay)) {
                    rules.crawlDelay = delay * 1000; // Convert to ms
                }
            }
        }
    }
    catch {
        // Ignore errors - assume allowed
    }
    return rules;
}
72
/**
 * Checks if a URL is disallowed by robots.txt rules.
 *
 * Patterns without a wildcard are prefix matches (RFC 9309). Patterns
 * containing "*" match any character run; a trailing "$" explicitly
 * anchors the match to the end of the path. All other regex
 * metacharacters in the pattern are escaped so literal dots, pluses,
 * etc. cannot accidentally widen the match. The previous implementation
 * appended "$" unconditionally, so e.g. "/a*" + "/b" style rules failed
 * to block longer matching paths. Unparseable URLs are treated as allowed.
 */
function isDisallowed(url, rules) {
    try {
        const parsed = new URL(url);
        const pathname = parsed.pathname;
        for (const disallowed of rules.disallowedPaths) {
            if (disallowed.includes("*") || disallowed.endsWith("$")) {
                // Honor an explicit end-of-path anchor; otherwise prefix-match
                const anchored = disallowed.endsWith("$");
                const body = anchored ? disallowed.slice(0, -1) : disallowed;
                const pattern = body
                    .replace(/[.+?^${}()|[\]\\]/g, "\\$&") // escape regex metacharacters
                    .replace(/\*/g, ".*"); // robots.txt wildcard
                const regex = new RegExp("^" + pattern + (anchored ? "$" : ""));
                if (regex.test(pathname)) {
                    return true;
                }
            }
            else if (pathname.startsWith(disallowed)) {
                return true;
            }
        }
        return false;
    }
    catch {
        return false;
    }
}
97
/**
 * Fetches a single page.
 * Resolves to null on network errors, timeouts, or non-HTML responses.
 */
async function fetchPage(url, userAgent) {
    try {
        const response = await fetch(url, {
            headers: {
                "User-Agent": userAgent,
                Accept: "text/html,application/xhtml+xml",
            },
            redirect: "follow",
            signal: AbortSignal.timeout(10000),
        });
        const contentType = response.headers.get("content-type") || "";
        // Only process HTML content
        const isHtml = contentType.includes("text/html") ||
            contentType.includes("application/xhtml");
        if (!isHtml) {
            return null;
        }
        return {
            html: await response.text(),
            status: response.status,
            contentType,
        };
    }
    catch {
        return null;
    }
}
126
/**
 * Delays execution for a specified time.
 */
function delay(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
132
/**
 * Crawls a documentation website starting from a base URL.
 *
 * Breadth-first: the queue is FIFO, so pages at depth d are fetched before
 * depth d+1. Stays on the start URL's domain, honors robots.txt
 * Disallow/Crawl-delay when enabled, and rate-limits between requests.
 * Stops when the queue is empty or maxPages is reached.
 *
 * @param startUrl The URL to start crawling from
 * @param options Scraper options
 * @returns CrawlResult with all scraped pages and statistics
 */
export async function crawlWebsite(startUrl, options = {}) {
    const startTime = Date.now();
    // Merge options with defaults. Use ?? (not ||) so an explicit
    // maxDepth of 0 ("start page only") is respected; clamp to MAX_DEPTH.
    const opts = {
        ...DEFAULT_OPTIONS,
        ...options,
        maxDepth: Math.min(options.maxDepth ?? DEFAULT_OPTIONS.maxDepth, MAX_DEPTH),
    };
    // Normalize starting URL
    const baseUrl = normalizeUrl(startUrl);
    // Initialize result
    const result = {
        baseUrl,
        pages: [],
        failed: [],
        skipped: [],
        stats: {
            totalDiscovered: 1, // the start URL itself
            totalCrawled: 0,
            totalFailed: 0,
            totalSkipped: 0,
            maxDepthReached: 0,
            durationMs: 0,
        },
    };
    // Fetch robots.txt if needed; its Crawl-delay can only raise our delay
    let robotsRules = { disallowedPaths: [] };
    if (opts.respectRobotsTxt) {
        robotsRules = await fetchRobotsTxt(baseUrl, opts.userAgent);
        if (robotsRules.crawlDelay) {
            opts.requestDelay = Math.max(opts.requestDelay, robotsRules.crawlDelay);
        }
    }
    // Track visited URLs and the BFS queue
    const visited = new Set();
    const queue = [{ url: baseUrl, depth: 0 }];
    while (queue.length > 0 && result.pages.length < opts.maxPages) {
        const current = queue.shift();
        const { url, depth } = current;
        // Skip if already visited (a URL can be enqueued via several parents)
        if (visited.has(url)) {
            continue;
        }
        visited.add(url);
        // Check robots.txt
        if (opts.respectRobotsTxt && isDisallowed(url, robotsRules)) {
            result.skipped.push({ url, reason: "disallowed by robots.txt" });
            result.stats.totalSkipped++;
            continue;
        }
        // Check same domain
        if (!isSameDomain(url, baseUrl)) {
            result.skipped.push({ url, reason: "external domain" });
            result.stats.totalSkipped++;
            continue;
        }
        // Add delay between requests (except before the first fetch)
        if (result.stats.totalCrawled > 0) {
            await delay(opts.requestDelay);
        }
        // Fetch the page
        const pageResult = await fetchPage(url, opts.userAgent);
        if (!pageResult) {
            result.failed.push({ url, reason: "fetch failed or non-HTML content" });
            result.stats.totalFailed++;
            continue;
        }
        // Extract links only while children would still be within the depth limit
        const links = depth < opts.maxDepth ? extractLinks(pageResult.html, url) : [];
        // Create scraped page
        const page = {
            url,
            normalizedUrl: normalizeUrl(url),
            filename: urlToFilename(url),
            html: pageResult.html,
            status: pageResult.status,
            contentType: pageResult.contentType,
            depth,
            links,
        };
        result.pages.push(page);
        result.stats.totalCrawled++;
        result.stats.maxDepthReached = Math.max(result.stats.maxDepthReached, depth);
        // Add new links to queue.
        // NOTE: totalDiscovered counts enqueues, so a URL reachable from two
        // parents can be counted twice even though it is crawled only once.
        if (depth < opts.maxDepth) {
            for (const link of links) {
                if (!visited.has(link)) {
                    queue.push({ url: link, depth: depth + 1 });
                    result.stats.totalDiscovered++;
                }
            }
        }
        // Log progress
        console.error(`[scraper] Crawled ${result.stats.totalCrawled}/${opts.maxPages}: ${url} (depth ${depth})`);
    }
    // Update final stats
    result.stats.durationMs = Date.now() - startTime;
    return result;
}
238
/**
 * Web scraper singleton for convenience.
 * Exposes crawlWebsite under the namespaced alias `webScraper.crawl`.
 */
export const webScraper = {
    crawl: crawlWebsite,
};
//# sourceMappingURL=web-scraper.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"web-scraper.js","sourceRoot":"","sources":["../../src/services/web-scraper.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EACL,YAAY,EACZ,YAAY,EACZ,YAAY,EACZ,aAAa,GACd,MAAM,iBAAiB,CAAC;AAuEzB;;GAEG;AACH,MAAM,eAAe,GAA6B;IAChD,QAAQ,EAAE,CAAC;IACX,YAAY,EAAE,GAAG;IACjB,QAAQ,EAAE,GAAG;IACb,gBAAgB,EAAE,IAAI;IACtB,SAAS,EAAE,8CAA8C;CAC1D,CAAC;AAEF;;GAEG;AACH,MAAM,SAAS,GAAG,CAAC,CAAC;AAEpB;;GAEG;AACH,KAAK,UAAU,cAAc,CAC3B,OAAe,EACf,SAAiB;IAEjB,MAAM,KAAK,GAAgB;QACzB,eAAe,EAAE,EAAE;KACpB,CAAC;IAEF,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;QACvD,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,SAAS,EAAE;YACtC,OAAO,EAAE,EAAE,YAAY,EAAE,SAAS,EAAE;YACpC,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC;SAClC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,OAAO,KAAK,CAAC,CAAC,qCAAqC;QACrD,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAE/B,IAAI,mBAAmB,GAAG,KAAK,CAAC;QAEhC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;YAE1C,6BAA6B;YAC7B,IAAI,OAAO,CAAC,UAAU,CAAC,aAAa,CAAC,EAAE,CAAC;gBACtC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;gBACvC,mBAAmB,GAAG,KAAK,KAAK,GAAG,IAAI,SAAS,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;YACjF,CAAC;YAED,2BAA2B;YAC3B,IAAI,mBAAmB,IAAI,OAAO,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;gBAC3D,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACzC,IAAI,IAAI,EAAE,CAAC;oBACT,KAAK,CAAC,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACnC,CAAC;YACH,CAAC;YAED,8BAA8B;YAC9B,IAAI,mBAAmB,IAAI,OAAO,CAAC,UAAU,CAAC,cAAc,CAAC,EAAE,CAAC;gBAC9D,MAAM,KAAK,GAAG,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;gBACzD,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;oBAClB,KAAK,CAAC,UAAU,GAAG,KAAK,GAAG,IAAI,CAAC,CAAC,gBAAgB;gBACnD,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,iCAAiC;IACnC,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,SAAS,YAA
Y,CAAC,GAAW,EAAE,KAAkB;IACnD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;QAEjC,KAAK,MAAM,UAAU,IAAI,KAAK,CAAC,eAAe,EAAE,CAAC;YAC/C,2BAA2B;YAC3B,IAAI,UAAU,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC7B,MAAM,KAAK,GAAG,IAAI,MAAM,CACtB,GAAG,GAAG,UAAU,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,KAAK,CAAC,GAAG,GAAG,CAClE,CAAC;gBACF,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;oBACzB,OAAO,IAAI,CAAC;gBACd,CAAC;YACH,CAAC;iBAAM,IAAI,QAAQ,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;gBAC3C,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,SAAS,CACtB,GAAW,EACX,SAAiB;IAEjB,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAChC,OAAO,EAAE;gBACP,YAAY,EAAE,SAAS;gBACvB,MAAM,EAAE,iCAAiC;aAC1C;YACD,QAAQ,EAAE,QAAQ;YAClB,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,KAAK,CAAC;SACnC,CAAC,CAAC;QAEH,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;QAE/D,4BAA4B;QAC5B,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,mBAAmB,CAAC,EAAE,CAAC;YACrF,OAAO,IAAI,CAAC;QACd,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QAEnC,OAAO;YACL,IAAI;YACJ,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,WAAW;SACZ,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,KAAK,CAAC,EAAU;IACvB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,CAAC;AAC3D,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,QAAgB,EAChB,UAA0B,EAAE;IAE5B,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,8BAA8B;IAC9B,MAAM,IAAI,GAA6B;QACrC,GAAG,eAAe;QAClB,GAAG,OAAO;QACV,QAAQ,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,QAAQ,IAAI,eAAe,CAAC,QAAQ,EAAE,SAAS,CAAC;KAC5E,CAAC;IAEF,yBAAyB;IACzB,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IAEvC,oBAAoB;IACpB,MAAM,MAAM,GAAgB;QAC1B,OAAO;QACP,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,OAAO,EAAE,EAAE;QACX,KAAK,EAAE;YACL,eAAe,EAAE,CAAC;YAClB,YAAY,EAAE,CAAC;YACf,WAAW,EAAE,CAAC;YACd,YAAY,E
AAE,CAAC;YACf,eAAe,EAAE,CAAC;YAClB,UAAU,EAAE,CAAC;SACd;KACF,CAAC;IAEF,6BAA6B;IAC7B,IAAI,WAAW,GAAgB,EAAE,eAAe,EAAE,EAAE,EAAE,CAAC;IACvD,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAC1B,WAAW,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;QAC5D,IAAI,WAAW,CAAC,UAAU,EAAE,CAAC;YAC3B,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,YAAY,EAAE,WAAW,CAAC,UAAU,CAAC,CAAC;QAC1E,CAAC;IACH,CAAC;IAED,+BAA+B;IAC/B,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAClC,MAAM,KAAK,GAA0C,CAAC,EAAE,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC;IAElF,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;QAC/D,MAAM,OAAO,GAAG,KAAK,CAAC,KAAK,EAAG,CAAC;QAC/B,MAAM,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,OAAO,CAAC;QAE/B,0BAA0B;QAC1B,IAAI,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;YACrB,SAAS;QACX,CAAC;QACD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAEjB,mBAAmB;QACnB,IAAI,IAAI,CAAC,gBAAgB,IAAI,YAAY,CAAC,GAAG,EAAE,WAAW,CAAC,EAAE,CAAC;YAC5D,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,0BAA0B,EAAE,CAAC,CAAC;YACjE,MAAM,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC;YAC5B,SAAS;QACX,CAAC;QAED,oBAAoB;QACpB,IAAI,CAAC,YAAY,CAAC,GAAG,EAAE,OAAO,CAAC,EAAE,CAAC;YAChC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,iBAAiB,EAAE,CAAC,CAAC;YACxD,MAAM,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC;YAC5B,SAAS;QACX,CAAC;QAED,4CAA4C;QAC5C,IAAI,MAAM,CAAC,KAAK,CAAC,YAAY,GAAG,CAAC,EAAE,CAAC;YAClC,MAAM,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QACjC,CAAC;QAED,iBAAiB;QACjB,MAAM,UAAU,GAAG,MAAM,SAAS,CAAC,GAAG,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;QAExD,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,kCAAkC,EAAE,CAAC,CAAC;YACxE,MAAM,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC;YAC3B,SAAS;QACX,CAAC;QAED,qCAAqC;QACrC,MAAM,KAAK,GAAG,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,YAAY,CAAC,UAAU,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAE9E,sBAAsB;QACtB,MAAM,IAAI,GAAgB;YACxB,GAAG;YACH,aAAa,EAAE,YAAY,CAAC,GAAG,CAAC;YAChC,QAAQ,EAAE,aAAa,CAAC,GAAG,CAAC;YAC5B,IAAI,EAAE,UAAU,CAAC,IAAI;YACrB,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,WAAW,EAAE,UAAU,
CAAC,WAAW;YACnC,KAAK;YACL,KAAK;SACN,CAAC;QAEF,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxB,MAAM,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC;QAC5B,MAAM,CAAC,KAAK,CAAC,eAAe,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;QAE7E,yBAAyB;QACzB,IAAI,KAAK,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC1B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;gBACzB,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;oBACvB,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;oBAC5C,MAAM,CAAC,KAAK,CAAC,eAAe,EAAE,CAAC;gBACjC,CAAC;YACH,CAAC;QACH,CAAC;QAED,eAAe;QACf,OAAO,CAAC,KAAK,CAAC,qBAAqB,MAAM,CAAC,KAAK,CAAC,YAAY,IAAI,IAAI,CAAC,QAAQ,KAAK,GAAG,WAAW,KAAK,GAAG,CAAC,CAAC;IAC5G,CAAC;IAED,qBAAqB;IACrB,MAAM,CAAC,KAAK,CAAC,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;IAEjD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,MAAM,UAAU,GAAG;IACxB,KAAK,EAAE,YAAY;CACpB,CAAC"}
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Input type for clear_cache tool.
3
+ */
4
+ export interface ClearCacheInput {
5
+ /** Specific docs ID to clear (optional) */
6
+ docs_id?: string;
7
+ /** Clear all cached docs (default: false) */
8
+ all?: boolean;
9
+ }
10
+ /**
11
+ * Output type for clear_cache tool.
12
+ */
13
+ export interface ClearCacheOutput {
14
+ /** IDs that were cleared */
15
+ cleared: string[];
16
+ /** Count of remaining cached docs */
17
+ remaining: number;
18
+ }
19
+ /**
20
+ * Removes cached documentation.
21
+ * Either clears a specific docs entry by ID, or all entries if `all` is true.
22
+ */
23
+ export declare function clearCache(input: ClearCacheInput): Promise<ClearCacheOutput>;
24
+ //# sourceMappingURL=clear-cache.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"clear-cache.d.ts","sourceRoot":"","sources":["../../src/tools/clear-cache.ts"],"names":[],"mappings":"AAEA;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,2CAA2C;IAC3C,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,6CAA6C;IAC7C,GAAG,CAAC,EAAE,OAAO,CAAC;CACf;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4BAA4B;IAC5B,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,qCAAqC;IACrC,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;;GAGG;AACH,wBAAsB,UAAU,CAAC,KAAK,EAAE,eAAe,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAyBlF"}
@@ -0,0 +1,29 @@
import { cacheManager } from "../services/cache-manager.js";
/**
 * Removes cached documentation.
 * Either clears a specific docs entry by ID, or all entries if `all` is true.
 * When neither `all` nor `docs_id` is supplied, nothing is removed and the
 * current cache size is simply reported back.
 */
export async function clearCache(input) {
    await cacheManager.initialize();
    let cleared = [];
    if (input.all) {
        // Wipe every cached entry; the manager reports which IDs were removed.
        cleared = [...(await cacheManager.clearAll())];
    }
    else if (input.docs_id) {
        // Resolve the ID to its cache entry (searches both sources), then
        // remove that single entry if it exists.
        const entry = await cacheManager.findById(input.docs_id);
        if (entry) {
            await cacheManager.clearEntry(entry.source, entry.id);
            cleared = [entry.id];
        }
    }
    // Report how many docs remain cached after the removal.
    const remainingEntries = await cacheManager.listEntries();
    return {
        cleared,
        remaining: remainingEntries.length,
    };
}
//# sourceMappingURL=clear-cache.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"clear-cache.js","sourceRoot":"","sources":["../../src/tools/clear-cache.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,8BAA8B,CAAC;AAsB5D;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,KAAsB;IACrD,MAAM,YAAY,CAAC,UAAU,EAAE,CAAC;IAEhC,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,IAAI,KAAK,CAAC,GAAG,EAAE,CAAC;QACd,mBAAmB;QACnB,MAAM,UAAU,GAAG,MAAM,YAAY,CAAC,QAAQ,EAAE,CAAC;QACjD,OAAO,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;IAC9B,CAAC;SAAM,IAAI,KAAK,CAAC,OAAO,EAAE,CAAC;QACzB,0CAA0C;QAC1C,MAAM,IAAI,GAAG,MAAM,YAAY,CAAC,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACxD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,YAAY,CAAC,UAAU,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,EAAE,CAAC,CAAC;YACpD,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;IAED,0BAA0B;IAC1B,MAAM,SAAS,GAAG,CAAC,MAAM,YAAY,CAAC,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;IAE5D,OAAO;QACL,OAAO;QACP,SAAS;KACV,CAAC;AACJ,CAAC"}
@@ -0,0 +1,21 @@
1
+ /**
2
+ * detect_github_repo tool - Finds GitHub repository from a docs website URL.
3
+ */
4
+ import { type GitHubDetectionResult } from "../services/github-detector.js";
5
+ /**
6
+ * Input for the detect_github_repo tool.
7
+ */
8
+ export interface DetectGitHubInput {
9
+ /** Docs website URL to analyze */
10
+ url: string;
11
+ }
12
+ /**
13
+ * Output from the detect_github_repo tool.
14
+ * Same as GitHubDetectionResult.
15
+ */
16
+ export type DetectGitHubOutput = GitHubDetectionResult;
17
+ /**
18
+ * Detects GitHub repository from a documentation website URL.
19
+ */
20
+ export declare function detectGitHub(input: DetectGitHubInput): Promise<DetectGitHubOutput>;
21
+ //# sourceMappingURL=detect-github.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detect-github.d.ts","sourceRoot":"","sources":["../../src/tools/detect-github.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAEL,KAAK,qBAAqB,EAC3B,MAAM,gCAAgC,CAAC;AAGxC;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,kCAAkC;IAClC,GAAG,EAAE,MAAM,CAAC;CACb;AAED;;;GAGG;AACH,MAAM,MAAM,kBAAkB,GAAG,qBAAqB,CAAC;AAEvD;;GAEG;AACH,wBAAsB,YAAY,CAChC,KAAK,EAAE,iBAAiB,GACvB,OAAO,CAAC,kBAAkB,CAAC,CAU7B"}
@@ -0,0 +1,18 @@
/**
 * detect_github_repo tool - Finds GitHub repository from a docs website URL.
 */
import { detectGitHubRepo, } from "../services/github-detector.js";
import { ValidationError } from "../types/errors.js";
/**
 * Detects GitHub repository from a documentation website URL.
 * Rejects calls that omit the `url` parameter, then delegates the actual
 * detection work to the github-detector service.
 */
export async function detectGitHub(input) {
    // Guard: `url` is the only required parameter.
    if (!input.url) {
        throw new ValidationError("Missing required parameter: url", "url");
    }
    return detectGitHubRepo(input.url);
}
//# sourceMappingURL=detect-github.js.map