@cosmocoder/mcp-web-docs 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/README.md +132 -10
  2. package/build/crawler/auth.d.ts +17 -6
  3. package/build/crawler/auth.js +166 -31
  4. package/build/crawler/auth.js.map +1 -1
  5. package/build/crawler/auth.test.js +197 -33
  6. package/build/crawler/auth.test.js.map +1 -1
  7. package/build/index.js +68 -16
  8. package/build/index.js.map +1 -1
  9. package/build/index.test.js +134 -0
  10. package/build/index.test.js.map +1 -1
  11. package/build/storage/storage.d.ts +17 -0
  12. package/build/storage/storage.js +102 -5
  13. package/build/storage/storage.js.map +1 -1
  14. package/build/storage/storage.test.js +91 -0
  15. package/build/storage/storage.test.js.map +1 -1
  16. package/build/types.d.ts +4 -0
  17. package/build/util/security.js +8 -0
  18. package/build/util/security.js.map +1 -1
  19. package/build/util/security.test.js +18 -0
  20. package/build/util/security.test.js.map +1 -1
  21. package/package.json +11 -2
  22. package/build/crawler/cheerio.d.ts +0 -11
  23. package/build/crawler/cheerio.js +0 -134
  24. package/build/crawler/cheerio.js.map +0 -1
  25. package/build/crawler/chromium.d.ts +0 -21
  26. package/build/crawler/chromium.js +0 -596
  27. package/build/crawler/chromium.js.map +0 -1
  28. package/build/crawler/default.d.ts +0 -11
  29. package/build/crawler/default.js +0 -138
  30. package/build/crawler/default.js.map +0 -1
  31. package/build/crawler/factory.d.ts +0 -6
  32. package/build/crawler/factory.js +0 -83
  33. package/build/crawler/factory.js.map +0 -1
  34. package/build/crawler/puppeteer.d.ts +0 -16
  35. package/build/crawler/puppeteer.js +0 -191
  36. package/build/crawler/puppeteer.js.map +0 -1
  37. package/build/embeddings/openai.d.ts +0 -8
  38. package/build/embeddings/openai.js +0 -56
  39. package/build/embeddings/openai.js.map +0 -1
  40. package/build/rag/cache.d.ts +0 -10
  41. package/build/rag/cache.js +0 -10
  42. package/build/rag/cache.js.map +0 -1
  43. package/build/rag/code-generator.d.ts +0 -11
  44. package/build/rag/code-generator.js +0 -30
  45. package/build/rag/code-generator.js.map +0 -1
  46. package/build/rag/context-assembler.d.ts +0 -23
  47. package/build/rag/context-assembler.js +0 -113
  48. package/build/rag/context-assembler.js.map +0 -1
  49. package/build/rag/docs-search.d.ts +0 -55
  50. package/build/rag/docs-search.js +0 -380
  51. package/build/rag/docs-search.js.map +0 -1
  52. package/build/rag/pipeline.d.ts +0 -26
  53. package/build/rag/pipeline.js +0 -91
  54. package/build/rag/pipeline.js.map +0 -1
  55. package/build/rag/query-processor.d.ts +0 -14
  56. package/build/rag/query-processor.js +0 -57
  57. package/build/rag/query-processor.js.map +0 -1
  58. package/build/rag/reranker.d.ts +0 -55
  59. package/build/rag/reranker.js +0 -210
  60. package/build/rag/reranker.js.map +0 -1
  61. package/build/rag/response-generator.d.ts +0 -20
  62. package/build/rag/response-generator.js +0 -101
  63. package/build/rag/response-generator.js.map +0 -1
  64. package/build/rag/retriever.d.ts +0 -19
  65. package/build/rag/retriever.js +0 -111
  66. package/build/rag/retriever.js.map +0 -1
  67. package/build/rag/validator.d.ts +0 -22
  68. package/build/rag/validator.js +0 -128
  69. package/build/rag/validator.js.map +0 -1
  70. package/build/rag/version-manager.d.ts +0 -23
  71. package/build/rag/version-manager.js +0 -98
  72. package/build/rag/version-manager.js.map +0 -1
  73. package/build/types/rag.d.ts +0 -27
  74. package/build/types/rag.js +0 -2
  75. package/build/types/rag.js.map +0 -1
  76. package/build/util/content-utils.d.ts +0 -31
  77. package/build/util/content-utils.js +0 -120
  78. package/build/util/content-utils.js.map +0 -1
  79. package/build/util/content.d.ts +0 -1
  80. package/build/util/content.js +0 -16
  81. package/build/util/content.js.map +0 -1
  82. package/build/util/site-detector.d.ts +0 -22
  83. package/build/util/site-detector.js +0 -42
  84. package/build/util/site-detector.js.map +0 -1
@@ -1,11 +0,0 @@
1
- import { CrawlResult } from '../types.js';
2
- import { BaseCrawler } from './base.js';
3
- export declare class DefaultCrawler extends BaseCrawler {
4
- private readonly BATCH_SIZE;
5
- private readonly FETCH_TIMEOUT;
6
- constructor(maxDepth?: number, maxRequestsPerCrawl?: number, onProgress?: (progress: number, description: string) => void);
7
- crawl(url: string): AsyncGenerator<CrawlResult, void, unknown>;
8
- private processPageWithRetry;
9
- private extractLinks;
10
- private extractTitle;
11
- }
@@ -1,138 +0,0 @@
1
- import { URL } from 'url';
2
- import * as cheerio from 'cheerio';
3
- import { BaseCrawler } from './base.js';
4
- export class DefaultCrawler extends BaseCrawler {
5
- BATCH_SIZE = 50;
6
- FETCH_TIMEOUT = 30000; // 30 seconds
7
- constructor(maxDepth = 4, maxRequestsPerCrawl = 1000, onProgress) {
8
- super(maxDepth, maxRequestsPerCrawl, onProgress);
9
- }
10
- async *crawl(url) {
11
- console.debug(`[${this.constructor.name}] Starting crawl from: ${url}`);
12
- if (this.isAborting) {
13
- console.debug('[DefaultCrawler] Crawl aborted');
14
- return;
15
- }
16
- const startUrl = new URL(url);
17
- const baseUrl = this.normalizeUrl(startUrl.toString());
18
- // Track pages to process
19
- const pagesToCrawl = new Map(); // URL -> depth
20
- pagesToCrawl.set(baseUrl, 0);
21
- while (pagesToCrawl.size > 0 && !this.isAborting) {
22
- // Get batch of URLs to process
23
- const batchEntries = Array.from(pagesToCrawl.entries()).slice(0, this.BATCH_SIZE);
24
- const batch = new Map(batchEntries);
25
- // Remove batch from queue
26
- batchEntries.forEach(([url]) => pagesToCrawl.delete(url));
27
- try {
28
- // Process batch in parallel with timeout and rate limiting
29
- const results = await Promise.all(Array.from(batch.entries()).map(async ([pageUrl]) => {
30
- // Apply rate limiting
31
- await this.rateLimit();
32
- const result = await this.processPageWithRetry(pageUrl);
33
- return { pageUrl, ...result };
34
- }));
35
- // Handle results
36
- for (const { pageUrl, content, links, error } of results) {
37
- if (error || !content || this.isAborting)
38
- continue;
39
- this.markUrlAsSeen(pageUrl);
40
- yield {
41
- url: pageUrl,
42
- path: this.getPathFromUrl(pageUrl),
43
- content,
44
- title: this.extractTitle(content)
45
- };
46
- // Add new links to queue if within depth limit
47
- const currentDepth = batch.get(pageUrl) || 0;
48
- if (currentDepth < this.maxDepth) {
49
- for (const link of links) {
50
- const normalizedLink = this.normalizeUrl(link);
51
- if (this.shouldCrawl(normalizedLink) && !pagesToCrawl.has(normalizedLink)) {
52
- pagesToCrawl.set(normalizedLink, currentDepth + 1);
53
- }
54
- }
55
- }
56
- // Check if we've hit the request limit
57
- if (this.seenUrls.size >= this.maxRequestsPerCrawl) {
58
- console.debug('[DefaultCrawler] Max requests reached');
59
- return;
60
- }
61
- }
62
- // Add delay between batches
63
- await new Promise(resolve => setTimeout(resolve, 1000));
64
- }
65
- catch (e) {
66
- console.error('[DefaultCrawler] Error processing batch:', e);
67
- }
68
- }
69
- console.debug('[DefaultCrawler] Crawl completed');
70
- }
71
- async processPageWithRetry(url) {
72
- return this.retryWithBackoff(async () => {
73
- try {
74
- // Create fetch request with timeout
75
- const controller = new AbortController();
76
- const timeoutId = setTimeout(() => controller.abort(), this.FETCH_TIMEOUT);
77
- const response = await fetch(url, {
78
- signal: controller.signal,
79
- headers: {
80
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
81
- }
82
- });
83
- clearTimeout(timeoutId);
84
- if (!response.ok) {
85
- throw new Error(`HTTP error! status: ${response.status}`);
86
- }
87
- const content = await response.text();
88
- const links = this.extractLinks(content, new URL(url));
89
- return { content, links };
90
- }
91
- catch (e) {
92
- if (e instanceof Error) {
93
- return { content: null, links: [], error: e };
94
- }
95
- return { content: null, links: [], error: new Error('Unknown error occurred') };
96
- }
97
- });
98
- }
99
- extractLinks(html, baseUrl) {
100
- try {
101
- const $ = cheerio.load(html);
102
- const links = new Set();
103
- // Find all links, including those in navigation elements
104
- $('a').each((_, element) => {
105
- const href = $(element).attr('href');
106
- if (!href)
107
- return;
108
- try {
109
- const url = new URL(href, baseUrl);
110
- const normalizedUrl = this.normalizeUrl(url.toString());
111
- // Use BaseCrawler's URL validation
112
- if (this.shouldCrawl(normalizedUrl)) {
113
- links.add(normalizedUrl);
114
- }
115
- }
116
- catch (e) {
117
- console.debug(`[DefaultCrawler] Invalid URL ${href}:`, e);
118
- }
119
- });
120
- return Array.from(links);
121
- }
122
- catch (e) {
123
- console.error('[DefaultCrawler] Error extracting links:', e);
124
- return [];
125
- }
126
- }
127
- extractTitle(html) {
128
- try {
129
- const $ = cheerio.load(html);
130
- return $('title').text().trim() || 'Untitled';
131
- }
132
- catch (e) {
133
- console.error('[DefaultCrawler] Error extracting title:', e);
134
- return 'Untitled';
135
- }
136
- }
137
- }
138
- //# sourceMappingURL=default.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"default.js","sourceRoot":"","sources":["../../src/crawler/default.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,KAAK,CAAC;AAC1B,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAEnC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AAExC,MAAM,OAAO,cAAe,SAAQ,WAAW;IAC5B,UAAU,GAAG,EAAE,CAAC;IAChB,aAAa,GAAG,KAAK,CAAC,CAAC,aAAa;IAErD,YACE,WAAmB,CAAC,EACpB,sBAA8B,IAAI,EAClC,UAA4D;QAE5D,KAAK,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,CAAC;IACnD,CAAC;IAED,KAAK,CAAC,CAAC,KAAK,CAAC,GAAW;QACtB,OAAO,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,0BAA0B,GAAG,EAAE,CAAC,CAAC;QAExE,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;YAChD,OAAO;QACT,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEvD,yBAAyB;QACzB,MAAM,YAAY,GAAG,IAAI,GAAG,EAAkB,CAAC,CAAC,eAAe;QAC/D,YAAY,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAE7B,OAAO,YAAY,CAAC,IAAI,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;YACjD,+BAA+B;YAC/B,MAAM,YAAY,GAAG,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;YAClF,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC;YAEpC,0BAA0B;YAC1B,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;YAE1D,IAAI,CAAC;gBACH,2DAA2D;gBAC3D,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAC/B,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,OAAO,CAAC,EAAE,EAAE;oBAClD,sBAAsB;oBACtB,MAAM,IAAI,CAAC,SAAS,EAAE,CAAC;oBACvB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,oBAAoB,CAAC,OAAO,CAAC,CAAC;oBACxD,OAAO,EAAE,OAAO,EAAE,GAAG,MAAM,EAAE,CAAC;gBAChC,CAAC,CAAC,CACH,CAAC;gBAEF,iBAAiB;gBACjB,KAAK,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,OAAO,EAAE,CAAC;oBACzD,IAAI,KAAK,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,UAAU;wBAAE,SAAS;oBAEnD,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;oBAE5B,MAAM;wBACJ,GAAG,EAAE,OAAO;wBACZ,IAAI,EAAE,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC;wBAClC,OAAO;wBACP,KAAK,EAAE,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC;qBAClC,CAAC;oBAEF,+CAA+C;oBAC/C,MAAM,YA
AY,GAAG,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;oBAC7C,IAAI,YAAY,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;wBACjC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;4BACzB,MAAM,cAAc,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;4BAC/C,IAAI,IAAI,CAAC,WAAW,CAAC,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,cAAc,CAAC,EAAE,CAAC;gCAC1E,YAAY,CAAC,GAAG,CAAC,cAAc,EAAE,YAAY,GAAG,CAAC,CAAC,CAAC;4BACrD,CAAC;wBACH,CAAC;oBACH,CAAC;oBAED,uCAAuC;oBACvC,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,IAAI,IAAI,CAAC,mBAAmB,EAAE,CAAC;wBACnD,OAAO,CAAC,KAAK,CAAC,uCAAuC,CAAC,CAAC;wBACvD,OAAO;oBACT,CAAC;gBACH,CAAC;gBAED,4BAA4B;gBAC5B,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;YAC1D,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC/D,CAAC;QACH,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;IACpD,CAAC;IAEO,KAAK,CAAC,oBAAoB,CAAC,GAAW;QAK5C,OAAO,IAAI,CAAC,gBAAgB,CAAC,KAAK,IAAI,EAAE;YACtC,IAAI,CAAC;gBACH,oCAAoC;gBACpC,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC;gBAE3E,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;oBAChC,MAAM,EAAE,UAAU,CAAC,MAAM;oBACzB,OAAO,EAAE;wBACP,YAAY,EAAE,qHAAqH;qBACpI;iBACF,CAAC,CAAC;gBAEH,YAAY,CAAC,SAAS,CAAC,CAAC;gBAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;oBACjB,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;gBAC5D,CAAC;gBAED,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;gBACtC,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;gBAEvD,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;YAC5B,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,IAAI,CAAC,YAAY,KAAK,EAAE,CAAC;oBACvB,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC;gBAChD,CAAC;gBACD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK,EAAE,IAAI,KAAK,CAAC,wBAAwB,CAAC,EAAE,CAAC;YAClF,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,YAAY,CAAC,IAAY,EAAE,OAAY;QAC7C,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7B,MAAM,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;YAEhC,yDAAyD;YACzD,
CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBACzB,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACrC,IAAI,CAAC,IAAI;oBAAE,OAAO;gBAElB,IAAI,CAAC;oBACH,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;oBACnC,MAAM,aAAa,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC;oBAExD,mCAAmC;oBACnC,IAAI,IAAI,CAAC,WAAW,CAAC,aAAa,CAAC,EAAE,CAAC;wBACpC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC;oBAC3B,CAAC;gBACH,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACX,OAAO,CAAC,KAAK,CAAC,gCAAgC,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC;gBAC5D,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC3B,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC7D,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAEO,YAAY,CAAC,IAAY;QAC/B,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7B,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,UAAU,CAAC;QAChD,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC7D,OAAO,UAAU,CAAC;QACpB,CAAC;IACH,CAAC;CACF"}
@@ -1,6 +0,0 @@
1
- import { BaseCrawler } from './base.js';
2
- export declare class CrawlerFactory {
3
- private static readonly JS_FRAMEWORK_INDICATORS;
4
- private static detectSiteType;
5
- static createCrawler(url: string, maxRequestsPerCrawl?: number, maxDepth?: number, onProgress?: (progress: number, description: string) => void): Promise<BaseCrawler>;
6
- }
@@ -1,83 +0,0 @@
1
- import { URL } from 'url';
2
- import { DefaultCrawler } from './default.js';
3
- import { ChromiumCrawler } from './chromium.js';
4
- import { CheerioCrawler } from './cheerio.js';
5
- export class CrawlerFactory {
6
- // Common JavaScript framework identifiers
7
- static JS_FRAMEWORK_INDICATORS = [
8
- 'react',
9
- 'vue',
10
- 'angular',
11
- 'next',
12
- 'nuxt',
13
- 'gatsby',
14
- 'docusaurus',
15
- 'vuepress',
16
- 'gridsome',
17
- 'svelte'
18
- ];
19
- static async detectSiteType(url) {
20
- try {
21
- const response = await fetch(url, {
22
- headers: {
23
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
24
- }
25
- });
26
- if (!response.ok) {
27
- throw new Error(`HTTP error! status: ${response.status}`);
28
- }
29
- const html = await response.text();
30
- // Check for JavaScript frameworks
31
- const hasFramework = CrawlerFactory.JS_FRAMEWORK_INDICATORS.some(framework => html.toLowerCase().includes(framework));
32
- // Check for JavaScript-heavy indicators
33
- const isJsHeavy = (html.includes('data-react') ||
34
- html.includes('ng-') ||
35
- html.includes('v-') ||
36
- html.includes('__NEXT_DATA__') ||
37
- html.includes('nuxt') ||
38
- html.includes('id="___gatsby"'));
39
- return { isJsHeavy, hasFramework };
40
- }
41
- catch (e) {
42
- console.error('[CrawlerFactory] Error detecting site type:', e);
43
- return { isJsHeavy: false, hasFramework: false };
44
- }
45
- }
46
- static async createCrawler(url, maxRequestsPerCrawl = 1000, maxDepth = 4, onProgress) {
47
- const startUrl = new URL(url);
48
- console.debug(`[CrawlerFactory] Creating crawler for ${startUrl}`);
49
- // Check if site is JavaScript-heavy first
50
- const { isJsHeavy, hasFramework } = await CrawlerFactory.detectSiteType(url);
51
- // Try Chromium for JavaScript-heavy sites
52
- if (isJsHeavy || hasFramework) {
53
- console.debug(`[CrawlerFactory] Site appears to be JavaScript-heavy, using Chromium crawler`);
54
- return new ChromiumCrawler(maxDepth, maxRequestsPerCrawl, onProgress);
55
- }
56
- // Try default crawler
57
- try {
58
- console.debug(`[CrawlerFactory] Attempting default crawler for ${url}`);
59
- const defaultCrawler = new DefaultCrawler(maxDepth, maxRequestsPerCrawl, onProgress);
60
- const generator = defaultCrawler.crawl(url);
61
- const { value: firstPage, done } = await generator.next();
62
- if (!done && firstPage?.content) {
63
- console.debug('[CrawlerFactory] Successfully created default crawler');
64
- return defaultCrawler;
65
- }
66
- }
67
- catch (e) {
68
- console.debug('[CrawlerFactory] Default crawler failed:', e);
69
- }
70
- // Fall back to Cheerio crawler
71
- console.debug(`[CrawlerFactory] Attempting Cheerio crawler for ${url}`);
72
- const cheerioCrawler = new CheerioCrawler(maxDepth, maxRequestsPerCrawl, onProgress);
73
- const generator = cheerioCrawler.crawl(url);
74
- const { value: firstPage, done } = await generator.next();
75
- if (!done && firstPage?.content) {
76
- console.debug('[CrawlerFactory] Successfully created Cheerio crawler');
77
- return cheerioCrawler;
78
- }
79
- console.error(`[CrawlerFactory] All crawlers failed for ${url}`);
80
- throw new Error(`Failed to create crawler for ${url}`);
81
- }
82
- }
83
- //# sourceMappingURL=factory.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"factory.js","sourceRoot":"","sources":["../../src/crawler/factory.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,KAAK,CAAC;AAE1B,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAE9C,MAAM,OAAO,cAAc;IACzB,0CAA0C;IAClC,MAAM,CAAU,uBAAuB,GAAG;QAChD,OAAO;QACP,KAAK;QACL,SAAS;QACT,MAAM;QACN,MAAM;QACN,QAAQ;QACR,YAAY;QACZ,UAAU;QACV,UAAU;QACV,QAAQ;KACT,CAAC;IAEM,MAAM,CAAC,KAAK,CAAC,cAAc,CAAC,GAAW;QAI7C,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO,EAAE;oBACP,YAAY,EAAE,qHAAqH;iBACpI;aACF,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;YAC5D,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YAEnC,kCAAkC;YAClC,MAAM,YAAY,GAAG,cAAc,CAAC,uBAAuB,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAC3E,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,CACvC,CAAC;YAEF,wCAAwC;YACxC,MAAM,SAAS,GAAG,CAChB,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC;gBAC3B,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC;gBACpB,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;gBACnB,IAAI,CAAC,QAAQ,CAAC,eAAe,CAAC;gBAC9B,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC;gBACrB,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAChC,CAAC;YAEF,OAAO,EAAE,SAAS,EAAE,YAAY,EAAE,CAAC;QACrC,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,6CAA6C,EAAE,CAAC,CAAC,CAAC;YAChE,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,YAAY,EAAE,KAAK,EAAE,CAAC;QACnD,CAAC;IACH,CAAC;IAED,MAAM,CAAC,KAAK,CAAC,aAAa,CACxB,GAAW,EACX,sBAA8B,IAAI,EAClC,WAAmB,CAAC,EACpB,UAA4D;QAE5D,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC9B,OAAO,CAAC,KAAK,CAAC,yCAAyC,QAAQ,EAAE,CAAC,CAAC;QAEnE,0CAA0C;QAC1C,MAAM,EAAE,SAAS,EAAE,YAAY,EAAE,GAAG,MAAM,cAAc,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;QAE7E,0CAA0C;QAC1C,IAAI,SAAS,IAAI,YAAY,EAAE,CAAC;YAC9B,OAAO,CAAC,KAAK,CAAC,8EAA8E,CAAC,CAAC;YAC9F,OAAO,IAAI,eAAe,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,CAAC;QACxE,CAAC;QAED,sBAAsB;QACtB,IAAI,CAAC;YACH,OAAO,CAAC,KAAK,CAAC,mDAAmD,GAAG,EAAE,CAAC,CAAC;YACxE,MAAM,cAAc,GAAG,IAAI,cAAc,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,
CAAC;YACrF,MAAM,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAC5C,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;YAE1D,IAAI,CAAC,IAAI,IAAI,SAAS,EAAE,OAAO,EAAE,CAAC;gBAChC,OAAO,CAAC,KAAK,CAAC,uDAAuD,CAAC,CAAC;gBACvE,OAAO,cAAc,CAAC;YACxB,CAAC;QACH,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;QAC/D,CAAC;QAED,+BAA+B;QAC/B,OAAO,CAAC,KAAK,CAAC,mDAAmD,GAAG,EAAE,CAAC,CAAC;QACxE,MAAM,cAAc,GAAG,IAAI,cAAc,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,CAAC;QACrF,MAAM,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC5C,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;QAE1D,IAAI,CAAC,IAAI,IAAI,SAAS,EAAE,OAAO,EAAE,CAAC;YAChC,OAAO,CAAC,KAAK,CAAC,uDAAuD,CAAC,CAAC;YACvE,OAAO,cAAc,CAAC;QACxB,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,4CAA4C,GAAG,EAAE,CAAC,CAAC;QACjE,MAAM,IAAI,KAAK,CAAC,gCAAgC,GAAG,EAAE,CAAC,CAAC;IACzD,CAAC"}
@@ -1,16 +0,0 @@
1
- import { CrawlResult } from '../types.js';
2
- import { BaseCrawler } from './base.js';
3
- export declare class PuppeteerCrawler extends BaseCrawler {
4
- private browser?;
5
- private readonly userAgent;
6
- private readonly LINK_GROUP_SIZE;
7
- private curCrawlCount;
8
- crawl(url: string): AsyncGenerator<CrawlResult, void, unknown>;
9
- private setupPage;
10
- private crawlSitePages;
11
- private gotoPageAndHandleRedirects;
12
- private processPage;
13
- private getLinksFromPage;
14
- private groupLinks;
15
- abort(): void;
16
- }
@@ -1,191 +0,0 @@
1
- import puppeteer from 'puppeteer';
2
- import { BaseCrawler } from './base.js';
3
- import { JSDOM } from 'jsdom';
4
- import { Readability } from '@mozilla/readability';
5
- export class PuppeteerCrawler extends BaseCrawler {
6
- browser;
7
- userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
8
- LINK_GROUP_SIZE = 2;
9
- curCrawlCount = 0;
10
- async *crawl(url) {
11
- try {
12
- this.browser = await puppeteer.launch({
13
- headless: true,
14
- args: [
15
- '--no-sandbox',
16
- '--disable-setuid-sandbox',
17
- '--disable-dev-shm-usage',
18
- '--disable-gpu',
19
- '--window-size=1280,800'
20
- ]
21
- });
22
- const page = await this.browser.newPage();
23
- await this.setupPage(page);
24
- const visitedUrls = new Set();
25
- yield* this.crawlSitePages(page, new URL(url), 0, visitedUrls);
26
- }
27
- finally {
28
- await this.browser?.close();
29
- }
30
- }
31
- async setupPage(page) {
32
- await page.setUserAgent(this.userAgent);
33
- await page.setViewport({ width: 1280, height: 800 });
34
- // Block only unnecessary resources
35
- await page.setRequestInterception(true);
36
- page.on('request', request => {
37
- const resourceType = request.resourceType();
38
- if (['image', 'media', 'font'].includes(resourceType)) {
39
- request.abort();
40
- }
41
- else {
42
- request.continue();
43
- }
44
- });
45
- // Handle JavaScript errors
46
- page.on('pageerror', error => {
47
- console.warn('Page error:', error);
48
- });
49
- // Handle console messages
50
- page.on('console', (msg) => {
51
- const type = msg.type();
52
- if (type === 'error' || type === 'warn') {
53
- console.debug(`Console ${type}:`, msg.text());
54
- }
55
- });
56
- }
57
- async *crawlSitePages(page, curUrl, depth, visitedUrls) {
58
- const urlStr = curUrl.toString();
59
- if (visitedUrls.has(urlStr) || !this.shouldCrawl(urlStr) || depth > this.maxDepth) {
60
- return;
61
- }
62
- try {
63
- // Rate limiting
64
- await this.rateLimit();
65
- // Navigate to page with proper redirect handling
66
- await this.gotoPageAndHandleRedirects(page, urlStr);
67
- // Extract content
68
- const { content, title, links } = await this.processPage(page, curUrl);
69
- visitedUrls.add(urlStr);
70
- this.markUrlAsSeen(urlStr);
71
- this.curCrawlCount++;
72
- yield {
73
- url: urlStr,
74
- path: this.getPathFromUrl(urlStr),
75
- content,
76
- title
77
- };
78
- // Process links in batches
79
- if (depth < this.maxDepth && this.curCrawlCount < this.maxRequestsPerCrawl) {
80
- const linkGroups = this.groupLinks(links);
81
- for (const linkGroup of linkGroups) {
82
- for (const link of linkGroup) {
83
- if (this.curCrawlCount >= this.maxRequestsPerCrawl) {
84
- return;
85
- }
86
- yield* this.crawlSitePages(page, new URL(link), depth + 1, visitedUrls);
87
- }
88
- }
89
- }
90
- }
91
- catch (error) {
92
- console.error(`Error crawling ${urlStr}:`, error);
93
- }
94
- }
95
- async gotoPageAndHandleRedirects(page, url) {
96
- const MAX_PAGE_WAIT_MS = 5000;
97
- await page.goto(url, {
98
- timeout: 0,
99
- waitUntil: 'networkidle2'
100
- });
101
- let responseEventOccurred = false;
102
- const responseHandler = () => responseEventOccurred = true;
103
- const responseWatcher = new Promise((resolve) => {
104
- setTimeout(() => {
105
- if (!responseEventOccurred) {
106
- resolve();
107
- }
108
- else {
109
- setTimeout(() => resolve(), MAX_PAGE_WAIT_MS);
110
- }
111
- }, 500);
112
- });
113
- page.on('response', responseHandler);
114
- await Promise.race([responseWatcher, page.waitForNavigation()]);
115
- page.off('response', responseHandler);
116
- }
117
- async processPage(page, url) {
118
- // Wait for dynamic content
119
- try {
120
- await page.waitForFunction(() => {
121
- const mainContent = document.querySelector('main') || document.querySelector('.content') || document.querySelector('#content');
122
- return mainContent && mainContent.children.length > 0;
123
- }, { timeout: 5000 });
124
- }
125
- catch (error) {
126
- console.warn('Timeout waiting for main content, proceeding anyway');
127
- }
128
- // Extract content using Readability
129
- const html = await page.content();
130
- const dom = new JSDOM(html, { url: url.toString() });
131
- const reader = new Readability(dom.window.document, {
132
- charThreshold: 20,
133
- nbTopCandidates: 5,
134
- maxElemsToParse: 10000
135
- });
136
- const article = reader.parse();
137
- if (!article) {
138
- throw new Error('Failed to parse page content');
139
- }
140
- // Extract links
141
- const links = await this.getLinksFromPage(page, url);
142
- return {
143
- content: article.textContent,
144
- title: article.title,
145
- links
146
- };
147
- }
148
- async getLinksFromPage(page, curUrl) {
149
- const links = await page.$$eval('a', (links) => links.map((a) => a.href));
150
- const cleanedLinks = links
151
- .map(link => {
152
- try {
153
- const url = new URL(link);
154
- url.hash = ''; // Remove hash
155
- return url.href;
156
- }
157
- catch {
158
- return null;
159
- }
160
- })
161
- .filter((link) => {
162
- if (!link)
163
- return false;
164
- try {
165
- const url = new URL(link);
166
- return (url.pathname.startsWith(curUrl.pathname) &&
167
- url.hostname === curUrl.hostname &&
168
- link !== curUrl.href);
169
- }
170
- catch {
171
- return false;
172
- }
173
- });
174
- return Array.from(new Set(cleanedLinks));
175
- }
176
- groupLinks(links) {
177
- return links.reduce((acc, link, i) => {
178
- const groupIndex = Math.floor(i / this.LINK_GROUP_SIZE);
179
- if (!acc[groupIndex]) {
180
- acc.push([]);
181
- }
182
- acc[groupIndex].push(link);
183
- return acc;
184
- }, []);
185
- }
186
- abort() {
187
- super.abort();
188
- void this.browser?.close();
189
- }
190
- }
191
- //# sourceMappingURL=puppeteer.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"puppeteer.js","sourceRoot":"","sources":["../../src/crawler/puppeteer.ts"],"names":[],"mappings":"AAAA,OAAO,SAA4C,MAAM,WAAW,CAAC;AAErE,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAC9B,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AAEnD,MAAM,OAAO,gBAAiB,SAAQ,WAAW;IACvC,OAAO,CAAW;IACT,SAAS,GAAG,qHAAqH,CAAC;IAClI,eAAe,GAAG,CAAC,CAAC;IAC7B,aAAa,GAAG,CAAC,CAAC;IAE1B,KAAK,CAAC,CAAC,KAAK,CAAC,GAAW;QACtB,IAAI,CAAC;YACH,IAAI,CAAC,OAAO,GAAG,MAAM,SAAS,CAAC,MAAM,CAAC;gBACpC,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE;oBACJ,cAAc;oBACd,0BAA0B;oBAC1B,yBAAyB;oBACzB,eAAe;oBACf,wBAAwB;iBACzB;aACF,CAAC,CAAC;YAEH,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YAC1C,MAAM,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YAE3B,MAAM,WAAW,GAAG,IAAI,GAAG,EAAU,CAAC;YACtC,KAAK,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,WAAW,CAAC,CAAC;QACjE,CAAC;gBAAS,CAAC;YACT,MAAM,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,CAAC;QAC9B,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,SAAS,CAAC,IAAU;QAChC,MAAM,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACxC,MAAM,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC;QAErD,mCAAmC;QACnC,MAAM,IAAI,CAAC,sBAAsB,CAAC,IAAI,CAAC,CAAC;QACxC,IAAI,CAAC,EAAE,CAAC,SAAS,EAAE,OAAO,CAAC,EAAE;YAC3B,MAAM,YAAY,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;YAC5C,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;gBACtD,OAAO,CAAC,KAAK,EAAE,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,QAAQ,EAAE,CAAC;YACrB,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,2BAA2B;QAC3B,IAAI,CAAC,EAAE,CAAC,WAAW,EAAE,KAAK,CAAC,EAAE;YAC3B,OAAO,CAAC,IAAI,CAAC,aAAa,EAAE,KAAK,CAAC,CAAC;QACrC,CAAC,CAAC,CAAC;QAEH,0BAA0B;QAC1B,IAAI,CAAC,EAAE,CAAC,SAAS,EAAE,CAAC,GAAmB,EAAE,EAAE;YACzC,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;YACxB,IAAI,IAAI,KAAK,OAAO,IAAI,IAAI,KAAK,MAAM,EAAE,CAAC;gBACxC,OAAO,CAAC,KAAK,CAAC,WAAW,IAAI,GAAG,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;YAChD,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,KAAK,CAAC,CAAC,cAAc,CAC3B,IAAU,EACV,MAAW,EACX,KAAa,EACb,WAAwB;QAExB,MAAM,
MAAM,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAC;QAEjC,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,IAAI,KAAK,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClF,OAAO;QACT,CAAC;QAED,IAAI,CAAC;YACH,gBAAgB;YAChB,MAAM,IAAI,CAAC,SAAS,EAAE,CAAC;YAEvB,iDAAiD;YACjD,MAAM,IAAI,CAAC,0BAA0B,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAEpD,kBAAkB;YAClB,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAEvE,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;YACxB,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAC3B,IAAI,CAAC,aAAa,EAAE,CAAC;YAErB,MAAM;gBACJ,GAAG,EAAE,MAAM;gBACX,IAAI,EAAE,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC;gBACjC,OAAO;gBACP,KAAK;aACN,CAAC;YAEF,2BAA2B;YAC3B,IAAI,KAAK,GAAG,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC,mBAAmB,EAAE,CAAC;gBAC3E,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;gBAC1C,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;oBACnC,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;wBAC7B,IAAI,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,mBAAmB,EAAE,CAAC;4BACnD,OAAO;wBACT,CAAC;wBACD,KAAK,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,GAAG,CAAC,IAAI,CAAC,EAAE,KAAK,GAAG,CAAC,EAAE,WAAW,CAAC,CAAC;oBAC1E,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,kBAAkB,MAAM,GAAG,EAAE,KAAK,CAAC,CAAC;QACpD,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,0BAA0B,CAAC,IAAU,EAAE,GAAW;QAC9D,MAAM,gBAAgB,GAAG,IAAI,CAAC;QAE9B,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;YACnB,OAAO,EAAE,CAAC;YACV,SAAS,EAAE,cAAc;SAC1B,CAAC,CAAC;QAEH,IAAI,qBAAqB,GAAG,KAAK,CAAC;QAClC,MAAM,eAAe,GAAG,GAAG,EAAE,CAAC,qBAAqB,GAAG,IAAI,CAAC;QAE3D,MAAM,eAAe,GAAG,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;YACpD,UAAU,CAAC,GAAG,EAAE;gBACd,IAAI,CAAC,qBAAqB,EAAE,CAAC;oBAC3B,OAAO,EAAE,CAAC;gBACZ,CAAC;qBAAM,CAAC;oBACN,UAAU,CAAC,GAAG,EAAE,CAAC,OAAO,EAAE,EAAE,gBAAgB,CAAC,CAAC;gBAChD,CAAC;YACH,CAAC,EAAE,GAAG,CAAC,CAAC;QACV,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,EAAE,CAAC,UAAU,EAAE,eAAe,CAAC,CAAC;QACrC,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC,eAAe,EAAE,IAAI,CAAC,iBAAiB,EAAE,CAAC,CAAC,CAAC;QAChE,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,eAAe,CAAC,CAAC;IACxC,CAAC;IAEO,KAAK,CAAC,WAAW
,CAAC,IAAU,EAAE,GAAQ;QAC5C,2BAA2B;QAC3B,IAAI,CAAC;YACH,MAAM,IAAI,CAAC,eAAe,CAAC,GAAG,EAAE;gBAC9B,MAAM,WAAW,GAAG,QAAQ,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,UAAU,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,UAAU,CAAC,CAAC;gBAC/H,OAAO,WAAW,IAAI,WAAW,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC;YACxD,CAAC,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QACxB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,qDAAqD,CAAC,CAAC;QACtE,CAAC;QAED,oCAAoC;QACpC,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;QAClC,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QACrD,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE;YAClD,aAAa,EAAE,EAAE;YACjB,eAAe,EAAE,CAAC;YAClB,eAAe,EAAE,KAAK;SACvB,CAAC,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAE/B,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,8BAA8B,CAAC,CAAC;QAClD,CAAC;QAED,gBAAgB;QAChB,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QAErD,OAAO;YACL,OAAO,EAAE,OAAO,CAAC,WAAW;YAC5B,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,KAAK;SACN,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,gBAAgB,CAAC,IAAU,EAAE,MAAW;QACpD,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;QAE1E,MAAM,YAAY,GAAG,KAAK;aACvB,GAAG,CAAC,IAAI,CAAC,EAAE;YACV,IAAI,CAAC;gBACH,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;gBAC1B,GAAG,CAAC,IAAI,GAAG,EAAE,CAAC,CAAC,cAAc;gBAC7B,OAAO,GAAG,CAAC,IAAI,CAAC;YAClB,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC,CAAC;aACD,MAAM,CAAC,CAAC,IAAI,EAAkB,EAAE;YAC/B,IAAI,CAAC,IAAI;gBAAE,OAAO,KAAK,CAAC;YACxB,IAAI,CAAC;gBACH,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;gBAC1B,OAAO,CACL,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC,QAAQ,CAAC;oBACxC,GAAG,CAAC,QAAQ,KAAK,MAAM,CAAC,QAAQ;oBAChC,IAAI,KAAK,MAAM,CAAC,IAAI,CACrB,CAAC;YACJ,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC,CAAC,CAAC;QAEL,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC;IAC3C,CAAC;IAEO,UAAU,CAAC,KAAe;QAChC,OAAO,KAAK,CAA
C,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE;YACnC,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,IAAI,CAAC,eAAe,CAAC,CAAC;YACxD,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;gBACrB,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,CAAC;YACD,GAAG,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC3B,OAAO,GAAG,CAAC;QACb,CAAC,EAAE,EAAgB,CAAC,CAAC;IACvB,CAAC;IAED,KAAK;QACH,KAAK,CAAC,KAAK,EAAE,CAAC;QACd,KAAK,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,CAAC;IAC7B,CAAC;CACF"}
@@ -1,8 +0,0 @@
1
- import { EmbeddingsProvider } from './types.js';
// Type declaration for the OpenAI-backed embeddings provider. It implements
// the project's EmbeddingsProvider interface (imported from './types.js').
2
- export declare class OpenAIEmbeddings implements EmbeddingsProvider {
// Private members: the API client handle and the embedding cache. Their types
// are intentionally not exposed by the declaration file.
3
- private openai;
4
- private cache;
// Embedding dimensionality; the implementation passes this value as the
// `dimensions` option of the embeddings request.
5
- readonly dimensions = 1536;
// The implementation throws an Error when apiKey is empty/missing.
6
- constructor(apiKey: string);
// Resolves to the embedding vector for `text`. The implementation rejects
// non-string, empty, or whitespace-only input with an Error.
7
- embed(text: string): Promise<number[]>;
8
- }
@@ -1,56 +0,0 @@
1
- import OpenAI from 'openai';
2
- import { logger } from '../util/logger.js';
3
- export class OpenAIEmbeddings {
4
- openai;
5
- cache;
6
- dimensions = 1536; // text-embedding-3-small dimensions
7
- constructor(apiKey) {
8
- if (!apiKey) {
9
- throw new Error('OpenAI API key is required');
10
- }
11
- this.openai = new OpenAI({ apiKey });
12
- this.cache = new Map();
13
- }
14
- async embed(text) {
15
- // Ensure input is a string and not empty
16
- if (!text || typeof text !== 'string') {
17
- throw new Error('Input text must be a non-empty string');
18
- }
19
- // Check cache first
20
- const cacheKey = text.slice(0, 1000); // Limit cache key size
21
- const cached = this.cache.get(cacheKey);
22
- if (cached) {
23
- return cached;
24
- }
25
- try {
26
- const cleanText = text.trim();
27
- if (!cleanText) {
28
- throw new Error('Input text is empty after trimming');
29
- }
30
- const response = await this.openai.embeddings.create({
31
- model: "text-embedding-3-small",
32
- input: cleanText,
33
- dimensions: this.dimensions
34
- });
35
- if (!response.data?.[0]?.embedding) {
36
- throw new Error('No embedding returned from OpenAI');
37
- }
38
- const embedding = response.data[0].embedding;
39
- // Cache the result
40
- this.cache.set(cacheKey, embedding);
41
- // Limit cache size to prevent memory issues
42
- if (this.cache.size > 1000) {
43
- const firstKey = this.cache.keys().next().value;
44
- if (firstKey) {
45
- this.cache.delete(firstKey);
46
- }
47
- }
48
- return embedding;
49
- }
50
- catch (error) {
51
- logger.debug('Error generating embedding:', error);
52
- throw error;
53
- }
54
- }
55
- }
56
- //# sourceMappingURL=openai.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"openai.js","sourceRoot":"","sources":["../../src/embeddings/openai.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,QAAQ,CAAC;AAE5B,OAAO,EAAE,MAAM,EAAE,MAAM,mBAAmB,CAAC;AAE3C,MAAM,OAAO,gBAAgB;IACnB,MAAM,CAAS;IACf,KAAK,CAAwB;IAC5B,UAAU,GAAG,IAAI,CAAC,CAAC,oCAAoC;IAEhE,YAAY,MAAc;QACxB,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;QAChD,CAAC;QACD,IAAI,CAAC,MAAM,GAAG,IAAI,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACrC,IAAI,CAAC,KAAK,GAAG,IAAI,GAAG,EAAE,CAAC;IACzB,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,IAAY;QACtB,yCAAyC;QACzC,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE,CAAC;YACtC,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;QAC3D,CAAC;QAED,oBAAoB;QACpB,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,uBAAuB;QAC7D,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACxC,IAAI,MAAM,EAAE,CAAC;YACX,OAAO,MAAM,CAAC;QAChB,CAAC;QAED,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,MAAM,IAAI,KAAK,CAAC,oCAAoC,CAAC,CAAC;YACxD,CAAC;YAED,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC;gBACnD,KAAK,EAAE,wBAAwB;gBAC/B,KAAK,EAAE,SAAS;gBAChB,UAAU,EAAE,IAAI,CAAC,UAAU;aAC5B,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,CAAC;gBACnC,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;YACvD,CAAC;YAED,MAAM,SAAS,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAE7C,mBAAmB;YACnB,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;YAEpC,4CAA4C;YAC5C,IAAI,IAAI,CAAC,KAAK,CAAC,IAAI,GAAG,IAAI,EAAE,CAAC;gBAC3B,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC;gBAChD,IAAI,QAAQ,EAAE,CAAC;oBACb,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;gBAC9B,CAAC;YACH,CAAC;YAED,OAAO,SAAS,CAAC;QACnB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,CAAC,KAAK,CAAC,6BAA6B,EAAE,KAAK,CAAC,CAAC;YACnD,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;CACF"}
@@ -1,10 +0,0 @@
1
- import { GeneratedResponse } from "./response-generator.js";
// Shape of one cache entry: the query string together with the response that
// was generated for it.
2
- export interface CachedResponse {
// The query text the response was stored under.
3
- query: string;
// The stored response (type comes from './response-generator.js').
4
- response: GeneratedResponse;
5
- }
// Type declaration for an in-memory cache of generated responses, keyed by
// the query string.
6
- export declare class RAGCache {
// Backing store; the implementation initializes it as a Map.
7
- private cache;
// Resolves to the entry stored for `query`, or null when there is none.
8
- getCachedResponse(query: string): Promise<CachedResponse | null>;
// Stores `response` under `query` as a { query, response } entry.
9
- cacheResponse(query: string, response: GeneratedResponse): Promise<void>;
10
- }
@@ -1,10 +0,0 @@
1
- export class RAGCache {
2
- cache = new Map();
3
- async getCachedResponse(query) {
4
- return this.cache.get(query) || null;
5
- }
6
- async cacheResponse(query, response) {
7
- this.cache.set(query, { query, response });
8
- }
9
- }
10
- //# sourceMappingURL=cache.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"cache.js","sourceRoot":"","sources":["../../src/rag/cache.ts"],"names":[],"mappings":"AAOA,MAAM,OAAO,QAAQ;IACX,KAAK,GAAgC,IAAI,GAAG,EAAE,CAAC;IAEvD,KAAK,CAAC,iBAAiB,CAAC,KAAa;QACnC,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC;IACvC,CAAC;IAED,KAAK,CAAC,aAAa,CAAC,KAAa,EAAE,QAA2B;QAC5D,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;IAC7C,CAAC;CACF"}
@@ -1,11 +0,0 @@
// Result of CodeGenerator.generateCodeExample: three generated text snippets.
1
- export interface CodeExample {
// Import statement for the component.
2
- imports: string;
// Component props rendered as a single string.
3
- props: string;
// Usage snippet for the component (an element tag in the implementation).
4
- usage: string;
5
- }
// Type declaration for the generator that turns component info into a
// CodeExample.
6
- export declare class CodeGenerator {
// Both arguments are untyped (`any`). The implementation reads
// component.name, component.props and context.importPath.
7
- generateCodeExample(component: any, context: any): Promise<CodeExample>;
// Private helpers, one per CodeExample field.
8
- private generateImports;
9
- private generateProps;
10
- private generateUsage;
11
- }
@@ -1,30 +0,0 @@
1
- export class CodeGenerator {
2
- async generateCodeExample(component, context) {
3
- // Extract component name and props from the component info
4
- const componentName = component.name || 'Example';
5
- const componentProps = component.props || {};
6
- // Generate imports based on context
7
- const imports = this.generateImports(componentName, context);
8
- // Generate props string
9
- const props = this.generateProps(componentProps);
10
- // Generate usage example
11
- const usage = this.generateUsage(componentName, props);
12
- return { imports, props, usage };
13
- }
14
- generateImports(componentName, context) {
15
- // Look for import path in context
16
- const importPath = context.importPath || 'example';
17
- return `import ${componentName} from '${importPath}';`;
18
- }
19
- generateProps(props) {
20
- return Object.entries(props)
21
- .map(([key, value]) => `${key}: ${JSON.stringify(value)}`)
22
- .join(', ');
23
- }
24
- generateUsage(componentName, props) {
25
- return props
26
- ? `<${componentName} ${props} />`
27
- : `<${componentName} />`;
28
- }
29
- }
30
- //# sourceMappingURL=code-generator.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"code-generator.js","sourceRoot":"","sources":["../../src/rag/code-generator.ts"],"names":[],"mappings":"AAMA,MAAM,OAAO,aAAa;IACxB,KAAK,CAAC,mBAAmB,CAAC,SAAc,EAAE,OAAY;QACpD,2DAA2D;QAC3D,MAAM,aAAa,GAAG,SAAS,CAAC,IAAI,IAAI,SAAS,CAAC;QAClD,MAAM,cAAc,GAAG,SAAS,CAAC,KAAK,IAAI,EAAE,CAAC;QAE7C,oCAAoC;QACpC,MAAM,OAAO,GAAG,IAAI,CAAC,eAAe,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;QAE7D,wBAAwB;QACxB,MAAM,KAAK,GAAG,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,CAAC;QAEjD,yBAAyB;QACzB,MAAM,KAAK,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,EAAE,KAAK,CAAC,CAAC;QAEvD,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;IACnC,CAAC;IAEO,eAAe,CAAC,aAAqB,EAAE,OAAY;QACzD,kCAAkC;QAClC,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,SAAS,CAAC;QACnD,OAAO,UAAU,aAAa,UAAU,UAAU,IAAI,CAAC;IACzD,CAAC;IAEO,aAAa,CAAC,KAA0B;QAC9C,OAAO,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC;aACzB,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,KAAK,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,CAAC;aACzD,IAAI,CAAC,IAAI,CAAC,CAAC;IAChB,CAAC;IAEO,aAAa,CAAC,aAAqB,EAAE,KAAa;QACxD,OAAO,KAAK;YACV,CAAC,CAAC,IAAI,aAAa,IAAI,KAAK,KAAK;YACjC,CAAC,CAAC,IAAI,aAAa,KAAK,CAAC;IAC7B,CAAC;CACF"}
@@ -1,23 +0,0 @@
1
- import { EnhancedChunk } from "../types/rag.js";
2
- import { ComponentRelationship } from "../crawler/content-extractor-types.js";
// Result type of ContextAssembler.assembleContext: the chunks, the component
// relationships, and a block of descriptive metadata.
3
- export interface AssembledContext {
// Chunks arranged for hierarchical use (ordering rules are defined by the
// implementation, which is not shown in this declaration).
4
- hierarchicalContext: EnhancedChunk[];
// Relationships between components (type from the crawler's extractor types).
5
- relationships: ComponentRelationship[];
6
- metadata: {
7
- summary: string;
8
- topics: string[];
// One of three fixed difficulty levels.
9
- complexity: 'basic' | 'intermediate' | 'advanced';
10
- prerequisites: string[];
11
- frameworks: string[];
12
- languages: string[];
13
- };
14
- }
// Type declaration for the class that turns a list of EnhancedChunk objects
// into an AssembledContext.
15
- export declare class ContextAssembler {
// The only public entry point; asynchronous.
16
- assembleContext(chunks: EnhancedChunk[]): Promise<AssembledContext>;
// Private helpers. Only their names are declared here; presumably each one
// corresponds to one assembly step - confirm against the implementation.
17
- private groupChunksByType;
18
- private extractRelationships;
19
- private buildHierarchy;
20
- private consolidateMetadata;
21
- private generateSummary;
22
- private deduplicateRelationships;
23
- }