@cosmocoder/mcp-web-docs 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240)
  1. package/LICENSE +22 -0
  2. package/README.md +368 -0
  3. package/build/__mocks__/embeddings.d.ts +17 -0
  4. package/build/__mocks__/embeddings.js +66 -0
  5. package/build/__mocks__/embeddings.js.map +1 -0
  6. package/build/config.d.ts +44 -0
  7. package/build/config.js +158 -0
  8. package/build/config.js.map +1 -0
  9. package/build/config.test.d.ts +1 -0
  10. package/build/config.test.js +165 -0
  11. package/build/config.test.js.map +1 -0
  12. package/build/crawler/auth.d.ts +128 -0
  13. package/build/crawler/auth.js +546 -0
  14. package/build/crawler/auth.js.map +1 -0
  15. package/build/crawler/auth.test.d.ts +1 -0
  16. package/build/crawler/auth.test.js +174 -0
  17. package/build/crawler/auth.test.js.map +1 -0
  18. package/build/crawler/base.d.ts +24 -0
  19. package/build/crawler/base.js +149 -0
  20. package/build/crawler/base.js.map +1 -0
  21. package/build/crawler/base.test.d.ts +1 -0
  22. package/build/crawler/base.test.js +234 -0
  23. package/build/crawler/base.test.js.map +1 -0
  24. package/build/crawler/browser-config.d.ts +2 -0
  25. package/build/crawler/browser-config.js +29 -0
  26. package/build/crawler/browser-config.js.map +1 -0
  27. package/build/crawler/browser-config.test.d.ts +1 -0
  28. package/build/crawler/browser-config.test.js +56 -0
  29. package/build/crawler/browser-config.test.js.map +1 -0
  30. package/build/crawler/cheerio.d.ts +11 -0
  31. package/build/crawler/cheerio.js +134 -0
  32. package/build/crawler/cheerio.js.map +1 -0
  33. package/build/crawler/chromium.d.ts +21 -0
  34. package/build/crawler/chromium.js +596 -0
  35. package/build/crawler/chromium.js.map +1 -0
  36. package/build/crawler/content-extractor-types.d.ts +25 -0
  37. package/build/crawler/content-extractor-types.js +2 -0
  38. package/build/crawler/content-extractor-types.js.map +1 -0
  39. package/build/crawler/content-extractors.d.ts +9 -0
  40. package/build/crawler/content-extractors.js +9 -0
  41. package/build/crawler/content-extractors.js.map +1 -0
  42. package/build/crawler/content-utils.d.ts +2 -0
  43. package/build/crawler/content-utils.js +22 -0
  44. package/build/crawler/content-utils.js.map +1 -0
  45. package/build/crawler/content-utils.test.d.ts +1 -0
  46. package/build/crawler/content-utils.test.js +99 -0
  47. package/build/crawler/content-utils.test.js.map +1 -0
  48. package/build/crawler/crawlee-crawler.d.ts +63 -0
  49. package/build/crawler/crawlee-crawler.js +342 -0
  50. package/build/crawler/crawlee-crawler.js.map +1 -0
  51. package/build/crawler/crawlee-crawler.test.d.ts +1 -0
  52. package/build/crawler/crawlee-crawler.test.js +280 -0
  53. package/build/crawler/crawlee-crawler.test.js.map +1 -0
  54. package/build/crawler/default-extractor.d.ts +4 -0
  55. package/build/crawler/default-extractor.js +26 -0
  56. package/build/crawler/default-extractor.js.map +1 -0
  57. package/build/crawler/default-extractor.test.d.ts +1 -0
  58. package/build/crawler/default-extractor.test.js +200 -0
  59. package/build/crawler/default-extractor.test.js.map +1 -0
  60. package/build/crawler/default.d.ts +11 -0
  61. package/build/crawler/default.js +138 -0
  62. package/build/crawler/default.js.map +1 -0
  63. package/build/crawler/docs-crawler.d.ts +26 -0
  64. package/build/crawler/docs-crawler.js +97 -0
  65. package/build/crawler/docs-crawler.js.map +1 -0
  66. package/build/crawler/docs-crawler.test.d.ts +1 -0
  67. package/build/crawler/docs-crawler.test.js +185 -0
  68. package/build/crawler/docs-crawler.test.js.map +1 -0
  69. package/build/crawler/factory.d.ts +6 -0
  70. package/build/crawler/factory.js +83 -0
  71. package/build/crawler/factory.js.map +1 -0
  72. package/build/crawler/github-pages-extractor.d.ts +4 -0
  73. package/build/crawler/github-pages-extractor.js +33 -0
  74. package/build/crawler/github-pages-extractor.js.map +1 -0
  75. package/build/crawler/github-pages-extractor.test.d.ts +1 -0
  76. package/build/crawler/github-pages-extractor.test.js +184 -0
  77. package/build/crawler/github-pages-extractor.test.js.map +1 -0
  78. package/build/crawler/github.d.ts +20 -0
  79. package/build/crawler/github.js +181 -0
  80. package/build/crawler/github.js.map +1 -0
  81. package/build/crawler/github.test.d.ts +1 -0
  82. package/build/crawler/github.test.js +326 -0
  83. package/build/crawler/github.test.js.map +1 -0
  84. package/build/crawler/puppeteer.d.ts +16 -0
  85. package/build/crawler/puppeteer.js +191 -0
  86. package/build/crawler/puppeteer.js.map +1 -0
  87. package/build/crawler/queue-manager.d.ts +43 -0
  88. package/build/crawler/queue-manager.js +169 -0
  89. package/build/crawler/queue-manager.js.map +1 -0
  90. package/build/crawler/queue-manager.test.d.ts +1 -0
  91. package/build/crawler/queue-manager.test.js +509 -0
  92. package/build/crawler/queue-manager.test.js.map +1 -0
  93. package/build/crawler/site-rules.d.ts +11 -0
  94. package/build/crawler/site-rules.js +104 -0
  95. package/build/crawler/site-rules.js.map +1 -0
  96. package/build/crawler/site-rules.test.d.ts +1 -0
  97. package/build/crawler/site-rules.test.js +139 -0
  98. package/build/crawler/site-rules.test.js.map +1 -0
  99. package/build/crawler/storybook-extractor.d.ts +34 -0
  100. package/build/crawler/storybook-extractor.js +767 -0
  101. package/build/crawler/storybook-extractor.js.map +1 -0
  102. package/build/crawler/storybook-extractor.test.d.ts +1 -0
  103. package/build/crawler/storybook-extractor.test.js +491 -0
  104. package/build/crawler/storybook-extractor.test.js.map +1 -0
  105. package/build/embeddings/fastembed.d.ts +25 -0
  106. package/build/embeddings/fastembed.js +188 -0
  107. package/build/embeddings/fastembed.js.map +1 -0
  108. package/build/embeddings/fastembed.test.d.ts +1 -0
  109. package/build/embeddings/fastembed.test.js +307 -0
  110. package/build/embeddings/fastembed.test.js.map +1 -0
  111. package/build/embeddings/openai.d.ts +8 -0
  112. package/build/embeddings/openai.js +56 -0
  113. package/build/embeddings/openai.js.map +1 -0
  114. package/build/embeddings/types.d.ts +4 -0
  115. package/build/embeddings/types.js +2 -0
  116. package/build/embeddings/types.js.map +1 -0
  117. package/build/index.d.ts +2 -0
  118. package/build/index.js +1007 -0
  119. package/build/index.js.map +1 -0
  120. package/build/index.test.d.ts +1 -0
  121. package/build/index.test.js +364 -0
  122. package/build/index.test.js.map +1 -0
  123. package/build/indexing/queue-manager.d.ts +36 -0
  124. package/build/indexing/queue-manager.js +86 -0
  125. package/build/indexing/queue-manager.js.map +1 -0
  126. package/build/indexing/queue-manager.test.d.ts +1 -0
  127. package/build/indexing/queue-manager.test.js +257 -0
  128. package/build/indexing/queue-manager.test.js.map +1 -0
  129. package/build/indexing/status.d.ts +39 -0
  130. package/build/indexing/status.js +207 -0
  131. package/build/indexing/status.js.map +1 -0
  132. package/build/indexing/status.test.d.ts +1 -0
  133. package/build/indexing/status.test.js +246 -0
  134. package/build/indexing/status.test.js.map +1 -0
  135. package/build/processor/content.d.ts +16 -0
  136. package/build/processor/content.js +286 -0
  137. package/build/processor/content.js.map +1 -0
  138. package/build/processor/content.test.d.ts +1 -0
  139. package/build/processor/content.test.js +369 -0
  140. package/build/processor/content.test.js.map +1 -0
  141. package/build/processor/markdown.d.ts +11 -0
  142. package/build/processor/markdown.js +256 -0
  143. package/build/processor/markdown.js.map +1 -0
  144. package/build/processor/markdown.test.d.ts +1 -0
  145. package/build/processor/markdown.test.js +312 -0
  146. package/build/processor/markdown.test.js.map +1 -0
  147. package/build/processor/metadata-parser.d.ts +37 -0
  148. package/build/processor/metadata-parser.js +245 -0
  149. package/build/processor/metadata-parser.js.map +1 -0
  150. package/build/processor/metadata-parser.test.d.ts +1 -0
  151. package/build/processor/metadata-parser.test.js +357 -0
  152. package/build/processor/metadata-parser.test.js.map +1 -0
  153. package/build/processor/processor.d.ts +8 -0
  154. package/build/processor/processor.js +190 -0
  155. package/build/processor/processor.js.map +1 -0
  156. package/build/processor/processor.test.d.ts +1 -0
  157. package/build/processor/processor.test.js +357 -0
  158. package/build/processor/processor.test.js.map +1 -0
  159. package/build/rag/cache.d.ts +10 -0
  160. package/build/rag/cache.js +10 -0
  161. package/build/rag/cache.js.map +1 -0
  162. package/build/rag/code-generator.d.ts +11 -0
  163. package/build/rag/code-generator.js +30 -0
  164. package/build/rag/code-generator.js.map +1 -0
  165. package/build/rag/context-assembler.d.ts +23 -0
  166. package/build/rag/context-assembler.js +113 -0
  167. package/build/rag/context-assembler.js.map +1 -0
  168. package/build/rag/docs-search.d.ts +55 -0
  169. package/build/rag/docs-search.js +380 -0
  170. package/build/rag/docs-search.js.map +1 -0
  171. package/build/rag/pipeline.d.ts +26 -0
  172. package/build/rag/pipeline.js +91 -0
  173. package/build/rag/pipeline.js.map +1 -0
  174. package/build/rag/query-processor.d.ts +14 -0
  175. package/build/rag/query-processor.js +57 -0
  176. package/build/rag/query-processor.js.map +1 -0
  177. package/build/rag/reranker.d.ts +55 -0
  178. package/build/rag/reranker.js +210 -0
  179. package/build/rag/reranker.js.map +1 -0
  180. package/build/rag/response-generator.d.ts +20 -0
  181. package/build/rag/response-generator.js +101 -0
  182. package/build/rag/response-generator.js.map +1 -0
  183. package/build/rag/retriever.d.ts +19 -0
  184. package/build/rag/retriever.js +111 -0
  185. package/build/rag/retriever.js.map +1 -0
  186. package/build/rag/validator.d.ts +22 -0
  187. package/build/rag/validator.js +128 -0
  188. package/build/rag/validator.js.map +1 -0
  189. package/build/rag/version-manager.d.ts +23 -0
  190. package/build/rag/version-manager.js +98 -0
  191. package/build/rag/version-manager.js.map +1 -0
  192. package/build/setupTests.d.ts +4 -0
  193. package/build/setupTests.js +50 -0
  194. package/build/setupTests.js.map +1 -0
  195. package/build/storage/storage.d.ts +38 -0
  196. package/build/storage/storage.js +700 -0
  197. package/build/storage/storage.js.map +1 -0
  198. package/build/storage/storage.test.d.ts +1 -0
  199. package/build/storage/storage.test.js +338 -0
  200. package/build/storage/storage.test.js.map +1 -0
  201. package/build/types/rag.d.ts +27 -0
  202. package/build/types/rag.js +2 -0
  203. package/build/types/rag.js.map +1 -0
  204. package/build/types.d.ts +120 -0
  205. package/build/types.js +2 -0
  206. package/build/types.js.map +1 -0
  207. package/build/util/content-utils.d.ts +31 -0
  208. package/build/util/content-utils.js +120 -0
  209. package/build/util/content-utils.js.map +1 -0
  210. package/build/util/content.d.ts +1 -0
  211. package/build/util/content.js +16 -0
  212. package/build/util/content.js.map +1 -0
  213. package/build/util/docs.d.ts +1 -0
  214. package/build/util/docs.js +26 -0
  215. package/build/util/docs.js.map +1 -0
  216. package/build/util/docs.test.d.ts +1 -0
  217. package/build/util/docs.test.js +49 -0
  218. package/build/util/docs.test.js.map +1 -0
  219. package/build/util/favicon.d.ts +6 -0
  220. package/build/util/favicon.js +88 -0
  221. package/build/util/favicon.js.map +1 -0
  222. package/build/util/favicon.test.d.ts +1 -0
  223. package/build/util/favicon.test.js +140 -0
  224. package/build/util/favicon.test.js.map +1 -0
  225. package/build/util/logger.d.ts +17 -0
  226. package/build/util/logger.js +72 -0
  227. package/build/util/logger.js.map +1 -0
  228. package/build/util/logger.test.d.ts +1 -0
  229. package/build/util/logger.test.js +46 -0
  230. package/build/util/logger.test.js.map +1 -0
  231. package/build/util/security.d.ts +312 -0
  232. package/build/util/security.js +719 -0
  233. package/build/util/security.js.map +1 -0
  234. package/build/util/security.test.d.ts +1 -0
  235. package/build/util/security.test.js +524 -0
  236. package/build/util/security.test.js.map +1 -0
  237. package/build/util/site-detector.d.ts +22 -0
  238. package/build/util/site-detector.js +42 -0
  239. package/build/util/site-detector.js.map +1 -0
  240. package/package.json +112 -0
@@ -0,0 +1,326 @@
1
+ import { GitHubCrawler } from './github.js';
2
+ describe('GitHubCrawler', () => {
3
+ let crawler;
4
+ beforeEach(() => { // per-test setup: reset the shared fetch mock and build a new default crawler
5
+ fetchMock.resetMocks();
6
+ crawler = new GitHubCrawler();
7
+ // Stub rateLimit (the method lives in BaseCrawler) so it resolves immediately and tests skip its delays
8
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
9
+ vi.spyOn(crawler, 'rateLimit').mockResolvedValue(undefined);
10
+ });
11
+ describe('constructor', () => {
12
+ it('should initialize with default values', () => {
13
+ const defaultCrawler = new GitHubCrawler();
14
+ expect(defaultCrawler).toBeDefined();
15
+ });
16
+ it('should accept custom maxDepth and maxRequestsPerCrawl', () => {
17
+ const customCrawler = new GitHubCrawler(10, 500);
18
+ expect(customCrawler).toBeDefined();
19
+ });
20
+ it('should accept GitHub token', () => {
21
+ const tokenCrawler = new GitHubCrawler(4, 1000, 'github_token_123');
22
+ expect(tokenCrawler).toBeDefined();
23
+ });
24
+ it('should accept progress callback', () => {
25
+ const progressFn = vi.fn();
26
+ const progressCrawler = new GitHubCrawler(4, 1000, undefined, progressFn);
27
+ expect(progressCrawler).toBeDefined();
28
+ });
29
+ });
30
+ describe('crawl', () => {
31
+ it('should reject invalid GitHub URLs', async () => {
32
+ const results = [];
33
+ // Non-GitHub URL
34
+ for await (const result of crawler.crawl('https://example.com/owner/repo')) {
35
+ results.push(result);
36
+ }
37
+ expect(results).toHaveLength(0);
38
+ });
39
+ it('should reject URLs without owner/repo', async () => {
40
+ const results = [];
41
+ for await (const result of crawler.crawl('https://github.com')) {
42
+ results.push(result);
43
+ }
44
+ expect(results).toHaveLength(0);
45
+ });
46
+ it('should crawl documentation directory when found', async () => {
47
+ // First call: list root directory (it contains a 'docs' dir next to 'src')
48
+ fetchMock.mockResponseOnce(JSON.stringify([
49
+ { path: 'docs', type: 'dir', name: 'docs', url: 'https://api.github.com/repos/owner/repo/contents/docs' },
50
+ { path: 'src', type: 'dir', name: 'src', url: 'https://api.github.com/repos/owner/repo/contents/src' },
51
+ ]));
52
+ // Second call: list docs directory (two markdown files)
53
+ fetchMock.mockResponseOnce(JSON.stringify([
54
+ {
55
+ path: 'docs/guide.md',
56
+ type: 'file',
57
+ name: 'guide.md',
58
+ url: 'https://api.github.com/repos/owner/repo/contents/docs/guide.md',
59
+ },
60
+ { path: 'docs/api.md', type: 'file', name: 'api.md', url: 'https://api.github.com/repos/owner/repo/contents/docs/api.md' },
61
+ ]));
62
+ // Third call: fetch guide.md content
63
+ fetchMock.mockResponseOnce('# Guide\n\nThis is the guide content.');
64
+ // Fourth call: fetch api.md content
65
+ fetchMock.mockResponseOnce('# API Reference\n\nAPI documentation.');
66
+ const results = [];
67
+ for await (const result of crawler.crawl('https://github.com/owner/repo')) {
68
+ results.push(result);
69
+ }
70
+ expect(results).toHaveLength(2); // one result per file listed under docs/
71
+ expect(results[0].path).toBe('docs/guide.md'); // results follow listing order: guide.md first
72
+ expect(results[0].content).toContain('Guide');
73
+ expect(results[1].path).toBe('docs/api.md');
74
+ });
75
+ it('should handle .git extension in repo URL', async () => {
76
+ const rootFiles = [
77
+ { path: 'README.md', type: 'file', name: 'README.md', url: 'https://api.github.com/repos/owner/repo/contents/README.md' },
78
+ ];
79
+ // First call: findDocumentationDirs checks root
80
+ fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
81
+ // Second call: processDirectory fetches root again (no doc dirs found)
82
+ fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
83
+ // Third call: fetch README.md content
84
+ fetchMock.mockResponseOnce('# README\n\nProject readme.');
85
+ const results = [];
86
+ for await (const result of crawler.crawl('https://github.com/owner/repo.git')) {
87
+ results.push(result);
88
+ }
89
+ expect(results).toHaveLength(1); // the single README.md is still crawled when the URL ends in '.git'
90
+ expect(results[0].title).toBe('README');
91
+ });
92
+ it('should skip non-markdown files', async () => {
93
+ const rootFiles = [
94
+ { path: 'index.js', type: 'file', name: 'index.js', url: 'https://api.github.com/repos/owner/repo/contents/index.js' },
95
+ { path: 'style.css', type: 'file', name: 'style.css', url: 'https://api.github.com/repos/owner/repo/contents/style.css' },
96
+ { path: 'README.md', type: 'file', name: 'README.md', url: 'https://api.github.com/repos/owner/repo/contents/README.md' },
97
+ ];
98
+ // First call: findDocumentationDirs checks root
99
+ fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
100
+ // Second call: processDirectory fetches root again (no doc dirs found)
101
+ fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
102
+ // Third call: fetch README.md content (no content is queued for index.js or style.css)
103
+ fetchMock.mockResponseOnce('# README');
104
+ const results = [];
105
+ for await (const result of crawler.crawl('https://github.com/owner/repo')) {
106
+ results.push(result);
107
+ }
108
+ expect(results).toHaveLength(1); // of the three listed files, only the .md one is yielded
109
+ expect(results[0].path).toBe('README.md');
110
+ });
111
+ it('should handle various markdown extensions', async () => {
112
+ const rootFiles = [
113
+ { path: 'doc.md', type: 'file', name: 'doc.md', url: 'https://api.github.com/repos/owner/repo/contents/doc.md' },
114
+ { path: 'page.mdx', type: 'file', name: 'page.mdx', url: 'https://api.github.com/repos/owner/repo/contents/page.mdx' },
115
+ {
116
+ path: 'guide.markdown',
117
+ type: 'file',
118
+ name: 'guide.markdown',
119
+ url: 'https://api.github.com/repos/owner/repo/contents/guide.markdown',
120
+ },
121
+ ];
122
+ // First call: findDocumentationDirs checks root
123
+ fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
124
+ // Second call: processDirectory fetches root again (no doc dirs found)
125
+ fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
126
+ // File content fetches: one queued body per listed file (.md, .mdx, .markdown)
127
+ fetchMock.mockResponseOnce('# Doc');
128
+ fetchMock.mockResponseOnce('# Page');
129
+ fetchMock.mockResponseOnce('# Guide');
130
+ const results = [];
131
+ for await (const result of crawler.crawl('https://github.com/owner/repo')) {
132
+ results.push(result);
133
+ }
134
+ expect(results).toHaveLength(3); // all three extensions are treated as documentation
135
+ });
136
+ it('should skip directories like node_modules, vendor, test, etc.', async () => {
137
+ fetchMock.mockResponseOnce(JSON.stringify([ // queued 1st: root listing with three dirs to skip plus 'docs'
138
+ {
139
+ path: 'node_modules',
140
+ type: 'dir',
141
+ name: 'node_modules',
142
+ url: 'https://api.github.com/repos/owner/repo/contents/node_modules',
143
+ },
144
+ { path: 'vendor', type: 'dir', name: 'vendor', url: 'https://api.github.com/repos/owner/repo/contents/vendor' },
145
+ { path: 'test', type: 'dir', name: 'test', url: 'https://api.github.com/repos/owner/repo/contents/test' },
146
+ { path: 'docs', type: 'dir', name: 'docs', url: 'https://api.github.com/repos/owner/repo/contents/docs' },
147
+ ]));
148
+ fetchMock.mockResponseOnce(JSON.stringify([ // queued 2nd: listing of docs/ (the only directory given a listing)
149
+ {
150
+ path: 'docs/guide.md',
151
+ type: 'file',
152
+ name: 'guide.md',
153
+ url: 'https://api.github.com/repos/owner/repo/contents/docs/guide.md',
154
+ },
155
+ ]));
156
+ fetchMock.mockResponseOnce('# Guide'); // queued 3rd: body intended as docs/guide.md content
157
+ const results = [];
158
+ for await (const result of crawler.crawl('https://github.com/owner/repo')) {
159
+ results.push(result);
160
+ }
161
+ // Should only get the docs/guide.md, not files from skipped directories
162
+ expect(results).toHaveLength(1);
163
+ expect(results[0].path).toBe('docs/guide.md');
164
+ });
165
+ it('should handle GitHub API rate limit error', async () => {
166
+ fetchMock.mockResponseOnce('', { status: 403 });
167
+ const results = [];
168
+ for await (const result of crawler.crawl('https://github.com/owner/repo')) {
169
+ results.push(result);
170
+ }
171
+ // Should return empty due to rate limit
172
+ expect(results).toHaveLength(0);
173
+ });
174
+ it('should handle API errors gracefully', async () => {
175
+ fetchMock.mockResponseOnce('', { status: 404 });
176
+ const results = [];
177
+ for await (const result of crawler.crawl('https://github.com/owner/repo')) {
178
+ results.push(result);
179
+ }
180
+ expect(results).toHaveLength(0);
181
+ });
182
+ it('should use GitHub token in headers when provided', async () => {
183
+ const tokenCrawler = new GitHubCrawler(4, 1000, 'test_token_123');
184
+ // Mock rateLimit for new crawler instance (the beforeEach spy only covers `crawler`)
185
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
186
+ vi.spyOn(tokenCrawler, 'rateLimit').mockResolvedValue(undefined);
187
+ fetchMock.mockResponseOnce(JSON.stringify([])); // empty listing: only the outgoing request is inspected
188
+ const results = [];
189
+ for await (const result of tokenCrawler.crawl('https://github.com/owner/repo')) {
190
+ results.push(result);
191
+ }
192
+ expect(fetchMock).toHaveBeenCalledWith(expect.any(String), expect.objectContaining({ // at least one fetch carried the header below
193
+ headers: expect.objectContaining({
194
+ Authorization: 'token test_token_123',
195
+ }),
196
+ }));
197
+ });
198
+ it('should extract title from file path', async () => {
199
+ const rootFiles = [
200
+ {
201
+ path: 'getting-started.md',
202
+ type: 'file',
203
+ name: 'getting-started.md',
204
+ url: 'https://api.github.com/repos/owner/repo/contents/getting-started.md',
205
+ },
206
+ {
207
+ path: 'api_reference.md',
208
+ type: 'file',
209
+ name: 'api_reference.md',
210
+ url: 'https://api.github.com/repos/owner/repo/contents/api_reference.md',
211
+ },
212
+ ];
213
+ // First call: findDocumentationDirs checks root
214
+ fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
215
+ // Second call: processDirectory fetches root again (no doc dirs found)
216
+ fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
217
+ // File content fetches: both bodies are the same '# Content', so the titles asserted below cannot come from the body
218
+ fetchMock.mockResponseOnce('# Content');
219
+ fetchMock.mockResponseOnce('# Content');
220
+ const results = [];
221
+ for await (const result of crawler.crawl('https://github.com/owner/repo')) {
222
+ results.push(result);
223
+ }
224
+ expect(results[0].title).toBe('Getting Started'); // 'getting-started.md': hyphen becomes a space, words capitalised
225
+ expect(results[1].title).toBe('Api Reference'); // 'api_reference.md': underscore handled the same way
226
+ });
227
+ it('should construct correct GitHub blob URLs', async () => {
228
+ // First call: findDocumentationDirs checks root - find docs directory
229
+ fetchMock.mockResponseOnce(JSON.stringify([{ path: 'docs', type: 'dir', name: 'docs', url: 'https://api.github.com/repos/owner/repo/contents/docs' }]));
230
+ // Second call: processDirectory fetches docs directory contents
231
+ fetchMock.mockResponseOnce(JSON.stringify([
232
+ {
233
+ path: 'docs/guide.md',
234
+ type: 'file',
235
+ name: 'guide.md',
236
+ url: 'https://api.github.com/repos/owner/repo/contents/docs/guide.md',
237
+ },
238
+ ]));
239
+ // Third call: fetch file content
240
+ fetchMock.mockResponseOnce('# Guide');
241
+ const results = [];
242
+ for await (const result of crawler.crawl('https://github.com/owner/repo')) {
243
+ results.push(result);
244
+ }
245
+ expect(results[0].url).toBe('https://github.com/owner/repo/blob/main/docs/guide.md');
246
+ }); // NOTE(review): the expected URL uses branch 'main' although no branch lookup is mocked here - confirm against the crawler
247
+ it('should handle fetch errors for file content', async () => {
248
+ fetchMock.mockResponseOnce(JSON.stringify([
249
+ { path: 'guide.md', type: 'file', name: 'guide.md', url: 'https://api.github.com/repos/owner/repo/contents/guide.md' },
250
+ ]));
251
+ fetchMock.mockRejectOnce(new Error('Network error'));
252
+ const results = [];
253
+ for await (const result of crawler.crawl('https://github.com/owner/repo')) {
254
+ results.push(result);
255
+ }
256
+ // Should skip files that fail to fetch
257
+ expect(results).toHaveLength(0);
258
+ });
259
+ it('should validate GitHub API response structure', async () => {
260
+ // Mock invalid response structure
261
+ fetchMock.mockResponseOnce(JSON.stringify({ invalid: 'structure' }));
262
+ const results = [];
263
+ for await (const result of crawler.crawl('https://github.com/owner/repo')) {
264
+ results.push(result);
265
+ }
266
+ expect(results).toHaveLength(0);
267
+ });
268
+ it('should find multiple documentation directories', async () => {
269
+ fetchMock.mockResponseOnce(JSON.stringify([ // queued 1st: root listing with two documentation-like dirs
270
+ { path: 'docs', type: 'dir', name: 'docs', url: 'https://api.github.com/repos/owner/repo/contents/docs' },
271
+ { path: 'guide', type: 'dir', name: 'guide', url: 'https://api.github.com/repos/owner/repo/contents/guide' },
272
+ ]));
273
+ fetchMock.mockResponseOnce(JSON.stringify([ // queued 2nd: listing of docs/
274
+ { path: 'docs/api.md', type: 'file', name: 'api.md', url: 'https://api.github.com/repos/owner/repo/contents/docs/api.md' },
275
+ ]));
276
+ fetchMock.mockResponseOnce('# API'); // queued 3rd: body intended as docs/api.md content
277
+ fetchMock.mockResponseOnce(JSON.stringify([ // queued 4th: listing of guide/
278
+ {
279
+ path: 'guide/intro.md',
280
+ type: 'file',
281
+ name: 'intro.md',
282
+ url: 'https://api.github.com/repos/owner/repo/contents/guide/intro.md',
283
+ },
284
+ ]));
285
+ fetchMock.mockResponseOnce('# Intro'); // queued 5th: body intended as guide/intro.md content
286
+ const results = [];
287
+ for await (const result of crawler.crawl('https://github.com/owner/repo')) {
288
+ results.push(result);
289
+ }
290
+ expect(results).toHaveLength(2); // one file from each of the two directories
291
+ });
292
+ it('should stop crawling when aborted', async () => {
293
+ const rootFiles = [
294
+ { path: 'doc1.md', type: 'file', name: 'doc1.md', url: 'https://api.github.com/repos/owner/repo/contents/doc1.md' },
295
+ { path: 'doc2.md', type: 'file', name: 'doc2.md', url: 'https://api.github.com/repos/owner/repo/contents/doc2.md' },
296
+ ];
297
+ // First call: findDocumentationDirs checks root
298
+ fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
299
+ // Second call: processDirectory fetches root again (no doc dirs found)
300
+ fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
301
+ // Third call: fetch doc1.md content (nothing is queued for doc2.md)
302
+ fetchMock.mockResponseOnce('# Doc 1');
303
+ const results = [];
304
+ const abortableCrawler = new GitHubCrawler();
305
+ // Mock rateLimit for new crawler instance (the beforeEach spy only covers `crawler`)
306
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
307
+ vi.spyOn(abortableCrawler, 'rateLimit').mockResolvedValue(undefined);
308
+ // Get the generator so it can be stepped manually with .next() before the abort
309
+ const generator = abortableCrawler.crawl('https://github.com/owner/repo');
310
+ // Get first result
311
+ const first = await generator.next();
312
+ if (!first.done) {
313
+ results.push(first.value);
314
+ }
315
+ // Abort before getting second result, by setting the isAborting flag directly on the instance
316
+ abortableCrawler.isAborting = true;
317
+ // Try to get more results by draining the same generator
318
+ for await (const result of generator) {
319
+ results.push(result);
320
+ }
321
+ // Should only have the first result
322
+ expect(results).toHaveLength(1);
323
+ });
324
+ });
325
+ });
326
+ //# sourceMappingURL=github.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"github.test.js","sourceRoot":"","sources":["../../src/crawler/github.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAG5C,QAAQ,CAAC,eAAe,EAAE,GAAG,EAAE;IAC7B,IAAI,OAAsB,CAAC;IAE3B,UAAU,CAAC,GAAG,EAAE;QACd,SAAS,CAAC,UAAU,EAAE,CAAC;QACvB,OAAO,GAAG,IAAI,aAAa,EAAE,CAAC;QAC9B,+DAA+D;QAC/D,8DAA8D;QAC9D,EAAE,CAAC,KAAK,CAAC,OAAc,EAAE,WAAW,CAAC,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC;IACrE,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;QAC3B,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;YAC/C,MAAM,cAAc,GAAG,IAAI,aAAa,EAAE,CAAC;YAC3C,MAAM,CAAC,cAAc,CAAC,CAAC,WAAW,EAAE,CAAC;QACvC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uDAAuD,EAAE,GAAG,EAAE;YAC/D,MAAM,aAAa,GAAG,IAAI,aAAa,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;YACjD,MAAM,CAAC,aAAa,CAAC,CAAC,WAAW,EAAE,CAAC;QACtC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4BAA4B,EAAE,GAAG,EAAE;YACpC,MAAM,YAAY,GAAG,IAAI,aAAa,CAAC,CAAC,EAAE,IAAI,EAAE,kBAAkB,CAAC,CAAC;YACpE,MAAM,CAAC,YAAY,CAAC,CAAC,WAAW,EAAE,CAAC;QACrC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;YACzC,MAAM,UAAU,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3B,MAAM,eAAe,GAAG,IAAI,aAAa,CAAC,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,UAAU,CAAC,CAAC;YAC1E,MAAM,CAAC,eAAe,CAAC,CAAC,WAAW,EAAE,CAAC;QACxC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,OAAO,EAAE,GAAG,EAAE;QACrB,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;YACjD,MAAM,OAAO,GAAkB,EAAE,CAAC;YAElC,iBAAiB;YACjB,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,EAAE,CAAC;gBAC3E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;YACrD,MAAM,OAAO,GAAkB,EAAE,CAAC;YAElC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,oBAAoB,CAAC,EAAE,CAAC;gBAC/D,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iDAAiD,EAAE,KAAK,IAAI,EAAE;YAC/D,kCAAkC;YAClC,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,
GAAG,EAAE,uDAAuD,EAAE;gBACzG,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,sDAAsD,EAAE;aACvG,CAAC,CACH,CAAC;YACF,mCAAmC;YACnC,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb;oBACE,IAAI,EAAE,eAAe;oBACrB,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,UAAU;oBAChB,GAAG,EAAE,gEAAgE;iBACtE;gBACD,EAAE,IAAI,EAAE,aAAa,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,GAAG,EAAE,8DAA8D,EAAE;aAC3H,CAAC,CACH,CAAC;YACF,qCAAqC;YACrC,SAAS,CAAC,gBAAgB,CAAC,uCAAuC,CAAC,CAAC;YACpE,oCAAoC;YACpC,SAAS,CAAC,gBAAgB,CAAC,uCAAuC,CAAC,CAAC;YAEpE,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;YAC9C,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;YAC9C,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC9C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,MAAM,SAAS,GAAG;gBAChB,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,WAAW,EAAE,GAAG,EAAE,4DAA4D,EAAE;aAC1H,CAAC;YACF,gDAAgD;YAChD,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uEAAuE;YACvE,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,sCAAsC;YACtC,SAAS,CAAC,gBAAgB,CAAC,6BAA6B,CAAC,CAAC;YAE1D,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,mCAAmC,CAAC,EAAE,CAAC;gBAC9E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC1C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gCAAgC,EAAE,KAAK,IAAI,EAAE;YAC9C,MAAM,SAAS,GAAG;gBAChB,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,2DAA2D,EAAE;gBACtH,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,WAAW,EAAE,GAAG,EAAE,4DAA4D,EAAE;gBACzH,EAAE,IAAI,EAAE,WAAW,EAAE,
IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,WAAW,EAAE,GAAG,EAAE,4DAA4D,EAAE;aAC1H,CAAC;YACF,gDAAgD;YAChD,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uEAAuE;YACvE,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,sCAAsC;YACtC,SAAS,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC;YAEvC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;YACzD,MAAM,SAAS,GAAG;gBAChB,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,GAAG,EAAE,yDAAyD,EAAE;gBAChH,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,2DAA2D,EAAE;gBACtH;oBACE,IAAI,EAAE,gBAAgB;oBACtB,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,gBAAgB;oBACtB,GAAG,EAAE,iEAAiE;iBACvE;aACF,CAAC;YACF,gDAAgD;YAChD,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uEAAuE;YACvE,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uBAAuB;YACvB,SAAS,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;YACpC,SAAS,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;YACrC,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAEtC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,+DAA+D,EAAE,KAAK,IAAI,EAAE;YAC7E,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb;oBACE,IAAI,EAAE,cAAc;oBACpB,IAAI,EAAE,KAAK;oBACX,IAAI,EAAE,cAAc;oBACpB,GAAG,EAAE,+DAA+D;iBACrE;gBACD,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAE,GAAG,EAAE,yDAAyD,EAAE;gBAC/G,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,uDAAuD,EAAE;gBACzG,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,uDAAuD,EAAE;aAC1G,CA
AC,CACH,CAAC;YACF,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb;oBACE,IAAI,EAAE,eAAe;oBACrB,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,UAAU;oBAChB,GAAG,EAAE,gEAAgE;iBACtE;aACF,CAAC,CACH,CAAC;YACF,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAEtC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,wEAAwE;YACxE,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAChD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;YACzD,SAAS,CAAC,gBAAgB,CAAC,EAAE,EAAE,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC;YAEhD,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,wCAAwC;YACxC,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,qCAAqC,EAAE,KAAK,IAAI,EAAE;YACnD,SAAS,CAAC,gBAAgB,CAAC,EAAE,EAAE,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC;YAEhD,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;YAChE,MAAM,YAAY,GAAG,IAAI,aAAa,CAAC,CAAC,EAAE,IAAI,EAAE,gBAAgB,CAAC,CAAC;YAClE,0CAA0C;YAC1C,8DAA8D;YAC9D,EAAE,CAAC,KAAK,CAAC,YAAmB,EAAE,WAAW,CAAC,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC;YAExE,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC;YAE/C,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,YAAY,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC/E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,SAAS,CAAC,CAAC,oBAAoB,CACpC,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC,EAClB,MAAM,CAAC,gBAAgB,CAAC;gBACtB,OAAO,EAAE,MAAM,CAAC,gBAAgB,CAAC;oBAC/B,aAAa,EAAE,sBAAsB;iBACtC,CAAC;aACH,CAAC,CACH,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,qCAAqC,EAAE,KAAK,IAAI,EAAE;YACnD,MAAM,SA
AS,GAAG;gBAChB;oBACE,IAAI,EAAE,oBAAoB;oBAC1B,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,oBAAoB;oBAC1B,GAAG,EAAE,qEAAqE;iBAC3E;gBACD;oBACE,IAAI,EAAE,kBAAkB;oBACxB,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,kBAAkB;oBACxB,GAAG,EAAE,mEAAmE;iBACzE;aACF,CAAC;YACF,gDAAgD;YAChD,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uEAAuE;YACvE,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uBAAuB;YACvB,SAAS,CAAC,gBAAgB,CAAC,WAAW,CAAC,CAAC;YACxC,SAAS,CAAC,gBAAgB,CAAC,WAAW,CAAC,CAAC;YAExC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;YACjD,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;YACzD,sEAAsE;YACtE,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,uDAAuD,EAAE,CAAC,CAAC,CAC5H,CAAC;YACF,gEAAgE;YAChE,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb;oBACE,IAAI,EAAE,eAAe;oBACrB,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,UAAU;oBAChB,GAAG,EAAE,gEAAgE;iBACtE;aACF,CAAC,CACH,CAAC;YACF,iCAAiC;YACjC,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAEtC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,uDAAuD,CAAC,CAAC;QACvF,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,2DAA2D,EAAE;aACvH,CAAC,CACH,CAAC;YACF,SAAS,CAAC,cAAc,CAAC,IAAI,KAAK,CAAC,eAAe,CAAC,CAAC,CAAC;YAErD,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,uCAAuC;YACv
C,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;YAC7D,kCAAkC;YAClC,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,OAAO,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC;YAErE,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;YAC9D,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,uDAAuD,EAAE;gBACzG,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,OAAO,EAAE,GAAG,EAAE,wDAAwD,EAAE;aAC7G,CAAC,CACH,CAAC;YACF,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb,EAAE,IAAI,EAAE,aAAa,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,GAAG,EAAE,8DAA8D,EAAE;aAC3H,CAAC,CACH,CAAC;YACF,SAAS,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;YACpC,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb;oBACE,IAAI,EAAE,gBAAgB;oBACtB,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,UAAU;oBAChB,GAAG,EAAE,iEAAiE;iBACvE;aACF,CAAC,CACH,CAAC;YACF,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAEtC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;YACjD,MAAM,SAAS,GAAG;gBAChB,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,GAAG,EAAE,0DAA0D,EAAE;gBACnH,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,GAAG,EAAE,0DAA0D,EAAE;aACpH,CAAC;YACF,gDAAgD;YAChD,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uEAAuE;YACvE,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,oCAAoC;YACpC,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAEtC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,MAAM,gBAAgB,GAAG,IAAI,aAAa,EAAE,CAAC;YAC7C,0CAA0C;YAC1C,8DAA8D;YAC9D,EAA
E,CAAC,KAAK,CAAC,gBAAuB,EAAE,WAAW,CAAC,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC;YAE5E,oBAAoB;YACpB,MAAM,SAAS,GAAG,gBAAgB,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;YAE1E,mBAAmB;YACnB,MAAM,KAAK,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;YACrC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;gBAChB,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YAC5B,CAAC;YAED,qCAAqC;YACpC,gBAAuD,CAAC,UAAU,GAAG,IAAI,CAAC;YAE3E,0BAA0B;YAC1B,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;gBACrC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,oCAAoC;YACpC,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1,16 @@
1
+ import { CrawlResult } from '../types.js';
2
+ import { BaseCrawler } from './base.js';
3
+ export declare class PuppeteerCrawler extends BaseCrawler { // Crawler that loads pages in a headless Puppeteer browser and extracts their readable text.
4
+ private browser?; // Browser returned by puppeteer.launch() in crawl(); closed when crawl() finishes and by abort().
5
+ private readonly userAgent; // User-agent string applied to the page in setupPage().
6
+ private readonly LINK_GROUP_SIZE; // Chunk size used by groupLinks() (2 in the implementation).
7
+ private curCrawlCount; // Incremented for every page yielded; compared against the inherited maxRequestsPerCrawl.
8
+ crawl(url: string): AsyncGenerator<CrawlResult, void, unknown>; // Launches the browser, yields one CrawlResult per crawled page starting at `url`, and closes the browser in a finally block.
9
+ private setupPage; // Sets user agent and viewport, aborts image/media/font requests, and logs page errors and console error/warn messages.
10
+ private crawlSitePages; // Recursive async generator: visits a URL, yields its result, then recurses into its links with depth + 1.
11
+ private gotoPageAndHandleRedirects; // Navigates with waitUntil 'networkidle2', then waits briefly for a follow-up navigation or further responses.
12
+ private processPage; // Waits up to 5s for main content, extracts text and title with Readability, and collects the page's links.
13
+ private getLinksFromPage; // Returns de-duplicated, hash-stripped links on the same hostname whose path starts with the current page's path.
14
+ private groupLinks; // Splits a list of links into consecutive arrays of LINK_GROUP_SIZE entries.
15
+ abort(): void; // Calls super.abort() and starts closing the browser without awaiting it.
16
+ }
@@ -0,0 +1,191 @@
1
+ import puppeteer from 'puppeteer';
2
+ import { BaseCrawler } from './base.js';
3
+ import { JSDOM } from 'jsdom';
4
+ import { Readability } from '@mozilla/readability';
5
/**
 * Crawler that renders pages in a headless Puppeteer browser and extracts
 * their readable text with Mozilla Readability.
 *
 * Pages are visited depth-first starting from the URL given to `crawl()`.
 * Crawl limits and URL bookkeeping (`maxDepth`, `maxRequestsPerCrawl`,
 * `shouldCrawl`, `rateLimit`, `markUrlAsSeen`, `getPathFromUrl`) are inherited
 * from `BaseCrawler`, which is not defined in this file.
 */
export class PuppeteerCrawler extends BaseCrawler {
    /** Browser launched by `crawl()`; cleared again once that crawl has finished. */
    browser;
    /** User-agent string applied to every page in `setupPage()`. */
    userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
    /** Number of links per group produced by `groupLinks()`. */
    LINK_GROUP_SIZE = 2;
    /** Number of pages yielded by the current crawl. */
    curCrawlCount = 0;
    /**
     * Crawls the site starting at `url`.
     *
     * @param url Absolute URL of the first page to visit.
     * @returns Async generator yielding one result (`url`, `path`, `content`,
     *   `title`) per successfully processed page.
     */
    async *crawl(url) {
        // Start every crawl with a fresh counter; otherwise a reused instance
        // would begin already at (or near) maxRequestsPerCrawl.
        this.curCrawlCount = 0;
        try {
            this.browser = await puppeteer.launch({
                headless: true,
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                    '--window-size=1280,800'
                ]
            });
            const page = await this.browser.newPage();
            await this.setupPage(page);
            const visitedUrls = new Set();
            yield* this.crawlSitePages(page, new URL(url), 0, visitedUrls);
        }
        finally {
            // Detach the reference first so a later abort() does not try to
            // close a browser that is already shut down.
            const browser = this.browser;
            this.browser = undefined;
            await browser?.close();
        }
    }
    /**
     * Configures a freshly opened page: user agent, viewport, request
     * interception and diagnostic logging.
     *
     * @param page Puppeteer page to configure.
     */
    async setupPage(page) {
        await page.setUserAgent(this.userAgent);
        await page.setViewport({ width: 1280, height: 800 });
        // Block only unnecessary resources
        const blockedResourceTypes = ['image', 'media', 'font'];
        await page.setRequestInterception(true);
        page.on('request', request => {
            const resolution = blockedResourceTypes.includes(request.resourceType())
                ? request.abort()
                : request.continue();
            // abort()/continue() return promises; they reject when the request
            // was already handled or the page is closing. Left unhandled, that
            // rejection can take the whole process down.
            resolution.catch(error => {
                console.warn('Request interception failed:', error);
            });
        });
        // Handle JavaScript errors
        page.on('pageerror', error => {
            console.warn('Page error:', error);
        });
        // Handle console messages
        page.on('console', (msg) => {
            const type = msg.type();
            if (type === 'error' || type === 'warn') {
                console.debug(`Console ${type}:`, msg.text());
            }
        });
    }
    /**
     * Visits `curUrl`, yields its extracted content and recurses into the
     * links found on it.
     *
     * Errors while loading or parsing a page are logged and end the traversal
     * of that page only; they are not rethrown.
     *
     * @param page Shared Puppeteer page used for every navigation.
     * @param curUrl URL to visit.
     * @param depth Distance of `curUrl` from the start URL (start URL is 0).
     * @param visitedUrls URLs already yielded during this crawl.
     */
    async *crawlSitePages(page, curUrl, depth, visitedUrls) {
        const urlStr = curUrl.toString();
        if (visitedUrls.has(urlStr) || !this.shouldCrawl(urlStr) || depth > this.maxDepth) {
            return;
        }
        try {
            // Rate limiting
            await this.rateLimit();
            // Navigate to page with proper redirect handling
            await this.gotoPageAndHandleRedirects(page, urlStr);
            // Extract content
            const { content, title, links } = await this.processPage(page, curUrl);
            visitedUrls.add(urlStr);
            this.markUrlAsSeen(urlStr);
            this.curCrawlCount++;
            yield {
                url: urlStr,
                path: this.getPathFromUrl(urlStr),
                content,
                title
            };
            // Process links in batches
            if (depth < this.maxDepth && this.curCrawlCount < this.maxRequestsPerCrawl) {
                for (const linkGroup of this.groupLinks(links)) {
                    for (const link of linkGroup) {
                        // The counter also advances inside the recursion, so
                        // re-check the cap before every child page.
                        if (this.curCrawlCount >= this.maxRequestsPerCrawl) {
                            return;
                        }
                        yield* this.crawlSitePages(page, new URL(link), depth + 1, visitedUrls);
                    }
                }
            }
        }
        catch (error) {
            console.error(`Error crawling ${urlStr}:`, error);
        }
    }
    /**
     * Navigates to `url` and then gives client-side redirects a chance to
     * happen before the page is processed.
     *
     * After the initial navigation settles, responses are observed for 500 ms.
     * If none arrive the method returns; otherwise it waits up to a further
     * `MAX_PAGE_WAIT_MS`, or until a new navigation completes, whichever comes
     * first.
     *
     * @param page Puppeteer page to navigate.
     * @param url Absolute URL to load.
     */
    async gotoPageAndHandleRedirects(page, url) {
        const MAX_PAGE_WAIT_MS = 5000;
        const RESPONSE_GRACE_MS = 500;
        // NOTE(review): timeout 0 disables Puppeteer's navigation timeout, so a
        // page that never reaches network idle stalls the crawl - confirm intended.
        await page.goto(url, {
            timeout: 0,
            waitUntil: 'networkidle2'
        });
        let responseEventOccurred = false;
        const responseHandler = () => {
            responseEventOccurred = true;
        };
        const pendingTimers = [];
        const responseWatcher = new Promise((resolve) => {
            pendingTimers.push(setTimeout(() => {
                if (!responseEventOccurred) {
                    resolve();
                }
                else {
                    pendingTimers.push(setTimeout(() => resolve(), MAX_PAGE_WAIT_MS));
                }
            }, RESPONSE_GRACE_MS));
        });
        page.on('response', responseHandler);
        try {
            await Promise.race([responseWatcher, page.waitForNavigation()]);
        }
        finally {
            // Always detach the listener and cancel outstanding timers, including
            // when waitForNavigation() rejects (e.g. the page was closed).
            page.off('response', responseHandler);
            for (const timer of pendingTimers) {
                clearTimeout(timer);
            }
        }
    }
    /**
     * Extracts readable content and outgoing links from the currently loaded page.
     *
     * @param page Puppeteer page that has already navigated to `url`.
     * @param url URL of the loaded page, used as the document base for parsing.
     * @returns Object with the article `content` (plain text), `title` and the
     *   crawlable `links` found on the page.
     * @throws Error if Readability cannot extract an article from the page.
     */
    async processPage(page, url) {
        // Wait for dynamic content
        try {
            await page.waitForFunction(() => {
                const mainContent = document.querySelector('main') || document.querySelector('.content') || document.querySelector('#content');
                return mainContent && mainContent.children.length > 0;
            }, { timeout: 5000 });
        }
        catch {
            console.warn('Timeout waiting for main content, proceeding anyway');
        }
        // Extract content using Readability
        const html = await page.content();
        const dom = new JSDOM(html, { url: url.toString() });
        let article;
        try {
            const reader = new Readability(dom.window.document, {
                charThreshold: 20,
                nbTopCandidates: 5,
                maxElemsToParse: 10000
            });
            article = reader.parse();
        }
        finally {
            // Release the jsdom window once parsing is done; the parsed article
            // holds plain strings and does not need the DOM any more.
            dom.window.close();
        }
        if (!article) {
            throw new Error('Failed to parse page content');
        }
        // Extract links
        const links = await this.getLinksFromPage(page, url);
        return {
            content: article.textContent,
            title: article.title,
            links
        };
    }
    /**
     * Collects the links on the current page that are candidates for crawling.
     *
     * A link is kept when it is a valid URL on the same hostname as `curUrl`,
     * its path starts with `curUrl`'s path, and it is not the page itself.
     * Fragments are removed before comparison and the result has no duplicates.
     *
     * @param page Puppeteer page to read anchors from.
     * @param curUrl URL of the page being processed.
     * @returns Unique absolute URLs in document order.
     */
    async getLinksFromPage(page, curUrl) {
        const hrefs = await page.$$eval('a', (anchors) => anchors.map((a) => a.href));
        // Compare against the fragment-less form of the current URL so that a
        // start URL containing "#section" still excludes links back to itself.
        const currentPage = new URL(curUrl.href);
        currentPage.hash = '';
        // NOTE(review): the prefix check uses the current page's path rather than
        // the crawl's start path, so the scope narrows with depth - confirm intended.
        const uniqueLinks = new Set();
        for (const href of hrefs) {
            let candidate;
            try {
                candidate = new URL(href);
            }
            catch {
                continue; // Skip anchors whose href is empty or not a valid URL
            }
            candidate.hash = ''; // Remove hash
            if (candidate.hostname === curUrl.hostname &&
                candidate.pathname.startsWith(curUrl.pathname) &&
                candidate.href !== currentPage.href) {
                uniqueLinks.add(candidate.href);
            }
        }
        return Array.from(uniqueLinks);
    }
    /**
     * Splits `links` into consecutive groups of at most `LINK_GROUP_SIZE` items.
     *
     * @param links Links to partition.
     * @returns Array of groups, preserving the original order; empty for no links.
     */
    groupLinks(links) {
        const groups = [];
        for (let start = 0; start < links.length; start += this.LINK_GROUP_SIZE) {
            groups.push(links.slice(start, start + this.LINK_GROUP_SIZE));
        }
        return groups;
    }
    /**
     * Aborts the crawl: delegates to `BaseCrawler.abort()` and closes the
     * browser in the background so that in-flight page operations fail fast.
     */
    abort() {
        super.abort();
        // Fire-and-forget, but never leave the rejection unhandled.
        this.browser?.close().catch(error => {
            console.warn('Error closing browser during abort:', error);
        });
    }
}
191
+ //# sourceMappingURL=puppeteer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"puppeteer.js","sourceRoot":"","sources":["../../src/crawler/puppeteer.ts"],"names":[],"mappings":"AAAA,OAAO,SAA4C,MAAM,WAAW,CAAC;AAErE,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAC9B,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AAEnD,MAAM,OAAO,gBAAiB,SAAQ,WAAW;IACvC,OAAO,CAAW;IACT,SAAS,GAAG,qHAAqH,CAAC;IAClI,eAAe,GAAG,CAAC,CAAC;IAC7B,aAAa,GAAG,CAAC,CAAC;IAE1B,KAAK,CAAC,CAAC,KAAK,CAAC,GAAW;QACtB,IAAI,CAAC;YACH,IAAI,CAAC,OAAO,GAAG,MAAM,SAAS,CAAC,MAAM,CAAC;gBACpC,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE;oBACJ,cAAc;oBACd,0BAA0B;oBAC1B,yBAAyB;oBACzB,eAAe;oBACf,wBAAwB;iBACzB;aACF,CAAC,CAAC;YAEH,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YAC1C,MAAM,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YAE3B,MAAM,WAAW,GAAG,IAAI,GAAG,EAAU,CAAC;YACtC,KAAK,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,WAAW,CAAC,CAAC;QACjE,CAAC;gBAAS,CAAC;YACT,MAAM,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,CAAC;QAC9B,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,SAAS,CAAC,IAAU;QAChC,MAAM,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACxC,MAAM,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC;QAErD,mCAAmC;QACnC,MAAM,IAAI,CAAC,sBAAsB,CAAC,IAAI,CAAC,CAAC;QACxC,IAAI,CAAC,EAAE,CAAC,SAAS,EAAE,OAAO,CAAC,EAAE;YAC3B,MAAM,YAAY,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;YAC5C,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;gBACtD,OAAO,CAAC,KAAK,EAAE,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,QAAQ,EAAE,CAAC;YACrB,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,2BAA2B;QAC3B,IAAI,CAAC,EAAE,CAAC,WAAW,EAAE,KAAK,CAAC,EAAE;YAC3B,OAAO,CAAC,IAAI,CAAC,aAAa,EAAE,KAAK,CAAC,CAAC;QACrC,CAAC,CAAC,CAAC;QAEH,0BAA0B;QAC1B,IAAI,CAAC,EAAE,CAAC,SAAS,EAAE,CAAC,GAAmB,EAAE,EAAE;YACzC,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;YACxB,IAAI,IAAI,KAAK,OAAO,IAAI,IAAI,KAAK,MAAM,EAAE,CAAC;gBACxC,OAAO,CAAC,KAAK,CAAC,WAAW,IAAI,GAAG,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;YAChD,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,KAAK,CAAC,CAAC,cAAc,CAC3B,IAAU,EACV,MAAW,EACX,KAAa,EACb,WAAwB;QAExB,MAAM,
MAAM,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAC;QAEjC,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,IAAI,KAAK,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClF,OAAO;QACT,CAAC;QAED,IAAI,CAAC;YACH,gBAAgB;YAChB,MAAM,IAAI,CAAC,SAAS,EAAE,CAAC;YAEvB,iDAAiD;YACjD,MAAM,IAAI,CAAC,0BAA0B,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAEpD,kBAAkB;YAClB,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAEvE,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;YACxB,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAC3B,IAAI,CAAC,aAAa,EAAE,CAAC;YAErB,MAAM;gBACJ,GAAG,EAAE,MAAM;gBACX,IAAI,EAAE,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC;gBACjC,OAAO;gBACP,KAAK;aACN,CAAC;YAEF,2BAA2B;YAC3B,IAAI,KAAK,GAAG,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC,mBAAmB,EAAE,CAAC;gBAC3E,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;gBAC1C,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;oBACnC,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;wBAC7B,IAAI,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,mBAAmB,EAAE,CAAC;4BACnD,OAAO;wBACT,CAAC;wBACD,KAAK,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,GAAG,CAAC,IAAI,CAAC,EAAE,KAAK,GAAG,CAAC,EAAE,WAAW,CAAC,CAAC;oBAC1E,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,kBAAkB,MAAM,GAAG,EAAE,KAAK,CAAC,CAAC;QACpD,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,0BAA0B,CAAC,IAAU,EAAE,GAAW;QAC9D,MAAM,gBAAgB,GAAG,IAAI,CAAC;QAE9B,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;YACnB,OAAO,EAAE,CAAC;YACV,SAAS,EAAE,cAAc;SAC1B,CAAC,CAAC;QAEH,IAAI,qBAAqB,GAAG,KAAK,CAAC;QAClC,MAAM,eAAe,GAAG,GAAG,EAAE,CAAC,qBAAqB,GAAG,IAAI,CAAC;QAE3D,MAAM,eAAe,GAAG,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;YACpD,UAAU,CAAC,GAAG,EAAE;gBACd,IAAI,CAAC,qBAAqB,EAAE,CAAC;oBAC3B,OAAO,EAAE,CAAC;gBACZ,CAAC;qBAAM,CAAC;oBACN,UAAU,CAAC,GAAG,EAAE,CAAC,OAAO,EAAE,EAAE,gBAAgB,CAAC,CAAC;gBAChD,CAAC;YACH,CAAC,EAAE,GAAG,CAAC,CAAC;QACV,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,EAAE,CAAC,UAAU,EAAE,eAAe,CAAC,CAAC;QACrC,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC,eAAe,EAAE,IAAI,CAAC,iBAAiB,EAAE,CAAC,CAAC,CAAC;QAChE,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,eAAe,CAAC,CAAC;IACxC,CAAC;IAEO,KAAK,CAAC,WAAW
,CAAC,IAAU,EAAE,GAAQ;QAC5C,2BAA2B;QAC3B,IAAI,CAAC;YACH,MAAM,IAAI,CAAC,eAAe,CAAC,GAAG,EAAE;gBAC9B,MAAM,WAAW,GAAG,QAAQ,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,UAAU,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,UAAU,CAAC,CAAC;gBAC/H,OAAO,WAAW,IAAI,WAAW,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC;YACxD,CAAC,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QACxB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,qDAAqD,CAAC,CAAC;QACtE,CAAC;QAED,oCAAoC;QACpC,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;QAClC,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QACrD,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE;YAClD,aAAa,EAAE,EAAE;YACjB,eAAe,EAAE,CAAC;YAClB,eAAe,EAAE,KAAK;SACvB,CAAC,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAE/B,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,8BAA8B,CAAC,CAAC;QAClD,CAAC;QAED,gBAAgB;QAChB,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QAErD,OAAO;YACL,OAAO,EAAE,OAAO,CAAC,WAAW;YAC5B,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,KAAK;SACN,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,gBAAgB,CAAC,IAAU,EAAE,MAAW;QACpD,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;QAE1E,MAAM,YAAY,GAAG,KAAK;aACvB,GAAG,CAAC,IAAI,CAAC,EAAE;YACV,IAAI,CAAC;gBACH,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;gBAC1B,GAAG,CAAC,IAAI,GAAG,EAAE,CAAC,CAAC,cAAc;gBAC7B,OAAO,GAAG,CAAC,IAAI,CAAC;YAClB,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC,CAAC;aACD,MAAM,CAAC,CAAC,IAAI,EAAkB,EAAE;YAC/B,IAAI,CAAC,IAAI;gBAAE,OAAO,KAAK,CAAC;YACxB,IAAI,CAAC;gBACH,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;gBAC1B,OAAO,CACL,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC,QAAQ,CAAC;oBACxC,GAAG,CAAC,QAAQ,KAAK,MAAM,CAAC,QAAQ;oBAChC,IAAI,KAAK,MAAM,CAAC,IAAI,CACrB,CAAC;YACJ,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC,CAAC,CAAC;QAEL,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC;IAC3C,CAAC;IAEO,UAAU,CAAC,KAAe;QAChC,OAAO,KAAK,CAA
C,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE;YACnC,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,IAAI,CAAC,eAAe,CAAC,CAAC;YACxD,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;gBACrB,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,CAAC;YACD,GAAG,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC3B,OAAO,GAAG,CAAC;QACb,CAAC,EAAE,EAAgB,CAAC,CAAC;IACvB,CAAC;IAED,KAAK;QACH,KAAK,CAAC,KAAK,EAAE,CAAC;QACd,KAAK,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,CAAC;IAC7B,CAAC;CACF"}
@@ -0,0 +1,43 @@
1
+ import { RequestQueue, Log, EnqueueLinksOptions } from 'crawlee';
2
+ import { CrawlResult } from '../types.js';
3
+ import { SiteDetectionRule } from './site-rules.js';
4
+ export declare class QueueManager { // Manages the crawl request queue, URL filtering counters and collected results; implementation is not shown here.
5
+ private requestQueue; // Presumably the crawlee RequestQueue exposed by getRequestQueue() - TODO confirm against the implementation.
6
+ private websiteId;
7
+ private results; // Presumably the CrawlResult items collected via addResult() - TODO confirm.
8
+ private static readonly BATCH_SIZE; // Presumably the threshold used by hasEnoughResults()/processBatch() - TODO confirm.
9
+ /** Optional path prefix to restrict crawling to URLs under this path */
10
+ private pathPrefix;
11
+ /** The allowed hostname - only URLs with this exact hostname (or its subdomains) are allowed */
12
+ private allowedHostname;
13
+ /** Count of URLs filtered due to path prefix mismatch */
14
+ private filteredByPathCount;
15
+ /** Count of URLs filtered due to hostname mismatch */
16
+ private filteredByHostnameCount;
17
+ initialize(url: string, pathPrefix?: string): Promise<void>; // Async setup for a crawl of `url`; `pathPrefix` is optional.
18
+ getFilteredByPathCount(): number; // Presumably returns filteredByPathCount - TODO confirm.
19
+ getFilteredByHostnameCount(): number; // Presumably returns filteredByHostnameCount - TODO confirm.
20
+ /**
21
+ * Check if a hostname matches the allowed hostname.
22
+ * Allows exact match or subdomains of the allowed hostname.
23
+ * Does NOT allow sibling subdomains or parent domains.
24
+ *
25
+ * @example
26
+ * If allowedHostname is 'docs.example.com':
27
+ * - 'docs.example.com' → true (exact match)
28
+ * - 'api.docs.example.com' → true (subdomain)
29
+ * - 'example.com' → false (parent domain)
30
+ * - 'python.example.com' → false (sibling subdomain)
31
+ */
32
+ private isHostnameAllowed;
33
+ handleQueueAndLinks(enqueueLinks: (options: EnqueueLinksOptions) => Promise<{
34
+ processedRequests: {
35
+ uniqueKey: string;
36
+ }[];
37
+ }>, log: Log, rule: SiteDetectionRule): Promise<void>; // Takes the crawler's enqueueLinks callback, a crawlee Log and the matched site rule; resolves with no value.
38
+ processBatch(): Promise<CrawlResult[]>; // Resolves with an array of CrawlResult; which results are included is not visible from this declaration.
39
+ addResult(result: CrawlResult): void; // Accepts one CrawlResult; presumably stored in `results` - TODO confirm.
40
+ hasEnoughResults(): boolean;
41
+ getRequestQueue(): RequestQueue | null; // May return null, so callers must handle the absence of a queue.
42
+ cleanup(): Promise<void>;
43
+ }