@cosmocoder/mcp-web-docs 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +368 -0
  3. package/build/__mocks__/embeddings.d.ts +17 -0
  4. package/build/__mocks__/embeddings.js +66 -0
  5. package/build/__mocks__/embeddings.js.map +1 -0
  6. package/build/config.d.ts +44 -0
  7. package/build/config.js +158 -0
  8. package/build/config.js.map +1 -0
  9. package/build/config.test.d.ts +1 -0
  10. package/build/config.test.js +165 -0
  11. package/build/config.test.js.map +1 -0
  12. package/build/crawler/auth.d.ts +128 -0
  13. package/build/crawler/auth.js +546 -0
  14. package/build/crawler/auth.js.map +1 -0
  15. package/build/crawler/auth.test.d.ts +1 -0
  16. package/build/crawler/auth.test.js +174 -0
  17. package/build/crawler/auth.test.js.map +1 -0
  18. package/build/crawler/base.d.ts +24 -0
  19. package/build/crawler/base.js +149 -0
  20. package/build/crawler/base.js.map +1 -0
  21. package/build/crawler/base.test.d.ts +1 -0
  22. package/build/crawler/base.test.js +234 -0
  23. package/build/crawler/base.test.js.map +1 -0
  24. package/build/crawler/browser-config.d.ts +2 -0
  25. package/build/crawler/browser-config.js +29 -0
  26. package/build/crawler/browser-config.js.map +1 -0
  27. package/build/crawler/browser-config.test.d.ts +1 -0
  28. package/build/crawler/browser-config.test.js +56 -0
  29. package/build/crawler/browser-config.test.js.map +1 -0
  30. package/build/crawler/cheerio.d.ts +11 -0
  31. package/build/crawler/cheerio.js +134 -0
  32. package/build/crawler/cheerio.js.map +1 -0
  33. package/build/crawler/chromium.d.ts +21 -0
  34. package/build/crawler/chromium.js +596 -0
  35. package/build/crawler/chromium.js.map +1 -0
  36. package/build/crawler/content-extractor-types.d.ts +25 -0
  37. package/build/crawler/content-extractor-types.js +2 -0
  38. package/build/crawler/content-extractor-types.js.map +1 -0
  39. package/build/crawler/content-extractors.d.ts +9 -0
  40. package/build/crawler/content-extractors.js +9 -0
  41. package/build/crawler/content-extractors.js.map +1 -0
  42. package/build/crawler/content-utils.d.ts +2 -0
  43. package/build/crawler/content-utils.js +22 -0
  44. package/build/crawler/content-utils.js.map +1 -0
  45. package/build/crawler/content-utils.test.d.ts +1 -0
  46. package/build/crawler/content-utils.test.js +99 -0
  47. package/build/crawler/content-utils.test.js.map +1 -0
  48. package/build/crawler/crawlee-crawler.d.ts +63 -0
  49. package/build/crawler/crawlee-crawler.js +342 -0
  50. package/build/crawler/crawlee-crawler.js.map +1 -0
  51. package/build/crawler/crawlee-crawler.test.d.ts +1 -0
  52. package/build/crawler/crawlee-crawler.test.js +280 -0
  53. package/build/crawler/crawlee-crawler.test.js.map +1 -0
  54. package/build/crawler/default-extractor.d.ts +4 -0
  55. package/build/crawler/default-extractor.js +26 -0
  56. package/build/crawler/default-extractor.js.map +1 -0
  57. package/build/crawler/default-extractor.test.d.ts +1 -0
  58. package/build/crawler/default-extractor.test.js +200 -0
  59. package/build/crawler/default-extractor.test.js.map +1 -0
  60. package/build/crawler/default.d.ts +11 -0
  61. package/build/crawler/default.js +138 -0
  62. package/build/crawler/default.js.map +1 -0
  63. package/build/crawler/docs-crawler.d.ts +26 -0
  64. package/build/crawler/docs-crawler.js +97 -0
  65. package/build/crawler/docs-crawler.js.map +1 -0
  66. package/build/crawler/docs-crawler.test.d.ts +1 -0
  67. package/build/crawler/docs-crawler.test.js +185 -0
  68. package/build/crawler/docs-crawler.test.js.map +1 -0
  69. package/build/crawler/factory.d.ts +6 -0
  70. package/build/crawler/factory.js +83 -0
  71. package/build/crawler/factory.js.map +1 -0
  72. package/build/crawler/github-pages-extractor.d.ts +4 -0
  73. package/build/crawler/github-pages-extractor.js +33 -0
  74. package/build/crawler/github-pages-extractor.js.map +1 -0
  75. package/build/crawler/github-pages-extractor.test.d.ts +1 -0
  76. package/build/crawler/github-pages-extractor.test.js +184 -0
  77. package/build/crawler/github-pages-extractor.test.js.map +1 -0
  78. package/build/crawler/github.d.ts +20 -0
  79. package/build/crawler/github.js +181 -0
  80. package/build/crawler/github.js.map +1 -0
  81. package/build/crawler/github.test.d.ts +1 -0
  82. package/build/crawler/github.test.js +326 -0
  83. package/build/crawler/github.test.js.map +1 -0
  84. package/build/crawler/puppeteer.d.ts +16 -0
  85. package/build/crawler/puppeteer.js +191 -0
  86. package/build/crawler/puppeteer.js.map +1 -0
  87. package/build/crawler/queue-manager.d.ts +43 -0
  88. package/build/crawler/queue-manager.js +169 -0
  89. package/build/crawler/queue-manager.js.map +1 -0
  90. package/build/crawler/queue-manager.test.d.ts +1 -0
  91. package/build/crawler/queue-manager.test.js +509 -0
  92. package/build/crawler/queue-manager.test.js.map +1 -0
  93. package/build/crawler/site-rules.d.ts +11 -0
  94. package/build/crawler/site-rules.js +104 -0
  95. package/build/crawler/site-rules.js.map +1 -0
  96. package/build/crawler/site-rules.test.d.ts +1 -0
  97. package/build/crawler/site-rules.test.js +139 -0
  98. package/build/crawler/site-rules.test.js.map +1 -0
  99. package/build/crawler/storybook-extractor.d.ts +34 -0
  100. package/build/crawler/storybook-extractor.js +767 -0
  101. package/build/crawler/storybook-extractor.js.map +1 -0
  102. package/build/crawler/storybook-extractor.test.d.ts +1 -0
  103. package/build/crawler/storybook-extractor.test.js +491 -0
  104. package/build/crawler/storybook-extractor.test.js.map +1 -0
  105. package/build/embeddings/fastembed.d.ts +25 -0
  106. package/build/embeddings/fastembed.js +188 -0
  107. package/build/embeddings/fastembed.js.map +1 -0
  108. package/build/embeddings/fastembed.test.d.ts +1 -0
  109. package/build/embeddings/fastembed.test.js +307 -0
  110. package/build/embeddings/fastembed.test.js.map +1 -0
  111. package/build/embeddings/openai.d.ts +8 -0
  112. package/build/embeddings/openai.js +56 -0
  113. package/build/embeddings/openai.js.map +1 -0
  114. package/build/embeddings/types.d.ts +4 -0
  115. package/build/embeddings/types.js +2 -0
  116. package/build/embeddings/types.js.map +1 -0
  117. package/build/index.d.ts +2 -0
  118. package/build/index.js +1007 -0
  119. package/build/index.js.map +1 -0
  120. package/build/index.test.d.ts +1 -0
  121. package/build/index.test.js +364 -0
  122. package/build/index.test.js.map +1 -0
  123. package/build/indexing/queue-manager.d.ts +36 -0
  124. package/build/indexing/queue-manager.js +86 -0
  125. package/build/indexing/queue-manager.js.map +1 -0
  126. package/build/indexing/queue-manager.test.d.ts +1 -0
  127. package/build/indexing/queue-manager.test.js +257 -0
  128. package/build/indexing/queue-manager.test.js.map +1 -0
  129. package/build/indexing/status.d.ts +39 -0
  130. package/build/indexing/status.js +207 -0
  131. package/build/indexing/status.js.map +1 -0
  132. package/build/indexing/status.test.d.ts +1 -0
  133. package/build/indexing/status.test.js +246 -0
  134. package/build/indexing/status.test.js.map +1 -0
  135. package/build/processor/content.d.ts +16 -0
  136. package/build/processor/content.js +286 -0
  137. package/build/processor/content.js.map +1 -0
  138. package/build/processor/content.test.d.ts +1 -0
  139. package/build/processor/content.test.js +369 -0
  140. package/build/processor/content.test.js.map +1 -0
  141. package/build/processor/markdown.d.ts +11 -0
  142. package/build/processor/markdown.js +256 -0
  143. package/build/processor/markdown.js.map +1 -0
  144. package/build/processor/markdown.test.d.ts +1 -0
  145. package/build/processor/markdown.test.js +312 -0
  146. package/build/processor/markdown.test.js.map +1 -0
  147. package/build/processor/metadata-parser.d.ts +37 -0
  148. package/build/processor/metadata-parser.js +245 -0
  149. package/build/processor/metadata-parser.js.map +1 -0
  150. package/build/processor/metadata-parser.test.d.ts +1 -0
  151. package/build/processor/metadata-parser.test.js +357 -0
  152. package/build/processor/metadata-parser.test.js.map +1 -0
  153. package/build/processor/processor.d.ts +8 -0
  154. package/build/processor/processor.js +190 -0
  155. package/build/processor/processor.js.map +1 -0
  156. package/build/processor/processor.test.d.ts +1 -0
  157. package/build/processor/processor.test.js +357 -0
  158. package/build/processor/processor.test.js.map +1 -0
  159. package/build/rag/cache.d.ts +10 -0
  160. package/build/rag/cache.js +10 -0
  161. package/build/rag/cache.js.map +1 -0
  162. package/build/rag/code-generator.d.ts +11 -0
  163. package/build/rag/code-generator.js +30 -0
  164. package/build/rag/code-generator.js.map +1 -0
  165. package/build/rag/context-assembler.d.ts +23 -0
  166. package/build/rag/context-assembler.js +113 -0
  167. package/build/rag/context-assembler.js.map +1 -0
  168. package/build/rag/docs-search.d.ts +55 -0
  169. package/build/rag/docs-search.js +380 -0
  170. package/build/rag/docs-search.js.map +1 -0
  171. package/build/rag/pipeline.d.ts +26 -0
  172. package/build/rag/pipeline.js +91 -0
  173. package/build/rag/pipeline.js.map +1 -0
  174. package/build/rag/query-processor.d.ts +14 -0
  175. package/build/rag/query-processor.js +57 -0
  176. package/build/rag/query-processor.js.map +1 -0
  177. package/build/rag/reranker.d.ts +55 -0
  178. package/build/rag/reranker.js +210 -0
  179. package/build/rag/reranker.js.map +1 -0
  180. package/build/rag/response-generator.d.ts +20 -0
  181. package/build/rag/response-generator.js +101 -0
  182. package/build/rag/response-generator.js.map +1 -0
  183. package/build/rag/retriever.d.ts +19 -0
  184. package/build/rag/retriever.js +111 -0
  185. package/build/rag/retriever.js.map +1 -0
  186. package/build/rag/validator.d.ts +22 -0
  187. package/build/rag/validator.js +128 -0
  188. package/build/rag/validator.js.map +1 -0
  189. package/build/rag/version-manager.d.ts +23 -0
  190. package/build/rag/version-manager.js +98 -0
  191. package/build/rag/version-manager.js.map +1 -0
  192. package/build/setupTests.d.ts +4 -0
  193. package/build/setupTests.js +50 -0
  194. package/build/setupTests.js.map +1 -0
  195. package/build/storage/storage.d.ts +38 -0
  196. package/build/storage/storage.js +700 -0
  197. package/build/storage/storage.js.map +1 -0
  198. package/build/storage/storage.test.d.ts +1 -0
  199. package/build/storage/storage.test.js +338 -0
  200. package/build/storage/storage.test.js.map +1 -0
  201. package/build/types/rag.d.ts +27 -0
  202. package/build/types/rag.js +2 -0
  203. package/build/types/rag.js.map +1 -0
  204. package/build/types.d.ts +120 -0
  205. package/build/types.js +2 -0
  206. package/build/types.js.map +1 -0
  207. package/build/util/content-utils.d.ts +31 -0
  208. package/build/util/content-utils.js +120 -0
  209. package/build/util/content-utils.js.map +1 -0
  210. package/build/util/content.d.ts +1 -0
  211. package/build/util/content.js +16 -0
  212. package/build/util/content.js.map +1 -0
  213. package/build/util/docs.d.ts +1 -0
  214. package/build/util/docs.js +26 -0
  215. package/build/util/docs.js.map +1 -0
  216. package/build/util/docs.test.d.ts +1 -0
  217. package/build/util/docs.test.js +49 -0
  218. package/build/util/docs.test.js.map +1 -0
  219. package/build/util/favicon.d.ts +6 -0
  220. package/build/util/favicon.js +88 -0
  221. package/build/util/favicon.js.map +1 -0
  222. package/build/util/favicon.test.d.ts +1 -0
  223. package/build/util/favicon.test.js +140 -0
  224. package/build/util/favicon.test.js.map +1 -0
  225. package/build/util/logger.d.ts +17 -0
  226. package/build/util/logger.js +72 -0
  227. package/build/util/logger.js.map +1 -0
  228. package/build/util/logger.test.d.ts +1 -0
  229. package/build/util/logger.test.js +46 -0
  230. package/build/util/logger.test.js.map +1 -0
  231. package/build/util/security.d.ts +312 -0
  232. package/build/util/security.js +719 -0
  233. package/build/util/security.js.map +1 -0
  234. package/build/util/security.test.d.ts +1 -0
  235. package/build/util/security.test.js +524 -0
  236. package/build/util/security.test.js.map +1 -0
  237. package/build/util/site-detector.d.ts +22 -0
  238. package/build/util/site-detector.js +42 -0
  239. package/build/util/site-detector.js.map +1 -0
  240. package/package.json +112 -0
@@ -0,0 +1,280 @@
1
+ const mockQueueManager = { // Shared stand-in returned by the mocked QueueManager constructor below; tests program and assert on these stubs.
2
+ initialize: vi.fn().mockResolvedValue(undefined),
3
+ getRequestQueue: vi.fn().mockReturnValue({}),
4
+ handleQueueAndLinks: vi.fn().mockResolvedValue(undefined),
5
+ addResult: vi.fn(),
6
+ hasEnoughResults: vi.fn().mockReturnValue(false), // default return value; re-applied in beforeEach
7
+ processBatch: vi.fn().mockResolvedValue([]), // default resolves to an empty batch; individual tests override it
8
+ cleanup: vi.fn().mockResolvedValue(undefined),
9
+ };
10
+ vi.mock('./queue-manager.js', () => ({ // Swap the real queue-manager module for a fake exporting QueueManager.
11
+ QueueManager: function () { // plain function rather than an arrow: arrows cannot be called with `new` (presumably how the crawler instantiates it — confirm in crawlee-crawler.js)
12
+ return mockQueueManager; // read lazily at call time, not when the factory runs; an object returned from a constructor call becomes the result of `new`
13
+ },
14
+ }));
15
+ vi.mock('./browser-config.js', () => ({ // getBrowserConfig always returns the same empty configuration object.
16
+ getBrowserConfig: vi.fn().mockReturnValue({
17
+ requestQueue: {},
18
+ preNavigationHooks: [],
19
+ launchContext: {},
20
+ browserPoolOptions: {},
21
+ }),
22
+ }));
23
+ vi.mock('./site-rules.js', () => ({ // A single rule: detect() resolves true and the extractor resolves fixed content.
24
+ siteRules: [
25
+ {
26
+ type: 'default',
27
+ extractor: {
28
+ extractContent: vi.fn().mockResolvedValue({ content: 'Extracted content', metadata: {} }),
29
+ },
30
+ detect: vi.fn().mockResolvedValue(true),
31
+ },
32
+ ],
33
+ }));
34
+ // Mock PlaywrightCrawler: run/teardown are module-level spies so tests can program run() and assert on teardown()
35
+ const mockCrawlerRun = vi.fn().mockResolvedValue(undefined);
36
+ const mockCrawlerTeardown = vi.fn().mockResolvedValue(undefined);
37
+ vi.mock('crawlee', () => ({
38
+ PlaywrightCrawler: function (options) { // plain function so it can serve as a constructor
39
+ // Store the request handler on the global object. NOTE(review): nothing in this test file reads it back or removes it.
40
+ global.__requestHandler = options.requestHandler;
41
+ return {
42
+ run: mockCrawlerRun, // spies are looked up when the constructor is called, i.e. after the consts above are initialised
43
+ teardown: mockCrawlerTeardown,
44
+ };
45
+ },
46
+ }));
47
+ // Import after mocking
48
+ import { CrawleeCrawler } from './crawlee-crawler.js';
/**
 * Unit tests for CrawleeCrawler.
 *
 * The queue manager, browser config, site rules and crawlee's PlaywrightCrawler
 * are all replaced by the module-level mocks defined above, so these tests only
 * observe how CrawleeCrawler drives those collaborators.
 */
describe('CrawleeCrawler', () => {
    let crawler;

    /**
     * Consumes an async iterable to completion.
     *
     * @param iterable - async iterable to exhaust, e.g. the generator returned by crawl()
     * @returns every value the iterable yielded, in order
     */
    const drain = async (iterable) => {
        const items = [];
        for await (const item of iterable) {
            items.push(item);
        }
        return items;
    };

    beforeEach(() => {
        vi.clearAllMocks();
        crawler = new CrawleeCrawler();
        // Re-apply the default return values for the stubs that tests override.
        mockQueueManager.hasEnoughResults.mockReturnValue(false);
        mockQueueManager.processBatch.mockResolvedValue([]);
    });

    describe('constructor', () => {
        it('should initialize with default values', () => {
            expect(crawler).toBeDefined();
        });

        it('should accept custom maxDepth and maxRequestsPerCrawl', () => {
            const customCrawler = new CrawleeCrawler(10, 500);
            expect(customCrawler).toBeDefined();
        });

        it('should accept progress callback', () => {
            const progressFn = vi.fn();
            const progressCrawler = new CrawleeCrawler(4, 1000, progressFn);
            expect(progressCrawler).toBeDefined();
        });
    });

    describe('setStorageState', () => {
        it('should accept storage state', () => {
            const state = {
                cookies: [{ name: 'session', value: 'abc123', domain: 'example.com', path: '/' }],
            };
            // Assert the actual contract (no exception) instead of a tautology.
            expect(() => crawler.setStorageState(state)).not.toThrow();
        });

        it('should accept storage state with origins', () => {
            const state = {
                cookies: [{ name: 'session', value: 'abc123', domain: 'example.com', path: '/' }],
                origins: [
                    {
                        origin: 'https://example.com',
                        localStorage: [{ name: 'token', value: 'xyz' }],
                    },
                ],
            };
            expect(() => crawler.setStorageState(state)).not.toThrow();
        });
    });

    describe('crawl', () => {
        it('should initialize queue manager with URL', async () => {
            // run() resolves immediately and the final batch is empty, so the crawl ends at once.
            mockCrawlerRun.mockResolvedValueOnce(undefined);
            mockQueueManager.processBatch.mockResolvedValueOnce([]);

            await drain(crawler.crawl('https://example.com/docs'));

            expect(mockQueueManager.initialize).toHaveBeenCalledWith('https://example.com/docs', undefined);
        });

        it('should yield results from queue manager', async () => {
            const mockResults = [
                { url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' },
                { url: 'https://example.com/page2', path: '/page2', content: 'Page 2', title: 'Page 2' },
            ];
            // hasEnoughResults returns false, so a single queued processBatch value
            // (the batch flushed when the crawl finishes) is all this test needs.
            mockQueueManager.processBatch.mockResolvedValueOnce(mockResults);

            const results = await drain(crawler.crawl('https://example.com'));

            expect(results).toEqual(mockResults);
        });

        it('should cleanup queue manager after crawl', async () => {
            await drain(crawler.crawl('https://example.com'));

            expect(mockQueueManager.cleanup).toHaveBeenCalled();
        });

        it('should process batch when enough results accumulated', async () => {
            const mockResults = [{ url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' }];
            mockQueueManager.hasEnoughResults.mockReturnValueOnce(true).mockReturnValue(false);
            mockQueueManager.processBatch.mockResolvedValueOnce(mockResults).mockResolvedValueOnce([]);

            const results = await drain(crawler.crawl('https://example.com'));

            expect(results).toHaveLength(1);
        });

        it('should set allowed hostname from URL', async () => {
            await drain(crawler.crawl('https://docs.example.com/guide'));

            // The hostname itself is not observable here; verify through the initialize call.
            expect(mockQueueManager.initialize).toHaveBeenCalledWith('https://docs.example.com/guide', undefined);
        });
    });

    describe('abort', () => {
        it('should stop the crawler', async () => {
            // Make run() hang until we resolve it, so abort() can be called while the crawler exists.
            let resolveRun;
            const runPromise = new Promise((resolve) => {
                resolveRun = resolve;
            });
            mockCrawlerRun.mockReturnValueOnce(runPromise);

            const abortableCrawler = new CrawleeCrawler();
            const generator = abortableCrawler.crawl('https://example.com');

            // Requesting the first value starts the generator body, which creates the crawler.
            const firstResultPromise = generator.next();

            // Give the generator time to reach the pending run() call.
            await new Promise((resolve) => setTimeout(resolve, 10));

            // Now abort - the crawler exists at this point.
            abortableCrawler.abort();

            // Let run() complete so the generator can finish, then exhaust it.
            resolveRun();
            await firstResultPromise;
            await drain(generator);

            expect(mockCrawlerTeardown).toHaveBeenCalled();
        });
    });

    describe('domain restriction', () => {
        it('should extract hostname from URL for domain restriction', async () => {
            await drain(crawler.crawl('https://subdomain.example.com/path'));

            expect(mockQueueManager.initialize).toHaveBeenCalledWith('https://subdomain.example.com/path', undefined);
        });

        it('should pass path prefix to queue manager when set', async () => {
            crawler.setPathPrefix('/docs/api');

            await drain(crawler.crawl('https://example.com/docs/api'));

            expect(mockQueueManager.initialize).toHaveBeenCalledWith('https://example.com/docs/api', '/docs/api');
        });
    });

    describe('authentication', () => {
        it('should configure crawler with storage state when set', async () => {
            const state = {
                cookies: [{ name: 'auth', value: 'token123', domain: 'example.com', path: '/' }],
            };
            crawler.setStorageState(state);

            await drain(crawler.crawl('https://example.com'));

            // Only confirms the crawl still starts with a storage state set;
            // how the state is applied is internal and not observed here.
            expect(mockQueueManager.initialize).toHaveBeenCalled();
        });
    });

    describe('isWithinAllowedDomain', () => {
        // The private method is not called directly; it is exercised indirectly through crawl().
        it('should handle URL parsing for domain check', async () => {
            await drain(crawler.crawl('https://example.com'));

            // No errors means domain parsing worked.
            expect(mockQueueManager.initialize).toHaveBeenCalled();
        });
    });

    describe('error handling', () => {
        it('should cleanup on error', async () => {
            mockCrawlerRun.mockRejectedValueOnce(new Error('Crawl failed'));

            await expect(drain(crawler.crawl('https://example.com'))).rejects.toThrow('Crawl failed');

            expect(mockQueueManager.cleanup).toHaveBeenCalled();
        });

        it('should handle invalid URLs gracefully', async () => {
            // NOTE(review): the URL passed here is valid, so this only covers the
            // normal path; an actual invalid-URL case is still untested.
            await drain(crawler.crawl('https://example.com'));

            expect(mockQueueManager.initialize).toHaveBeenCalled();
        });
    });
});
/**
 * Shape checks for storage-state objects: cookies with and without the
 * optional fields, and per-origin localStorage entries. The objects are plain
 * literals; nothing from the crawler is invoked here.
 */
describe('StorageState interface', () => {
    it('should allow cookies with all optional properties', () => {
        const fullCookie = {
            name: 'session',
            value: 'abc',
            domain: 'example.com',
            path: '/',
            expires: 1234567890,
            httpOnly: true,
            secure: true,
            sameSite: 'Strict',
        };
        const state = { cookies: [fullCookie] };

        expect(state.cookies).toHaveLength(1);
        const [firstCookie] = state.cookies;
        expect(firstCookie.sameSite).toBe('Strict');
    });

    it('should allow minimal cookie definition', () => {
        const minimalCookie = { name: 'token', value: 'xyz', domain: 'test.com', path: '/' };
        const state = { cookies: [minimalCookie] };

        // The optional field was never set, so it reads back as undefined.
        const [firstCookie] = state.cookies;
        expect(firstCookie.expires).toBeUndefined();
    });

    it('should allow origins for localStorage', () => {
        const storedEntries = [
            { name: 'key1', value: 'value1' },
            { name: 'key2', value: 'value2' },
        ];
        const state = {
            cookies: [],
            origins: [{ origin: 'https://example.com', localStorage: storedEntries }],
        };

        expect(state.origins).toHaveLength(1);
        const [firstOrigin] = state.origins;
        expect(firstOrigin.localStorage).toHaveLength(2);
    });
});
280
+ //# sourceMappingURL=crawlee-crawler.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"crawlee-crawler.test.js","sourceRoot":"","sources":["../../src/crawler/crawlee-crawler.test.ts"],"names":[],"mappings":"AAEA,MAAM,gBAAgB,GAAG;IACvB,UAAU,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC;IAChD,eAAe,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC,EAAE,CAAC;IAC5C,mBAAmB,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC;IACzD,SAAS,EAAE,EAAE,CAAC,EAAE,EAAE;IAClB,gBAAgB,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC,KAAK,CAAC;IAChD,YAAY,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,EAAE,CAAC;IAC3C,OAAO,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC;CAC9C,CAAC;AAEF,EAAE,CAAC,IAAI,CAAC,oBAAoB,EAAE,GAAG,EAAE,CAAC,CAAC;IACnC,YAAY,EAAE;QACZ,OAAO,gBAAgB,CAAC;IAC1B,CAAC;CACF,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,qBAAqB,EAAE,GAAG,EAAE,CAAC,CAAC;IACpC,gBAAgB,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC;QACxC,YAAY,EAAE,EAAE;QAChB,kBAAkB,EAAE,EAAE;QACtB,aAAa,EAAE,EAAE;QACjB,kBAAkB,EAAE,EAAE;KACvB,CAAC;CACH,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,iBAAiB,EAAE,GAAG,EAAE,CAAC,CAAC;IAChC,SAAS,EAAE;QACT;YACE,IAAI,EAAE,SAAS;YACf,SAAS,EAAE;gBACT,cAAc,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,EAAE,OAAO,EAAE,mBAAmB,EAAE,QAAQ,EAAE,EAAE,EAAE,CAAC;aAC1F;YACD,MAAM,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,CAAC;SACxC;KACF;CACF,CAAC,CAAC,CAAC;AAEJ,yBAAyB;AACzB,MAAM,cAAc,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC;AAC5D,MAAM,mBAAmB,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC;AAEjE,EAAE,CAAC,IAAI,CAAC,SAAS,EAAE,GAAG,EAAE,CAAC,CAAC;IACxB,iBAAiB,EAAE,UAAU,OAAqC;QAChE,wCAAwC;QACvC,MAAyC,CAAC,gBAAgB,GAAG,OAAO,CAAC,cAAc,CAAC;QACrF,OAAO;YACL,GAAG,EAAE,cAAc;YACnB,QAAQ,EAAE,mBAAmB;SAC9B,CAAC;IACJ,CAAC;CACF,CAAC,CAAC,CAAC;AAEJ,uBAAuB;AACvB,OAAO,EAAE,cAAc,EAAgB,MAAM,sBAAsB,CAAC;AAEpE,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,IAAI,OAAuB,CAAC;IAE5B,UAAU,CAAC,GAAG,EAAE;QACd,EAAE,CAAC,aAAa,EAAE,CAAC;QACnB,OAAO,GAAG,IAAI,cAAc,EAAE,CAAC;QAC/B,gBAAgB,CAAC,gBAAgB,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;QACzD,gBAAgB,CAAC,YAAY,CAAC,iBAAiB,CAAC,EAAE,CAAC,CAAC;IACtD,CAAC,CAAC,CA
AC;IAEH,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;QAC3B,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;YAC/C,MAAM,CAAC,OAAO,CAAC,CAAC,WAAW,EAAE,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uDAAuD,EAAE,GAAG,EAAE;YAC/D,MAAM,aAAa,GAAG,IAAI,cAAc,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;YAClD,MAAM,CAAC,aAAa,CAAC,CAAC,WAAW,EAAE,CAAC;QACtC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;YACzC,MAAM,UAAU,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3B,MAAM,eAAe,GAAG,IAAI,cAAc,CAAC,CAAC,EAAE,IAAI,EAAE,UAAU,CAAC,CAAC;YAChE,MAAM,CAAC,eAAe,CAAC,CAAC,WAAW,EAAE,CAAC;QACxC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;QAC/B,EAAE,CAAC,6BAA6B,EAAE,GAAG,EAAE;YACrC,MAAM,KAAK,GAAiB;gBAC1B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,aAAa,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC;aAClF,CAAC;YAEF,OAAO,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;YAE/B,yBAAyB;YACzB,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,MAAM,KAAK,GAAiB;gBAC1B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,aAAa,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC;gBACjF,OAAO,EAAE;oBACP;wBACE,MAAM,EAAE,qBAAqB;wBAC7B,YAAY,EAAE,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;qBAChD;iBACF;aACF,CAAC;YAEF,OAAO,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;YAC/B,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,OAAO,EAAE,GAAG,EAAE;QACrB,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,qEAAqE;YACrE,cAAc,CAAC,qBAAqB,CAAC,SAAS,CAAC,CAAC;YAChD,gBAAgB,CAAC,YAAY,CAAC,qBAAqB,CAAC,EAAE,CAAC,CAAC;YAExD,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,EAAE,CAAC;gBACrE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,oBAAoB,CAAC,0BAA0B,EAAE,SAAS,CAAC,CAAC;QAClG,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;YACvD,MAAM,WAAW,GAAkB;gBACjC,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE;gBACxF,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,E
AAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE;aACzF,CAAC;YAEF,yEAAyE;YACzE,uFAAuF;YACvF,gBAAgB,CAAC,YAAY,CAAC,qBAAqB,CAAC,WAAW,CAAC,CAAC;YAEjE,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;gBAChE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;gBAC3D,uBAAuB;YACzB,CAAC;YAED,MAAM,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,gBAAgB,EAAE,CAAC;QACtD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,sDAAsD,EAAE,KAAK,IAAI,EAAE;YACpE,MAAM,WAAW,GAAkB,CAAC,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;YAE9H,gBAAgB,CAAC,gBAAgB,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;YACnF,gBAAgB,CAAC,YAAY,CAAC,qBAAqB,CAAC,WAAW,CAAC,CAAC,qBAAqB,CAAC,EAAE,CAAC,CAAC;YAE3F,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;gBAChE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;YACpD,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,EAAE,CAAC;gBACtE,uBAAuB;YACzB,CAAC;YAED,qCAAqC;YACrC,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,oBAAoB,CAAC,gCAAgC,EAAE,SAAS,CAAC,CAAC;QACxG,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,OAAO,EAAE,GAAG,EAAE;QACrB,EAAE,CAAC,yBAAyB,EAAE,KAAK,IAAI,EAAE;YACvC,qFAAqF;YACrF,IAAI,UAAsB,CAAC;YAC3B,MAAM,UAAU,GAAG,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;gBAC/C,UAAU,GAAG,OAAO,CAAC;YACvB,CAAC,CAAC,CAAC;YACH,cAAc,CAAC,mBAAmB,CAAC,UAAU,CAAC,CAAC;YAE/C,qCAAqC;YACrC,MAAM,gBAAgB,GAAG,IAAI,cAAc,EAAE,CAAC;YAE9C,2DAA2D;YAC3D,MAAM,SAAS,GAAG,gBAAgB,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;YAEhE,wEAAwE;YACxE,MAAM,kBAAkB,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC;YAE5C,0DAA0D;YAC1D,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAA
U,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,CAAC;YAExD,+CAA+C;YAC/C,gBAAgB,CAAC,KAAK,EAAE,CAAC;YAEzB,qDAAqD;YACrD,UAAW,EAAE,CAAC;YAEd,qCAAqC;YACrC,MAAM,kBAAkB,CAAC;YACzB,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;gBACrC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,mBAAmB,CAAC,CAAC,gBAAgB,EAAE,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAClC,EAAE,CAAC,yDAAyD,EAAE,KAAK,IAAI,EAAE;YACvE,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,oCAAoC,CAAC,EAAE,CAAC;gBAC1E,uBAAuB;YACzB,CAAC;YAED,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,oBAAoB,CAAC,oCAAoC,EAAE,SAAS,CAAC,CAAC;QAC5G,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,mDAAmD,EAAE,KAAK,IAAI,EAAE;YACjE,OAAO,CAAC,aAAa,CAAC,WAAW,CAAC,CAAC;YAEnC,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,8BAA8B,CAAC,EAAE,CAAC;gBACpE,uBAAuB;YACzB,CAAC;YAED,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,oBAAoB,CAAC,8BAA8B,EAAE,WAAW,CAAC,CAAC;QACxG,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;QAC9B,EAAE,CAAC,sDAAsD,EAAE,KAAK,IAAI,EAAE;YACpE,MAAM,KAAK,GAAiB;gBAC1B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,aAAa,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC;aACjF,CAAC;YAEF,OAAO,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;YAE/B,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;gBAC3D,uBAAuB;YACzB,CAAC;YAED,oEAAoE;YACpE,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,gBAAgB,EAAE,CAAC;QACzD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,uBAAuB,EAAE,GAAG,EAAE;QACrC,oEAAoE;QACpE,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,qDAAqD;YACrD,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;gBAC3D,uBAAuB;YACzB,CAAC;YAED,wCAAwC;YACxC,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,gBAAgB,EAAE,CAAC;QACzD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;QAC9B,EAAE,CAAC,yBAAyB,EAAE,KAAK,IAAI,EAAE;YACvC,cAAc,CAAC,qBAAqB,CAAC,IAAI,KAAK,CAAC,cAAc,CAAC,CAAC,CAAC;YAEhE,MAAM,MAAM,CAAC,KAAK,IAAI,
EAAE;gBACtB,6DAA6D;gBAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;oBAC3D,uBAAuB;gBACzB,CAAC;YACH,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;YAEnC,MAAM,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,gBAAgB,EAAE,CAAC;QACtD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;YACrD,4CAA4C;YAC5C,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;gBAC3D,uBAAuB;YACzB,CAAC;YAED,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,gBAAgB,EAAE,CAAC;QACzD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,wBAAwB,EAAE,GAAG,EAAE;IACtC,EAAE,CAAC,mDAAmD,EAAE,GAAG,EAAE;QAC3D,MAAM,KAAK,GAAiB;YAC1B,OAAO,EAAE;gBACP;oBACE,IAAI,EAAE,SAAS;oBACf,KAAK,EAAE,KAAK;oBACZ,MAAM,EAAE,aAAa;oBACrB,IAAI,EAAE,GAAG;oBACT,OAAO,EAAE,UAAU;oBACnB,QAAQ,EAAE,IAAI;oBACd,MAAM,EAAE,IAAI;oBACZ,QAAQ,EAAE,QAAQ;iBACnB;aACF;SACF,CAAC;QAEF,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACtC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,KAAK,GAAiB;YAC1B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC;SAC1E,CAAC;QAEF,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,aAAa,EAAE,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,KAAK,GAAiB;YAC1B,OAAO,EAAE,EAAE;YACX,OAAO,EAAE;gBACP;oBACE,MAAM,EAAE,qBAAqB;oBAC7B,YAAY,EAAE;wBACZ,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE;wBACjC,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE;qBAClC;iBACF;aACF;SACF,CAAC;QAEF,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACtC,MAAM,CAAC,KAAK,CAAC,OAAQ,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1,4 @@
1
+ import { ContentExtractor, ExtractedContent } from './content-extractor-types.js';
2
+ export declare class DefaultExtractor implements ContentExtractor {
3
+ extractContent(document: Document): Promise<ExtractedContent>;
4
+ }
@@ -0,0 +1,26 @@
1
+ export class DefaultExtractor {
2
+ async extractContent(document) {
3
+ // Remove common non-content elements
4
+ document.querySelectorAll('style, script, nav, header, footer').forEach((el) => el.remove());
5
+ // Get main content
6
+ const main = document.querySelector('main, article, [role="main"]');
7
+ const contentElement = main ? main.cloneNode(true) : document.body;
8
+ // Extract title and description
9
+ const title = contentElement.querySelector('h1')?.textContent?.trim();
10
+ const firstParagraph = contentElement.querySelector('h1 + p')?.textContent?.trim();
11
+ return {
12
+ content: contentElement.textContent?.trim() || '',
13
+ metadata: {
14
+ type: 'overview',
15
+ pattern: {
16
+ name: title || '',
17
+ type: 'component',
18
+ description: firstParagraph || '',
19
+ usageContexts: [],
20
+ relatedPatterns: [],
21
+ },
22
+ },
23
+ };
24
+ }
25
+ }
26
+ //# sourceMappingURL=default-extractor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"default-extractor.js","sourceRoot":"","sources":["../../src/crawler/default-extractor.ts"],"names":[],"mappings":"AAEA,MAAM,OAAO,gBAAgB;IAC3B,KAAK,CAAC,cAAc,CAAC,QAAkB;QACrC,qCAAqC;QACrC,QAAQ,CAAC,gBAAgB,CAAC,oCAAoC,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;QAE7F,mBAAmB;QACnB,MAAM,IAAI,GAAG,QAAQ,CAAC,aAAa,CAAC,8BAA8B,CAAC,CAAC;QACpE,MAAM,cAAc,GAAG,IAAI,CAAC,CAAC,CAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAa,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC;QAEhF,gCAAgC;QAChC,MAAM,KAAK,GAAG,cAAc,CAAC,aAAa,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QACtE,MAAM,cAAc,GAAG,cAAc,CAAC,aAAa,CAAC,QAAQ,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAEnF,OAAO;YACL,OAAO,EAAE,cAAc,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE;YACjD,QAAQ,EAAE;gBACR,IAAI,EAAE,UAAU;gBAChB,OAAO,EAAE;oBACP,IAAI,EAAE,KAAK,IAAI,EAAE;oBACjB,IAAI,EAAE,WAAW;oBACjB,WAAW,EAAE,cAAc,IAAI,EAAE;oBACjC,aAAa,EAAE,EAAE;oBACjB,eAAe,EAAE,EAAE;iBACpB;aACF;SACF,CAAC;IACJ,CAAC;CACF"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,200 @@
1
+ import { DefaultExtractor } from './default-extractor.js';
2
+ import { JSDOM } from 'jsdom';
3
+ describe('DefaultExtractor', () => {
4
+ let extractor;
5
+ beforeEach(() => {
6
+ extractor = new DefaultExtractor();
7
+ });
8
+ function createDocument(html) {
9
+ const dom = new JSDOM(html);
10
+ return dom.window.document;
11
+ }
12
+ describe('extractContent', () => {
13
+ it('should extract content from main element', async () => {
14
+ const html = `
15
+ <html>
16
+ <body>
17
+ <nav>Navigation</nav>
18
+ <main>
19
+ <h1>Main Title</h1>
20
+ <p>Main content here</p>
21
+ </main>
22
+ <footer>Footer</footer>
23
+ </body>
24
+ </html>
25
+ `;
26
+ const doc = createDocument(html);
27
+ const result = await extractor.extractContent(doc);
28
+ expect(result.content).toContain('Main Title');
29
+ expect(result.content).toContain('Main content here');
30
+ expect(result.content).not.toContain('Navigation');
31
+ expect(result.content).not.toContain('Footer');
32
+ });
33
+ it('should extract content from article element', async () => {
34
+ const html = `
35
+ <html>
36
+ <body>
37
+ <header>Header</header>
38
+ <article>
39
+ <h1>Article Title</h1>
40
+ <p>Article content</p>
41
+ </article>
42
+ </body>
43
+ </html>
44
+ `;
45
+ const doc = createDocument(html);
46
+ const result = await extractor.extractContent(doc);
47
+ expect(result.content).toContain('Article Title');
48
+ expect(result.content).toContain('Article content');
49
+ expect(result.content).not.toContain('Header');
50
+ });
51
+ it('should extract content from role="main" element', async () => {
52
+ const html = `
53
+ <html>
54
+ <body>
55
+ <nav>Nav</nav>
56
+ <div role="main">
57
+ <h1>Role Main Title</h1>
58
+ <p>Role main content</p>
59
+ </div>
60
+ </body>
61
+ </html>
62
+ `;
63
+ const doc = createDocument(html);
64
+ const result = await extractor.extractContent(doc);
65
+ expect(result.content).toContain('Role Main Title');
66
+ expect(result.content).toContain('Role main content');
67
+ expect(result.content).not.toContain('Nav');
68
+ });
69
+ it('should fall back to body when no main content element', async () => {
70
+ const html = `
71
+ <html>
72
+ <body>
73
+ <div>
74
+ <h1>Page Title</h1>
75
+ <p>Page content</p>
76
+ </div>
77
+ </body>
78
+ </html>
79
+ `;
80
+ const doc = createDocument(html);
81
+ const result = await extractor.extractContent(doc);
82
+ expect(result.content).toContain('Page Title');
83
+ expect(result.content).toContain('Page content');
84
+ });
85
+ it('should remove script and style elements', async () => {
86
+ const html = `
87
+ <html>
88
+ <body>
89
+ <style>.hidden { display: none; }</style>
90
+ <script>console.log('secret');</script>
91
+ <main>
92
+ <h1>Visible Content</h1>
93
+ </main>
94
+ </body>
95
+ </html>
96
+ `;
97
+ const doc = createDocument(html);
98
+ const result = await extractor.extractContent(doc);
99
+ expect(result.content).toContain('Visible Content');
100
+ expect(result.content).not.toContain('hidden');
101
+ expect(result.content).not.toContain('secret');
102
+ });
103
+ it('should extract title from h1', async () => {
104
+ const html = `
105
+ <html>
106
+ <body>
107
+ <main>
108
+ <h1>Component Name</h1>
109
+ <p>Description paragraph</p>
110
+ </main>
111
+ </body>
112
+ </html>
113
+ `;
114
+ const doc = createDocument(html);
115
+ const result = await extractor.extractContent(doc);
116
+ expect(result.metadata.pattern?.name).toBe('Component Name');
117
+ });
118
+ it('should extract description from first paragraph after h1', async () => {
119
+ const html = `
120
+ <html>
121
+ <body>
122
+ <main>
123
+ <h1>Component Name</h1>
124
+ <p>This is the component description.</p>
125
+ <p>This is additional content.</p>
126
+ </main>
127
+ </body>
128
+ </html>
129
+ `;
130
+ const doc = createDocument(html);
131
+ const result = await extractor.extractContent(doc);
132
+ expect(result.metadata.pattern?.description).toBe('This is the component description.');
133
+ });
134
+ it('should return overview type metadata', async () => {
135
+ const html = '<html><body><main><h1>Test</h1></main></body></html>';
136
+ const doc = createDocument(html);
137
+ const result = await extractor.extractContent(doc);
138
+ expect(result.metadata.type).toBe('overview');
139
+ });
140
+ it('should return component type pattern', async () => {
141
+ const html = '<html><body><main><h1>Test</h1></main></body></html>';
142
+ const doc = createDocument(html);
143
+ const result = await extractor.extractContent(doc);
144
+ expect(result.metadata.pattern?.type).toBe('component');
145
+ });
146
+ it('should handle empty document', async () => {
147
+ const html = '<html><body></body></html>';
148
+ const doc = createDocument(html);
149
+ const result = await extractor.extractContent(doc);
150
+ expect(result.content).toBe('');
151
+ expect(result.metadata.type).toBe('overview');
152
+ });
153
+ it('should handle document with only whitespace', async () => {
154
+ const html = '<html><body> \n\n </body></html>';
155
+ const doc = createDocument(html);
156
+ const result = await extractor.extractContent(doc);
157
+ expect(result.content).toBe('');
158
+ });
159
+ it('should initialize usageContexts and relatedPatterns as empty arrays', async () => {
160
+ const html = '<html><body><main><h1>Test</h1></main></body></html>';
161
+ const doc = createDocument(html);
162
+ const result = await extractor.extractContent(doc);
163
+ expect(result.metadata.pattern?.usageContexts).toEqual([]);
164
+ expect(result.metadata.pattern?.relatedPatterns).toEqual([]);
165
+ });
166
+ it('should handle missing h1', async () => {
167
+ const html = `
168
+ <html>
169
+ <body>
170
+ <main>
171
+ <h2>Subheading</h2>
172
+ <p>Content here</p>
173
+ </main>
174
+ </body>
175
+ </html>
176
+ `;
177
+ const doc = createDocument(html);
178
+ const result = await extractor.extractContent(doc);
179
+ expect(result.metadata.pattern?.name).toBe('');
180
+ expect(result.content).toContain('Content here');
181
+ });
182
+ it('should handle missing description paragraph', async () => {
183
+ const html = `
184
+ <html>
185
+ <body>
186
+ <main>
187
+ <h1>Title Only</h1>
188
+ <div>Some div content</div>
189
+ </main>
190
+ </body>
191
+ </html>
192
+ `;
193
+ const doc = createDocument(html);
194
+ const result = await extractor.extractContent(doc);
195
+ expect(result.metadata.pattern?.name).toBe('Title Only');
196
+ expect(result.metadata.pattern?.description).toBe('');
197
+ });
198
+ });
199
+ });
200
+ //# sourceMappingURL=default-extractor.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"default-extractor.test.js","sourceRoot":"","sources":["../../src/crawler/default-extractor.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAE9B,QAAQ,CAAC,kBAAkB,EAAE,GAAG,EAAE;IAChC,IAAI,SAA2B,CAAC;IAEhC,UAAU,CAAC,GAAG,EAAE;QACd,SAAS,GAAG,IAAI,gBAAgB,EAAE,CAAC;IACrC,CAAC,CAAC,CAAC;IAEH,SAAS,cAAc,CAAC,IAAY;QAClC,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC;QAC5B,OAAO,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;IAC7B,CAAC;IAED,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;QAC9B,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,MAAM,IAAI,GAAG;;;;;;;;;;;OAWZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;YAC/C,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,mBAAmB,CAAC,CAAC;YACtD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;YACnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,MAAM,IAAI,GAAG;;;;;;;;;;OAUZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;YAClD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;YACpD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iDAAiD,EAAE,KAAK,IAAI,EAAE;YAC/D,MAAM,IAAI,GAAG;;;;;;;;;;OAUZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;YACpD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,mBAAmB,CAAC,CAAC;YACtD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;QAC9C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uDAAuD,EAAE,KAAK,IAAI,EAAE;YACrE,MAAM,IAAI,GAAG;;;;;;;;;OASZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,C
AAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;YAC/C,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC;QACnD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;YACvD,MAAM,IAAI,GAAG;;;;;;;;;;OAUZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;YACpD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;YAC/C,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8BAA8B,EAAE,KAAK,IAAI,EAAE;YAC5C,MAAM,IAAI,GAAG;;;;;;;;;OASZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAC/D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0DAA0D,EAAE,KAAK,IAAI,EAAE;YACxE,MAAM,IAAI,GAAG;;;;;;;;;;OAUZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC,IAAI,CAAC,oCAAoC,CAAC,CAAC;QAC1F,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;YACpD,MAAM,IAAI,GAAG,sDAAsD,CAAC;YACpE,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAChD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;YACpD,MAAM,IAAI,GAAG,sDAAsD,CAAC;YACpE,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAC1D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8BAA8B,EAAE,KAAK,IAAI,EAAE;YAC5C,MAAM,IAAI,GAAG,4BAA4B,CAAC;YAC1C,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM
,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAChC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAChD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,MAAM,IAAI,GAAG,sCAAsC,CAAC;YACpD,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,qEAAqE,EAAE,KAAK,IAAI,EAAE;YACnF,MAAM,IAAI,GAAG,sDAAsD,CAAC;YACpE,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YAC3D,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;QAC/D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0BAA0B,EAAE,KAAK,IAAI,EAAE;YACxC,MAAM,IAAI,GAAG;;;;;;;;;OASZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAC/C,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC;QACnD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,MAAM,IAAI,GAAG;;;;;;;;;OASZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YACzD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACxD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1,11 @@
1
+ import { CrawlResult } from '../types.js';
2
+ import { BaseCrawler } from './base.js';
3
+ export declare class DefaultCrawler extends BaseCrawler {
4
+ private readonly BATCH_SIZE;
5
+ private readonly FETCH_TIMEOUT;
6
+ constructor(maxDepth?: number, maxRequestsPerCrawl?: number, onProgress?: (progress: number, description: string) => void);
7
+ crawl(url: string): AsyncGenerator<CrawlResult, void, unknown>;
8
+ private processPageWithRetry;
9
+ private extractLinks;
10
+ private extractTitle;
11
+ }