@staticn0va/wigolo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215) hide show
  1. package/LICENSE +74 -0
  2. package/README.md +272 -0
  3. package/dist/cache/db.d.ts +5 -0
  4. package/dist/cache/db.d.ts.map +1 -0
  5. package/dist/cache/db.js +97 -0
  6. package/dist/cache/db.js.map +1 -0
  7. package/dist/cache/store.d.ts +26 -0
  8. package/dist/cache/store.d.ts.map +1 -0
  9. package/dist/cache/store.js +214 -0
  10. package/dist/cache/store.js.map +1 -0
  11. package/dist/cli/daemon.d.ts +2 -0
  12. package/dist/cli/daemon.d.ts.map +1 -0
  13. package/dist/cli/daemon.js +5 -0
  14. package/dist/cli/daemon.js.map +1 -0
  15. package/dist/cli/health.d.ts +2 -0
  16. package/dist/cli/health.d.ts.map +1 -0
  17. package/dist/cli/health.js +5 -0
  18. package/dist/cli/health.js.map +1 -0
  19. package/dist/cli/index.d.ts +7 -0
  20. package/dist/cli/index.d.ts.map +1 -0
  21. package/dist/cli/index.js +9 -0
  22. package/dist/cli/index.js.map +1 -0
  23. package/dist/cli/warmup.d.ts +11 -0
  24. package/dist/cli/warmup.d.ts.map +1 -0
  25. package/dist/cli/warmup.js +107 -0
  26. package/dist/cli/warmup.js.map +1 -0
  27. package/dist/config.d.ts +41 -0
  28. package/dist/config.d.ts.map +1 -0
  29. package/dist/config.js +66 -0
  30. package/dist/config.js.map +1 -0
  31. package/dist/crawl/crawler.d.ts +18 -0
  32. package/dist/crawl/crawler.d.ts.map +1 -0
  33. package/dist/crawl/crawler.js +228 -0
  34. package/dist/crawl/crawler.js.map +1 -0
  35. package/dist/crawl/dedup.d.ts +15 -0
  36. package/dist/crawl/dedup.d.ts.map +1 -0
  37. package/dist/crawl/dedup.js +93 -0
  38. package/dist/crawl/dedup.js.map +1 -0
  39. package/dist/crawl/mapper.d.ts +17 -0
  40. package/dist/crawl/mapper.d.ts.map +1 -0
  41. package/dist/crawl/mapper.js +178 -0
  42. package/dist/crawl/mapper.js.map +1 -0
  43. package/dist/crawl/rate-limiter.d.ts +10 -0
  44. package/dist/crawl/rate-limiter.d.ts.map +1 -0
  45. package/dist/crawl/rate-limiter.js +72 -0
  46. package/dist/crawl/rate-limiter.js.map +1 -0
  47. package/dist/crawl/robots.d.ts +9 -0
  48. package/dist/crawl/robots.d.ts.map +1 -0
  49. package/dist/crawl/robots.js +63 -0
  50. package/dist/crawl/robots.js.map +1 -0
  51. package/dist/crawl/sitemap.d.ts +4 -0
  52. package/dist/crawl/sitemap.d.ts.map +1 -0
  53. package/dist/crawl/sitemap.js +38 -0
  54. package/dist/crawl/sitemap.js.map +1 -0
  55. package/dist/crawl/url-utils.d.ts +3 -0
  56. package/dist/crawl/url-utils.d.ts.map +1 -0
  57. package/dist/crawl/url-utils.js +41 -0
  58. package/dist/crawl/url-utils.js.map +1 -0
  59. package/dist/extraction/defuddle.d.ts +3 -0
  60. package/dist/extraction/defuddle.d.ts.map +1 -0
  61. package/dist/extraction/defuddle.js +26 -0
  62. package/dist/extraction/defuddle.js.map +1 -0
  63. package/dist/extraction/extract.d.ts +5 -0
  64. package/dist/extraction/extract.d.ts.map +1 -0
  65. package/dist/extraction/extract.js +83 -0
  66. package/dist/extraction/extract.js.map +1 -0
  67. package/dist/extraction/jsonld.d.ts +4 -0
  68. package/dist/extraction/jsonld.d.ts.map +1 -0
  69. package/dist/extraction/jsonld.js +64 -0
  70. package/dist/extraction/jsonld.js.map +1 -0
  71. package/dist/extraction/markdown.d.ts +10 -0
  72. package/dist/extraction/markdown.d.ts.map +1 -0
  73. package/dist/extraction/markdown.js +107 -0
  74. package/dist/extraction/markdown.js.map +1 -0
  75. package/dist/extraction/pipeline.d.ts +11 -0
  76. package/dist/extraction/pipeline.d.ts.map +1 -0
  77. package/dist/extraction/pipeline.js +95 -0
  78. package/dist/extraction/pipeline.js.map +1 -0
  79. package/dist/extraction/readability.d.ts +3 -0
  80. package/dist/extraction/readability.d.ts.map +1 -0
  81. package/dist/extraction/readability.js +32 -0
  82. package/dist/extraction/readability.js.map +1 -0
  83. package/dist/extraction/schema.d.ts +7 -0
  84. package/dist/extraction/schema.d.ts.map +1 -0
  85. package/dist/extraction/schema.js +86 -0
  86. package/dist/extraction/schema.js.map +1 -0
  87. package/dist/extraction/site-extractors/docs-generic.d.ts +3 -0
  88. package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -0
  89. package/dist/extraction/site-extractors/docs-generic.js +104 -0
  90. package/dist/extraction/site-extractors/docs-generic.js.map +1 -0
  91. package/dist/extraction/site-extractors/github.d.ts +3 -0
  92. package/dist/extraction/site-extractors/github.d.ts.map +1 -0
  93. package/dist/extraction/site-extractors/github.js +107 -0
  94. package/dist/extraction/site-extractors/github.js.map +1 -0
  95. package/dist/extraction/site-extractors/mdn.d.ts +3 -0
  96. package/dist/extraction/site-extractors/mdn.d.ts.map +1 -0
  97. package/dist/extraction/site-extractors/mdn.js +58 -0
  98. package/dist/extraction/site-extractors/mdn.js.map +1 -0
  99. package/dist/extraction/site-extractors/stackoverflow.d.ts +3 -0
  100. package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -0
  101. package/dist/extraction/site-extractors/stackoverflow.js +88 -0
  102. package/dist/extraction/site-extractors/stackoverflow.js.map +1 -0
  103. package/dist/extraction/trafilatura.d.ts +6 -0
  104. package/dist/extraction/trafilatura.d.ts.map +1 -0
  105. package/dist/extraction/trafilatura.js +105 -0
  106. package/dist/extraction/trafilatura.js.map +1 -0
  107. package/dist/fetch/auth.d.ts +8 -0
  108. package/dist/fetch/auth.d.ts.map +1 -0
  109. package/dist/fetch/auth.js +32 -0
  110. package/dist/fetch/auth.js.map +1 -0
  111. package/dist/fetch/browser-pool.d.ts +28 -0
  112. package/dist/fetch/browser-pool.d.ts.map +1 -0
  113. package/dist/fetch/browser-pool.js +138 -0
  114. package/dist/fetch/browser-pool.js.map +1 -0
  115. package/dist/fetch/content-check.d.ts +2 -0
  116. package/dist/fetch/content-check.d.ts.map +1 -0
  117. package/dist/fetch/content-check.js +62 -0
  118. package/dist/fetch/content-check.js.map +1 -0
  119. package/dist/fetch/http-client.d.ts +15 -0
  120. package/dist/fetch/http-client.d.ts.map +1 -0
  121. package/dist/fetch/http-client.js +146 -0
  122. package/dist/fetch/http-client.js.map +1 -0
  123. package/dist/fetch/router.d.ts +45 -0
  124. package/dist/fetch/router.d.ts.map +1 -0
  125. package/dist/fetch/router.js +89 -0
  126. package/dist/fetch/router.js.map +1 -0
  127. package/dist/index.d.ts +3 -0
  128. package/dist/index.d.ts.map +1 -0
  129. package/dist/index.js +22 -0
  130. package/dist/index.js.map +1 -0
  131. package/dist/logger.d.ts +10 -0
  132. package/dist/logger.d.ts.map +1 -0
  133. package/dist/logger.js +39 -0
  134. package/dist/logger.js.map +1 -0
  135. package/dist/search/dedup.d.ts +10 -0
  136. package/dist/search/dedup.d.ts.map +1 -0
  137. package/dist/search/dedup.js +35 -0
  138. package/dist/search/dedup.js.map +1 -0
  139. package/dist/search/engines/bing.d.ts +7 -0
  140. package/dist/search/engines/bing.d.ts.map +1 -0
  141. package/dist/search/engines/bing.js +48 -0
  142. package/dist/search/engines/bing.js.map +1 -0
  143. package/dist/search/engines/duckduckgo.d.ts +7 -0
  144. package/dist/search/engines/duckduckgo.d.ts.map +1 -0
  145. package/dist/search/engines/duckduckgo.js +50 -0
  146. package/dist/search/engines/duckduckgo.js.map +1 -0
  147. package/dist/search/engines/startpage.d.ts +7 -0
  148. package/dist/search/engines/startpage.d.ts.map +1 -0
  149. package/dist/search/engines/startpage.js +50 -0
  150. package/dist/search/engines/startpage.js.map +1 -0
  151. package/dist/search/filters.d.ts +16 -0
  152. package/dist/search/filters.d.ts.map +1 -0
  153. package/dist/search/filters.js +63 -0
  154. package/dist/search/filters.js.map +1 -0
  155. package/dist/search/flashrank.d.ts +12 -0
  156. package/dist/search/flashrank.d.ts.map +1 -0
  157. package/dist/search/flashrank.js +63 -0
  158. package/dist/search/flashrank.js.map +1 -0
  159. package/dist/search/query.d.ts +2 -0
  160. package/dist/search/query.d.ts.map +1 -0
  161. package/dist/search/query.js +41 -0
  162. package/dist/search/query.js.map +1 -0
  163. package/dist/search/rerank.d.ts +3 -0
  164. package/dist/search/rerank.d.ts.map +1 -0
  165. package/dist/search/rerank.js +40 -0
  166. package/dist/search/rerank.js.map +1 -0
  167. package/dist/search/searxng.d.ts +8 -0
  168. package/dist/search/searxng.d.ts.map +1 -0
  169. package/dist/search/searxng.js +87 -0
  170. package/dist/search/searxng.js.map +1 -0
  171. package/dist/search/validator.d.ts +6 -0
  172. package/dist/search/validator.d.ts.map +1 -0
  173. package/dist/search/validator.js +35 -0
  174. package/dist/search/validator.js.map +1 -0
  175. package/dist/searxng/bootstrap.d.ts +18 -0
  176. package/dist/searxng/bootstrap.d.ts.map +1 -0
  177. package/dist/searxng/bootstrap.js +136 -0
  178. package/dist/searxng/bootstrap.js.map +1 -0
  179. package/dist/searxng/docker.d.ts +9 -0
  180. package/dist/searxng/docker.d.ts.map +1 -0
  181. package/dist/searxng/docker.js +67 -0
  182. package/dist/searxng/docker.js.map +1 -0
  183. package/dist/searxng/process.d.ts +23 -0
  184. package/dist/searxng/process.d.ts.map +1 -0
  185. package/dist/searxng/process.js +188 -0
  186. package/dist/searxng/process.js.map +1 -0
  187. package/dist/server.d.ts +2 -0
  188. package/dist/server.d.ts.map +1 -0
  189. package/dist/server.js +311 -0
  190. package/dist/server.js.map +1 -0
  191. package/dist/tools/cache.d.ts +3 -0
  192. package/dist/tools/cache.d.ts.map +1 -0
  193. package/dist/tools/cache.js +50 -0
  194. package/dist/tools/cache.js.map +1 -0
  195. package/dist/tools/crawl.d.ts +6 -0
  196. package/dist/tools/crawl.d.ts.map +1 -0
  197. package/dist/tools/crawl.js +97 -0
  198. package/dist/tools/crawl.js.map +1 -0
  199. package/dist/tools/extract.d.ts +4 -0
  200. package/dist/tools/extract.d.ts.map +1 -0
  201. package/dist/tools/extract.js +69 -0
  202. package/dist/tools/extract.js.map +1 -0
  203. package/dist/tools/fetch.d.ts +4 -0
  204. package/dist/tools/fetch.d.ts.map +1 -0
  205. package/dist/tools/fetch.js +76 -0
  206. package/dist/tools/fetch.js.map +1 -0
  207. package/dist/tools/search.d.ts +4 -0
  208. package/dist/tools/search.d.ts.map +1 -0
  209. package/dist/tools/search.js +160 -0
  210. package/dist/tools/search.js.map +1 -0
  211. package/dist/types.d.ts +222 -0
  212. package/dist/types.d.ts.map +1 -0
  213. package/dist/types.js +2 -0
  214. package/dist/types.js.map +1 -0
  215. package/package.json +61 -0
@@ -0,0 +1,228 @@
1
+ import { matchesPatterns } from './url-utils.js';
2
+ import { RateLimiter } from './rate-limiter.js';
3
+ import { RobotsParser } from './robots.js';
4
+ import { parseSitemap, parseSitemapIndex, extractSitemapUrlFromRobots } from './sitemap.js';
5
+ import { getConfig } from '../config.js';
6
+ import { createLogger } from '../logger.js';
7
+ const log = createLogger('crawl');
8
/**
 * Crawls a single origin and collects the extracted pages.
 *
 * Supported `input.strategy` values: `'bfs'` (default), `'dfs'`, `'sitemap'`
 * (URLs come from the site's sitemaps, falling back to BFS when none is found)
 * and `'map'`, which is traversed as BFS.
 *
 * `fetchFn(url)` must resolve to an object exposing `url`, `title`, `markdown`,
 * `links` and an optional `error`. `rawFetchFn(url)` must resolve to an object
 * exposing `statusCode` and `html`; it is used for robots.txt and sitemaps.
 */
export class Crawler {
    fetchFn;
    rawFetchFn;
    rateLimiter = new RateLimiter();
    /** robots.txt body fetched for the crawl in progress; reused for sitemap discovery. */
    robotsTxtContent = null;
    constructor(fetchFn, rawFetchFn) {
        this.fetchFn = fetchFn;
        this.rawFetchFn = rawFetchFn;
    }
    /**
     * Runs a crawl starting at `input.url`.
     *
     * Defaults: strategy `'bfs'`, `max_depth` 2, `max_pages` 20.
     *
     * @param input Crawl options (`url`, `strategy`, `max_depth`, `max_pages`,
     *   `include_patterns`, `exclude_patterns`, `extract_links`).
     * @returns `{ pages, total_found, crawled }`, plus `links` when
     *   `input.extract_links` is truthy.
     * @throws {TypeError} If `input.url` is not a valid absolute URL.
     */
    async crawl(input) {
        const strategy = input.strategy ?? 'bfs';
        const maxDepth = input.max_depth ?? 2;
        const maxPages = input.max_pages ?? 20;
        const seedOrigin = new URL(input.url).origin;
        // Forget robots.txt content from an earlier crawl() on this instance so a
        // different origin (or a failed robots fetch) never reuses stale sitemap hints.
        this.robotsTxtContent = null;
        const config = getConfig();
        let robotsParser = null;
        if (config.respectRobotsTxt) {
            robotsParser = await this.fetchRobots(seedOrigin);
        }
        if (strategy === 'sitemap') {
            return this.crawlSitemap(input, seedOrigin, maxPages, robotsParser);
        }
        const traversalStrategy = strategy === 'map' ? 'bfs' : strategy;
        return this.crawlTraversal(input, seedOrigin, maxDepth, maxPages, traversalStrategy, robotsParser);
    }
    /**
     * Fetches `<origin>/robots.txt`, remembers its body and registers any
     * crawl delay it declares with the rate limiter.
     *
     * @returns A `RobotsParser`, or `null` when the file is missing, empty or
     *   could not be fetched (best effort: failures are only logged).
     */
    async fetchRobots(origin) {
        try {
            const result = await this.rawFetchFn(`${origin}/robots.txt`);
            if (result.statusCode === 200 && result.html) {
                this.robotsTxtContent = result.html;
                const parser = new RobotsParser(result.html);
                const crawlDelay = parser.getCrawlDelay();
                if (crawlDelay !== null) {
                    const domain = new URL(origin).hostname;
                    this.rateLimiter.setRobotsCrawlDelay(domain, crawlDelay);
                }
                return parser;
            }
        }
        catch {
            log.debug('Could not fetch robots.txt', { origin });
        }
        return null;
    }
    /**
     * Tells whether `url` may be fetched under the given robots rules.
     * Always `true` without a parser; a URL that cannot be parsed is treated
     * as disallowed instead of throwing.
     */
    isAllowedByRobots(url, robotsParser) {
        if (!robotsParser)
            return true;
        try {
            return robotsParser.isAllowed(new URL(url).pathname);
        }
        catch {
            return false;
        }
    }
    /**
     * Link-following crawl (BFS or DFS) limited to `seedOrigin`.
     *
     * @param strategy `'dfs'` consumes the frontier as a stack; anything else as a queue.
     * @returns Crawl result; `total_found` counts every unique URL discovered,
     *   including ones still waiting in the frontier.
     */
    async crawlTraversal(input, seedOrigin, maxDepth, maxPages, strategy, robotsParser) {
        const visited = new Set([input.url]);
        const pages = [];
        const allLinks = [];
        // Frontier of [url, depth] pairs.
        const queue = [[input.url, 0]];
        while (queue.length > 0 && pages.length < maxPages) {
            const [url, depth] = strategy === 'dfs' ? queue.pop() : queue.shift();
            if (!this.isAllowedByRobots(url, robotsParser)) {
                log.debug('Blocked by robots.txt', { url });
                continue;
            }
            const release = await this.rateLimiter.acquire(url);
            let fetchResult;
            try {
                fetchResult = await this.fetchFn(url);
            }
            catch (err) {
                log.warn('Fetch failed during crawl', { url, error: String(err) });
                continue;
            }
            finally {
                // Runs on success and failure alike, so the slot is released exactly once.
                release();
            }
            if (fetchResult.error) {
                log.warn('Fetch returned error', { url, error: fetchResult.error });
                continue;
            }
            pages.push({
                url: fetchResult.url,
                title: fetchResult.title,
                markdown: fetchResult.markdown,
                depth,
            });
            if (depth < maxDepth) {
                const newLinks = this.filterLinks(fetchResult.links, seedOrigin, visited, input.include_patterns, input.exclude_patterns, robotsParser);
                // filterLinks puts documentation pages first. A stack pops the LAST
                // pushed entry, so push in reverse for DFS to keep that priority.
                const ordered = strategy === 'dfs' ? [...newLinks].reverse() : newLinks;
                for (const link of ordered) {
                    visited.add(link);
                    queue.push([link, depth + 1]);
                }
                // NOTE(review): link edges are only recorded for pages above max depth,
                // whereas crawlSitemap records them for every page — confirm intent.
                if (input.extract_links) {
                    for (const link of fetchResult.links) {
                        allLinks.push({ from: url, to: link });
                    }
                }
            }
        }
        return {
            pages,
            total_found: visited.size,
            crawled: pages.length,
            ...(input.extract_links ? { links: allLinks } : {}),
        };
    }
    /**
     * Selects the links worth enqueueing: same origin, not yet visited, matching
     * the include/exclude patterns and allowed by robots.txt. Each URL is
     * returned at most once even if the page links to it several times.
     *
     * @returns A new array with documentation-looking URLs first.
     */
    filterLinks(links, seedOrigin, visited, includePatterns, excludePatterns, robotsParser) {
        // URLs already examined in this call; prevents queueing (and later
        // fetching) the same page twice when it is linked more than once.
        const seen = new Set();
        const filtered = links.filter((link) => {
            if (seen.has(link) || visited.has(link))
                return false;
            seen.add(link);
            try {
                const parsed = new URL(link);
                if (parsed.origin !== seedOrigin)
                    return false;
                if (!matchesPatterns(link, includePatterns, excludePatterns))
                    return false;
                if (robotsParser && !robotsParser.isAllowed(parsed.pathname))
                    return false;
                return true;
            }
            catch {
                // Unparseable URL.
                return false;
            }
        });
        // Prioritize documentation pages over marketing/nav pages.
        return filtered.sort((a, b) => {
            const aDoc = isDocPage(a) ? 0 : 1;
            const bDoc = isDocPage(b) ? 0 : 1;
            return aDoc - bDoc;
        });
    }
    /**
     * Crawls the URLs listed in the site's sitemaps (all at depth 0). Falls back
     * to a BFS traversal when no sitemap URL can be discovered.
     *
     * @returns Crawl result; `total_found` is the number of unique sitemap URLs
     *   that match the include/exclude patterns.
     */
    async crawlSitemap(input, seedOrigin, maxPages, robotsParser) {
        // Pass the already-fetched robots.txt content so it is not requested again.
        const discovered = await this.discoverSitemapUrls(seedOrigin, this.robotsTxtContent);
        if (discovered.length === 0) {
            log.info('No sitemap found, falling back to BFS');
            return this.crawlTraversal(input, seedOrigin, input.max_depth ?? 2, maxPages, 'bfs', robotsParser);
        }
        // The same URL can be listed by several sitemaps; fetch each only once.
        const sitemapUrls = [...new Set(discovered)].filter((url) => matchesPatterns(url, input.include_patterns, input.exclude_patterns));
        const totalFound = sitemapUrls.length;
        // Apply robots rules BEFORE the page cap so blocked entries do not use up the budget.
        const toFetch = sitemapUrls
            .filter((url) => this.isAllowedByRobots(url, robotsParser))
            .slice(0, maxPages);
        const pages = [];
        const allLinks = [];
        for (const url of toFetch) {
            const release = await this.rateLimiter.acquire(url);
            try {
                const result = await this.fetchFn(url);
                if (!result.error) {
                    pages.push({ url: result.url, title: result.title, markdown: result.markdown, depth: 0 });
                    if (input.extract_links) {
                        for (const link of result.links) {
                            allLinks.push({ from: url, to: link });
                        }
                    }
                }
            }
            catch (err) {
                log.warn('Sitemap fetch failed', { url, error: String(err) });
            }
            finally {
                // Exactly one release per acquire, whatever happened above.
                release();
            }
        }
        return {
            pages,
            total_found: totalFound,
            crawled: pages.length,
            ...(input.extract_links ? { links: allLinks } : {}),
        };
    }
    /**
     * Collects page URLs from the site's sitemaps.
     *
     * Sitemap locations come from `robotsTxt` when it names any, otherwise from
     * `<origin>/sitemap.xml`. A sitemap index is expanded one level. Failures
     * are logged and skipped (best effort).
     *
     * @returns Every URL found, in discovery order; may contain duplicates.
     */
    async discoverSitemapUrls(origin, robotsTxt) {
        const sitemapLocations = [];
        if (robotsTxt) {
            sitemapLocations.push(...extractSitemapUrlFromRobots(robotsTxt));
        }
        if (sitemapLocations.length === 0) {
            sitemapLocations.push(`${origin}/sitemap.xml`);
        }
        const allUrls = [];
        for (const sitemapUrl of sitemapLocations) {
            try {
                const result = await this.rawFetchFn(sitemapUrl);
                if (result.statusCode !== 200)
                    continue;
                const indexUrls = parseSitemapIndex(result.html);
                if (indexUrls.length === 0) {
                    // Plain sitemap.
                    allUrls.push(...parseSitemap(result.html));
                    continue;
                }
                // Sitemap index: fetch each referenced sub-sitemap.
                for (const subUrl of indexUrls) {
                    try {
                        const subResult = await this.rawFetchFn(subUrl);
                        if (subResult.statusCode === 200) {
                            allUrls.push(...parseSitemap(subResult.html));
                        }
                    }
                    catch (err) {
                        log.debug('Could not fetch sub-sitemap', { url: subUrl, error: String(err) });
                    }
                }
            }
            catch (err) {
                log.debug('Could not fetch sitemap', { url: sitemapUrl, error: String(err) });
            }
        }
        return allUrls;
    }
}
223
/** Path fragments (lower-case, slash-delimited) that mark a URL as documentation. */
const DOC_PATH_PATTERNS = ['/docs/', '/guide/', '/api/', '/reference/'];
/**
 * Tells whether a URL's path contains a documentation segment.
 *
 * The comparison is case-insensitive and only looks at the path (not the
 * query string or fragment). A path that ends exactly at a doc segment, such
 * as `/docs`, counts as well.
 *
 * @param {string} url Absolute URL.
 * @returns {boolean} `true` when the path contains one of `DOC_PATH_PATTERNS`.
 * @throws {TypeError} If `url` is not a valid absolute URL.
 */
function isDocPage(url) {
    const path = new URL(url).pathname.toLowerCase();
    // Close the last segment so "/docs" matches the "/docs/" pattern too.
    const closedPath = path.endsWith('/') ? path : `${path}/`;
    return DOC_PATH_PATTERNS.some((pattern) => closedPath.includes(pattern));
}
228
+ //# sourceMappingURL=crawler.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"crawler.js","sourceRoot":"","sources":["../../src/crawl/crawler.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,YAAY,EAAE,iBAAiB,EAAE,2BAA2B,EAAE,MAAM,cAAc,CAAC;AAC5F,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAE5C,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;AAKlC,MAAM,OAAO,OAAO;IACV,OAAO,CAAU;IACjB,UAAU,CAAa;IACvB,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;IAExC,YAAY,OAAgB,EAAE,UAAsB;QAClD,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,KAAiB;QAC3B,MAAM,QAAQ,GAAG,KAAK,CAAC,QAAQ,IAAI,KAAK,CAAC;QACzC,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,IAAI,CAAC,CAAC;QACtC,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,IAAI,EAAE,CAAC;QAEvC,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC;QAE7C,2CAA2C;QAC3C,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;QAC3B,IAAI,YAAY,GAAwB,IAAI,CAAC;QAC7C,IAAI,MAAM,CAAC,gBAAgB,EAAE,CAAC;YAC5B,YAAY,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC;QACpD,CAAC;QAED,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;YAC3B,OAAO,IAAI,CAAC,YAAY,CAAC,KAAK,EAAE,UAAU,EAAE,QAAQ,EAAE,YAAY,CAAC,CAAC;QACtE,CAAC;QAED,MAAM,iBAAiB,GAAG,QAAQ,KAAK,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,QAAQ,CAAC;QAChE,OAAO,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,UAAU,EAAE,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,EAAE,YAAY,CAAC,CAAC;IACrG,CAAC;IAEO,gBAAgB,GAAkB,IAAI,CAAC;IAEvC,KAAK,CAAC,WAAW,CAAC,MAAc;QACtC,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,GAAG,MAAM,aAAa,CAAC,CAAC;YAC7D,IAAI,MAAM,CAAC,UAAU,KAAK,GAAG,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;gBAC7C,IAAI,CAAC,gBAAgB,GAAG,MAAM,CAAC,IAAI,CAAC;gBACpC,MAAM,MAAM,GAAG,IAAI,YAAY,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;gBAC7C,MAAM,UAAU,GAAG,MAAM,CAAC,aAAa,EAAE,CAAC;gBAC1C,IAAI,UAAU,KAAK,IAAI,EAAE,CAAC;oBACxB,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC;oBACxC,IAAI,CAAC,WAAW,CAAC,mBAAmB,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;gBAC3D,CAAC;gBACD,OAAO,MAAM,CAAC;YAChB,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,GAAG,CA
AC,KAAK,CAAC,4BAA4B,EAAE,EAAE,MAAM,EAAE,CAAC,CAAC;QACtD,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,KAAK,CAAC,cAAc,CAC1B,KAAiB,EACjB,UAAkB,EAClB,QAAgB,EAChB,QAAgB,EAChB,QAAuB,EACvB,YAAiC;QAEjC,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;QAClC,MAAM,KAAK,GAAsB,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAe,EAAE,CAAC;QAEhC,sBAAsB;QACtB,MAAM,KAAK,GAA4B,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QACxD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAEvB,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YACnD,MAAM,IAAI,GAAG,QAAQ,KAAK,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,EAAG,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,EAAG,CAAC;YAChE,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,GAAG,IAAI,CAAC;YAE1B,mBAAmB;YACnB,IAAI,YAAY,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACnE,GAAG,CAAC,KAAK,CAAC,uBAAuB,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;gBAC5C,SAAS;YACX,CAAC;YAED,aAAa;YACb,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YAEpD,IAAI,WAAwB,CAAC;YAC7B,IAAI,CAAC;gBACH,WAAW,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YACxC,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,GAAG,CAAC,IAAI,CAAC,2BAA2B,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;gBACnE,OAAO,EAAE,CAAC;gBACV,SAAS;YACX,CAAC;YAED,OAAO,EAAE,CAAC;YAEV,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;gBACtB,GAAG,CAAC,IAAI,CAAC,sBAAsB,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAAC,CAAC;gBACpE,SAAS;YACX,CAAC;YAED,KAAK,CAAC,IAAI,CAAC;gBACT,GAAG,EAAE,WAAW,CAAC,GAAG;gBACpB,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,QAAQ,EAAE,WAAW,CAAC,QAAQ;gBAC9B,KAAK;aACN,CAAC,CAAC;YAEH,+BAA+B;YAC/B,IAAI,KAAK,GAAG,QAAQ,EAAE,CAAC;gBACrB,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,WAAW,CAAC,KAAK,EAAE,UAAU,EAAE,OAAO,EAAE,KAAK,CAAC,gBAAgB,EAAE,KAAK,CAAC,gBAAgB,EAAE,YAAY,CAAC,CAAC;gBAExI,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;oBAC5B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;oBAClB,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC;gBAChC,CAAC;gBAED,IAAI,KAAK,CAAC,aAAa,EAAE,CAAC;oBACxB,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;wBACrC,QAAQ,CAAC,IAAI,CAAC,EAA
E,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;oBACzC,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,0FAA0F;QAC1F,OAAO;YACL,KAAK;YACL,WAAW,EAAE,OAAO,CAAC,IAAI;YACzB,OAAO,EAAE,KAAK,CAAC,MAAM;YACrB,GAAG,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACpD,CAAC;IACJ,CAAC;IAEO,WAAW,CACjB,KAAe,EACf,UAAkB,EAClB,OAAoB,EACpB,eAAqC,EACrC,eAAqC,EACrC,YAAiC;QAEjC,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE;YACrC,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;gBAC7B,IAAI,MAAM,CAAC,MAAM,KAAK,UAAU;oBAAE,OAAO,KAAK,CAAC;gBAC/C,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC;oBAAE,OAAO,KAAK,CAAC;gBACpC,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,eAAe,EAAE,eAAe,CAAC;oBAAE,OAAO,KAAK,CAAC;gBAC3E,IAAI,YAAY,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC;oBAAE,OAAO,KAAK,CAAC;gBAC3E,OAAO,IAAI,CAAC;YACd,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,0DAA0D;QAC1D,OAAO,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAC5B,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAClC,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAClC,OAAO,IAAI,GAAG,IAAI,CAAC;QACrB,CAAC,CAAC,CAAC;IACL,CAAC;IAED,KAAK,CAAC,YAAY,CAChB,KAAiB,EACjB,UAAkB,EAClB,QAAgB,EAChB,YAAiC;QAEjC,kEAAkE;QAClE,IAAI,WAAW,GAAG,MAAM,IAAI,CAAC,mBAAmB,CAAC,UAAU,EAAE,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAEpF,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,GAAG,CAAC,IAAI,CAAC,uCAAuC,CAAC,CAAC;YAClD,OAAO,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,UAAU,EAAE,KAAK,CAAC,SAAS,IAAI,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,YAAY,CAAC,CAAC;QACrG,CAAC;QAED,qBAAqB;QACrB,WAAW,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CACvC,eAAe,CAAC,GAAG,EAAE,KAAK,CAAC,gBAAgB,EAAE,KAAK,CAAC,gBAAgB,CAAC,CACrE,CAAC;QAEF,MAAM,UAAU,GAAG,WAAW,CAAC,MAAM,CAAC;QACtC,MAAM,OAAO,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;QAC/C,MAAM,KAAK,GAAsB,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAe,EAAE,CAAC;QAEhC,KAAK,MAAM,GAAG,IAAI,OAAO,EAAE,CAAC;YAC1B,IAAI,KAAK,CAAC,MAAM,IAAI,QAAQ;gBAAE,MAAM;YAEpC,
IAAI,YAAY,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;gBAAE,SAAS;YAE7E,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YAEpD,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;gBACvC,OAAO,EAAE,CAAC;gBAEV,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;oBAClB,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC;oBAE1F,IAAI,KAAK,CAAC,aAAa,EAAE,CAAC;wBACxB,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;4BAChC,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;wBACzC,CAAC;oBACH,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,EAAE,CAAC;gBACV,GAAG,CAAC,IAAI,CAAC,sBAAsB,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAChE,CAAC;QACH,CAAC;QAED,OAAO;YACL,KAAK;YACL,WAAW,EAAE,UAAU;YACvB,OAAO,EAAE,KAAK,CAAC,MAAM;YACrB,GAAG,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACpD,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,mBAAmB,CAAC,MAAc,EAAE,SAAoC;QACpF,MAAM,gBAAgB,GAAa,EAAE,CAAC;QAEtC,2EAA2E;QAC3E,IAAI,SAAS,EAAE,CAAC;YACd,gBAAgB,CAAC,IAAI,CAAC,GAAG,2BAA2B,CAAC,SAAS,CAAC,CAAC,CAAC;QACnE,CAAC;QAED,uBAAuB;QACvB,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAClC,gBAAgB,CAAC,IAAI,CAAC,GAAG,MAAM,cAAc,CAAC,CAAC;QACjD,CAAC;QAED,MAAM,OAAO,GAAa,EAAE,CAAC;QAE7B,KAAK,MAAM,UAAU,IAAI,gBAAgB,EAAE,CAAC;YAC1C,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC;gBACjD,IAAI,MAAM,CAAC,UAAU,KAAK,GAAG;oBAAE,SAAS;gBAExC,gCAAgC;gBAChC,MAAM,SAAS,GAAG,iBAAiB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;gBACjD,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACzB,yBAAyB;oBACzB,KAAK,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;wBAC/B,IAAI,CAAC;4BACH,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;4BAChD,IAAI,SAAS,CAAC,UAAU,KAAK,GAAG,EAAE,CAAC;gCACjC,OAAO,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC;4BAChD,CAAC;wBACH,CAAC;wBAAC,MAAM,CAAC;4BACP,2BAA2B;wBAC7B,CAAC;oBACH,CAAC;gBACH,CAAC;qBAAM,C
AAC;oBACN,OAAO,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;gBAC7C,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,8BAA8B;YAChC,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;CACF;AAED,MAAM,iBAAiB,GAAG,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC;AAExE,SAAS,SAAS,CAAC,GAAW;IAC5B,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;IACjD,OAAO,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;AACvD,CAAC"}
@@ -0,0 +1,15 @@
1
/** Splits markdown into trimmed, non-empty blocks: one per heading line when headings exist, otherwise one per blank-line-separated paragraph. Blank input yields an empty array. */
+ export declare function splitIntoBlocks(markdown: string): string[];
2
/** Lower-cases the text, collapses every whitespace run to a single space and trims the ends. */
+ export declare function normalizeBlockText(text: string): string;
3
/** A page handed to `deduplicatePages`. */
+ interface PageInput {
4
+ url: string;
5
+ markdown: string;
6
+ }
7
/** A page returned by `deduplicatePages`; same shape as `PageInput`. */
+ interface PageOutput {
8
+ url: string;
9
+ markdown: string;
10
+ }
11
/** Removes blocks whose normalized hash occurs in more than half of the pages, or is already stored for `domain`. With fewer than two pages the input is returned as copies, untouched. When `domain` is given the stored hash set for it is rewritten. */
+ export declare function deduplicatePages(pages: PageInput[], domain?: string): PageOutput[];
12
/** Reads the block hashes recorded for `domain` from the `domain_boilerplate` table. */
+ export declare function getStoredBoilerplate(domain: string): string[];
13
/** Replaces the rows recorded for `domain` in `domain_boilerplate` with `hashes`, inside one transaction. */
+ export declare function storeBoilerplate(domain: string, hashes: string[]): void;
14
+ export {};
15
+ //# sourceMappingURL=dedup.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dedup.d.ts","sourceRoot":"","sources":["../../src/crawl/dedup.ts"],"names":[],"mappings":"AAGA,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,EAAE,CA2B1D;AAED,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEvD;AAMD,UAAU,SAAS;IACjB,GAAG,EAAE,MAAM,CAAC;IACZ,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,UAAU,UAAU;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,SAAS,EAAE,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,UAAU,EAAE,CA+ClF;AAED,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAI7D;AAED,wBAAgB,gBAAgB,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,IAAI,CAavE"}
@@ -0,0 +1,93 @@
1
+ import { createHash } from 'node:crypto';
2
+ import { getDatabase } from '../cache/db.js';
3
/**
 * Splits markdown into blocks for boilerplate detection.
 *
 * - Blank input yields `[]`.
 * - With no headings, the text is split on runs of blank lines.
 * - Otherwise each ATX heading (`#` to `######` followed by whitespace) starts
 *   a block that runs up to the next heading of any level. Text before the
 *   first heading is kept as its own leading block, and `#` lines inside
 *   fenced code blocks are not treated as headings.
 *
 * @param {string} markdown Markdown source.
 * @returns {string[]} Trimmed, non-empty blocks in document order.
 */
export function splitIntoBlocks(markdown) {
    if (!markdown.trim())
        return [];
    const lines = markdown.split('\n');
    const headingPattern = /^#{1,6}\s+/;
    // Up to three spaces of indent, then a run of 3+ backticks or tildes.
    const fencePattern = /^ {0,3}(`{3,}|~{3,})([\s\S]*)$/;
    const headingStarts = [];
    // While inside a fenced code block: the marker character and length that opened it.
    let fence = null;
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i];
        const fenceMatch = fencePattern.exec(line);
        if (fence === null) {
            // A backtick fence cannot carry backticks in its info string;
            // such a line is inline code, not a fence opener.
            const opensFence = fenceMatch !== null &&
                !(fenceMatch[1][0] === '`' && fenceMatch[2].includes('`'));
            if (opensFence) {
                fence = { char: fenceMatch[1][0], length: fenceMatch[1].length };
            }
            else if (headingPattern.test(line)) {
                headingStarts.push(i);
            }
        }
        else if (fenceMatch !== null &&
            fenceMatch[1][0] === fence.char &&
            fenceMatch[1].length >= fence.length &&
            fenceMatch[2].trim() === '') {
            // Closing fence: same character, at least as long, nothing after it.
            fence = null;
        }
    }
    // If no headings, split by double-newline (paragraph blocks).
    if (headingStarts.length === 0) {
        return markdown.split(/\n\n+/).map((b) => b.trim()).filter(Boolean);
    }
    // Start the first block at line 0 so content above the first heading is not lost.
    const starts = headingStarts[0] === 0 ? headingStarts : [0, ...headingStarts];
    const blocks = [];
    for (let i = 0; i < starts.length; i++) {
        const end = i + 1 < starts.length ? starts[i + 1] : lines.length;
        const block = lines.slice(starts[i], end).join('\n').trim();
        if (block)
            blocks.push(block);
    }
    return blocks;
}
27
/**
 * Canonicalizes block text so that blocks differing only in letter case or
 * whitespace compare equal.
 *
 * @param {string} text Raw block text.
 * @returns {string} Lower-cased text with whitespace runs collapsed to one space and the ends trimmed.
 */
export function normalizeBlockText(text) {
    const lowered = text.toLowerCase();
    const collapsed = lowered.replace(/\s+/g, ' ');
    return collapsed.trim();
}
30
/**
 * Fingerprints a block: SHA-256 of its normalized text, as lower-case hex.
 *
 * @param {string} text Raw block text.
 * @returns {string} Hex digest.
 */
function hashBlock(text) {
    const normalized = normalizeBlockText(text);
    const hasher = createHash('sha256');
    hasher.update(normalized);
    return hasher.digest('hex');
}
33
/**
 * Strips boilerplate blocks (navigation, footers, repeated notices) from a set
 * of pages.
 *
 * A block is boilerplate when its normalized hash appears in more than half of
 * the pages, or when it is already recorded for `domain`. With fewer than two
 * pages nothing can be compared, so the pages are returned as copies,
 * untouched.
 *
 * Side effect: when `domain` is given, the stored hash set for it is rewritten
 * with the previous hashes plus the newly detected ones.
 *
 * @param pages Pages exposing `url` and `markdown`.
 * @param domain Optional key under which boilerplate hashes are persisted.
 * @returns One `{ url, markdown }` per input page, in the same order; the
 *   surviving blocks are joined with a blank line.
 */
export function deduplicatePages(pages, domain) {
    if (pages.length <= 1)
        return pages.map((p) => ({ url: p.url, markdown: p.markdown }));
    // Seed with the hashes already known for this domain.
    const boilerplateHashes = new Set(domain ? getStoredBoilerplate(domain) : []);
    // Hash every block once and keep the hash next to its text, so the
    // filtering pass below does not have to run SHA-256 a second time.
    const pageBlocks = pages.map((page) => ({
        url: page.url,
        blocks: splitIntoBlocks(page.markdown).map((text) => ({ text, hash: hashBlock(text) })),
    }));
    // Count the number of PAGES each hash appears in (repeats within one page count once).
    const hashPageCount = new Map();
    for (const page of pageBlocks) {
        const hashesOnPage = new Set(page.blocks.map((block) => block.hash));
        for (const hash of hashesOnPage) {
            hashPageCount.set(hash, (hashPageCount.get(hash) ?? 0) + 1);
        }
    }
    // Hashes present in more than 50% of the pages are boilerplate.
    const threshold = pages.length / 2;
    for (const [hash, count] of hashPageCount) {
        if (count > threshold) {
            boilerplateHashes.add(hash);
        }
    }
    if (domain) {
        storeBoilerplate(domain, Array.from(boilerplateHashes));
    }
    return pageBlocks.map((page) => ({
        url: page.url,
        markdown: page.blocks
            .filter((block) => !boilerplateHashes.has(block.hash))
            .map((block) => block.text)
            .join('\n\n'),
    }));
}
76
/**
 * Loads the boilerplate block hashes recorded for a domain.
 *
 * @param {string} domain Domain key in the `domain_boilerplate` table.
 * @returns {string[]} The `block_hash` value of every matching row.
 */
export function getStoredBoilerplate(domain) {
    const statement = getDatabase().prepare('SELECT block_hash FROM domain_boilerplate WHERE domain = ?');
    const hashes = [];
    for (const row of statement.all(domain)) {
        hashes.push(row.block_hash);
    }
    return hashes;
}
81
/**
 * Replaces the boilerplate hashes recorded for a domain.
 *
 * Inside a single transaction, the domain's existing rows are deleted and one
 * row per hash is inserted (`sample_text` is stored as NULL; duplicates are
 * ignored by the INSERT OR IGNORE).
 *
 * @param {string} domain Domain key in the `domain_boilerplate` table.
 * @param {string[]} hashes Block hashes to record.
 */
export function storeBoilerplate(domain, hashes) {
    const database = getDatabase();
    const clearDomain = database.prepare('DELETE FROM domain_boilerplate WHERE domain = ?');
    const addHash = database.prepare('INSERT OR IGNORE INTO domain_boilerplate (domain, block_hash, sample_text) VALUES (?, ?, ?)');
    const replaceAll = database.transaction((items) => {
        clearDomain.run(domain);
        items.forEach((hash) => {
            addHash.run(domain, hash, null);
        });
    });
    replaceAll(hashes);
}
93
+ //# sourceMappingURL=dedup.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dedup.js","sourceRoot":"","sources":["../../src/crawl/dedup.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAE7C,MAAM,UAAU,eAAe,CAAC,QAAgB;IAC9C,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAEhC,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,cAAc,GAAyC,EAAE,CAAC;IAEhE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC;QAC7C,IAAI,KAAK,EAAE,CAAC;YACV,cAAc,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAC9D,CAAC;IACH,CAAC;IAED,6DAA6D;IAC7D,IAAI,cAAc,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAChC,OAAO,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IACtE,CAAC;IAED,kGAAkG;IAClG,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/C,MAAM,KAAK,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC;QACxC,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC;QACzF,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;IACzD,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;AAChC,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,IAAY;IAC7C,OAAO,IAAI,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;AACxD,CAAC;AAED,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,kBAAkB,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAC7E,CAAC;AAYD,MAAM,UAAU,gBAAgB,CAAC,KAAkB,EAAE,MAAe;IAClE,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,GAAG,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;IAEvF,qDAAqD;IACrD,MAAM,YAAY,GAAG,MAAM,CAAC,CAAC,CAAC,oBAAoB,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAChE,MAAM,iBA
AiB,GAAG,IAAI,GAAG,CAAS,YAAY,CAAC,CAAC;IAExD,4CAA4C;IAC5C,MAAM,UAAU,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACtC,GAAG,EAAE,IAAI,CAAC,GAAG;QACb,MAAM,EAAE,eAAe,CAAC,IAAI,CAAC,QAAQ,CAAC;KACvC,CAAC,CAAC,CAAC;IAEJ,kDAAkD;IAClD,MAAM,aAAa,GAAG,IAAI,GAAG,EAAkB,CAAC;IAChD,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,IAAI,GAAG,EAAU,CAAC;QACrC,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChC,MAAM,CAAC,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC;YAC3B,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;gBACvB,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;gBAClB,aAAa,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YACxD,CAAC;QACH,CAAC;IACH,CAAC;IAED,wDAAwD;IACxD,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;IACnC,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,aAAa,EAAE,CAAC;QAC1C,IAAI,KAAK,GAAG,SAAS,EAAE,CAAC;YACtB,iBAAiB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IAED,mDAAmD;IACnD,IAAI,MAAM,EAAE,CAAC;QACX,gBAAgB,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC,CAAC;IAC1D,CAAC;IAED,0CAA0C;IAC1C,OAAO,UAAU,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,iBAAiB,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACzF,OAAO;YACL,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,QAAQ,EAAE,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC;SAChC,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,UAAU,oBAAoB,CAAC,MAAc;IACjD,MAAM,EAAE,GAAG,WAAW,EAAE,CAAC;IACzB,MAAM,IAAI,GAAG,EAAE,CAAC,OAAO,CAAC,4DAA4D,CAAC,CAAC,GAAG,CAAC,MAAM,CAA6B,CAAC;IAC9H,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC;AACrC,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,MAAc,EAAE,MAAgB;IAC/D,MAAM,EAAE,GAAG,WAAW,EAAE,CAAC;IACzB,MAAM,GAAG,GAAG,EAAE,CAAC,OAAO,CAAC,iDAAiD,CAAC,CAAC;IAC1E,MAAM,MAAM,GAAG,EAAE,CAAC,OAAO,CACvB,6FAA6F,CAC9F,CAAC;IACF,MAAM,EAAE,GAAG,EAAE,CAAC,WAAW,CAAC,CAAC,KAAe,EAAE,EAAE;QAC5C,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QAChB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,CAAC,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC;QA
CjC,CAAC;IACH,CAAC,CAAC,CAAC;IACH,EAAE,CAAC,MAAM,CAAC,CAAC;AACb,CAAC"}
@@ -0,0 +1,17 @@
1
+ import type { MapOutput } from '../types.js';
2
/** Options accepted by `mapUrls`. */
interface MapInput {
    /** Seed URL; its origin defines the site being mapped. */
    url: string;
    /** Maximum link depth to traverse from the seed (`mapUrls` defaults this to 3). */
    max_depth?: number;
    /** Upper bound on the number of URLs collected (`mapUrls` defaults this to 200). */
    max_pages?: number;
    /** Patterns handed to `matchesPatterns` as the include list. */
    include_patterns?: string[];
    /** Patterns handed to `matchesPatterns` as the exclude list. */
    exclude_patterns?: string[];
}
9
/**
 * Fetch callback injected into `mapUrls`.
 *
 * It is called for HTML pages as well as for `robots.txt` and sitemap documents; in every
 * case the response body is read from `html`.
 * NOTE(review): `finalUrl` is presumably the URL after redirects and `statusCode` the HTTP
 * status — confirm against the concrete fetcher.
 */
export type LightFetchFn = (url: string) => Promise<{ html: string; finalUrl: string; statusCode: number }>;
14
/**
 * Returns the distinct same-origin links found in `html`, resolved against `origin`
 * and with fragments removed.
 */
export declare function extractLinks(html: string, origin: string): string[];

/**
 * Maps a site starting from `input.url`, combining sitemap discovery with a
 * breadth-first link crawl performed through `fetchFn`.
 */
export declare function mapUrls(input: MapInput, fetchFn: LightFetchFn): Promise<MapOutput>;
16
+ export {};
17
+ //# sourceMappingURL=mapper.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"mapper.d.ts","sourceRoot":"","sources":["../../src/crawl/mapper.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAI7C,UAAU,QAAQ;IAChB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC5B,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;CAC7B;AAED,MAAM,MAAM,YAAY,GAAG,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAC;IAAC,UAAU,EAAE,MAAM,CAAA;CAAE,CAAC,CAAC;AAI5G,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CA+CnE;AAED,wBAAsB,OAAO,CAAC,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,SAAS,CAAC,CAqFxF"}
@@ -0,0 +1,178 @@
1
+ import { parseHTML } from 'linkedom';
2
+ import { matchesPatterns } from './url-utils.js';
3
+ import { parseSitemap, parseSitemapIndex, extractSitemapUrlFromRobots } from './sitemap.js';
4
+ import { createLogger } from '../logger.js';
5
// Module logger, registered under the 'crawl' name.
const log = createLogger('crawl');

// href prefixes that can never lead to a crawlable page; compared against the lower-cased href.
const IGNORED_PROTOCOLS = [
    'javascript:',
    'mailto:',
    'tel:',
    'data:',
    'blob:',
    'ftp:',
];
7
/**
 * Extracts the distinct same-origin links from an HTML document.
 *
 * Each `<a href>` is resolved against `origin`. Fragment-only hrefs, hrefs that start with
 * one of IGNORED_PROTOCOLS, and links whose resolved origin differs from that of `origin`
 * are dropped. Fragments are stripped before de-duplication.
 *
 * @param html   Raw markup; empty or whitespace-only input yields `[]`.
 * @param origin Absolute URL used both as resolution base and as the origin filter.
 * @returns Absolute URLs in first-seen order, or `[]` when the markup (or `origin`)
 *          cannot be parsed.
 */
export function extractLinks(html, origin) {
    if (!html || !html.trim()) {
        return [];
    }
    try {
        const { document: doc } = parseHTML(html);
        const anchorNodes = doc.querySelectorAll('a[href]');
        const allowedOrigin = new URL(origin).origin;
        // A Set keeps insertion order, so it doubles as the ordered result list.
        const unique = new Set();
        Array.from(anchorNodes).forEach((node) => {
            const raw = node.getAttribute('href');
            const candidate = raw ? raw.trim() : '';
            // Nothing to resolve, or a fragment-only link.
            if (!candidate || candidate.startsWith('#')) {
                return;
            }
            // Non-http schemes are rejected up front.
            const lowered = candidate.toLowerCase();
            if (IGNORED_PROTOCOLS.some((scheme) => lowered.startsWith(scheme))) {
                return;
            }
            let target;
            try {
                target = new URL(candidate, origin);
            }
            catch {
                log.debug('Failed to resolve URL', { href: candidate, origin });
                return;
            }
            if (target.origin !== allowedOrigin) {
                return;
            }
            // Drop the fragment; path and query are kept.
            target.hash = '';
            unique.add(target.href);
        });
        return [...unique];
    }
    catch (err) {
        log.debug('Failed to parse HTML for link extraction', { error: String(err) });
        return [];
    }
}
53
/**
 * Picks the base URL used to resolve hrefs found on a fetched page.
 *
 * Prefers the post-redirect URL, then the requested URL, provided the candidate parses and
 * sits on the crawl origin. Otherwise it falls back to the bare origin, so the same-origin
 * filter applied by `extractLinks` always stays tied to the seed site.
 */
function resolveLinkBase(origin, requestedUrl, finalUrl) {
    for (const candidate of [finalUrl, requestedUrl]) {
        if (typeof candidate !== 'string' || !candidate) {
            continue;
        }
        try {
            if (new URL(candidate).origin === origin) {
                return candidate;
            }
        }
        catch {
            // Not an absolute URL; try the next candidate.
        }
    }
    return origin;
}

/**
 * Discovers the URLs of a site, starting from `input.url`.
 *
 * Phase 1 seeds the result from sitemaps (best effort). Phase 2 walks links breadth-first,
 * fetching pages through `fetchFn`, until the queue is empty or `max_pages` URLs have been
 * collected. Only URLs accepted by `matchesPatterns` are recorded.
 *
 * @param input   Seed URL plus optional limits (`max_depth` defaults to 3, `max_pages`
 *                to 200) and include/exclude patterns.
 * @param fetchFn Callback that fetches a URL and resolves with its body in `html`.
 * @returns `urls`, `total_found` and `sitemap_found`; `error` is set when the seed URL is
 *          invalid or the seed page itself could not be fetched.
 */
export async function mapUrls(input, fetchFn) {
    const maxDepth = input.max_depth ?? 3;
    const maxPages = input.max_pages ?? 200;
    let origin;
    try {
        origin = new URL(input.url).origin;
    }
    catch (err) {
        return {
            urls: [],
            total_found: 0,
            sitemap_found: false,
            error: `Invalid seed URL: ${String(err)}`,
        };
    }
    // `discovered` is the result set; `queued` guards against enqueueing a URL twice.
    const discovered = new Set([input.url]);
    const queued = new Set([input.url]);
    const queue = [{ url: input.url, depth: 0 }];
    let sitemapFound = false;
    // Phase 1: Sitemap discovery (best-effort, errors are non-fatal)
    try {
        const sitemapUrls = await discoverSitemapUrls(origin, fetchFn);
        if (sitemapUrls.length > 0) {
            sitemapFound = true;
            for (const sitemapUrl of sitemapUrls) {
                if (discovered.size >= maxPages)
                    break;
                if (matchesPatterns(sitemapUrl, input.include_patterns, input.exclude_patterns)) {
                    discovered.add(sitemapUrl);
                    // Also queue sitemap URLs for BFS traversal so their links are explored
                    if (!queued.has(sitemapUrl)) {
                        queued.add(sitemapUrl);
                        queue.push({ url: sitemapUrl, depth: 0 });
                    }
                }
            }
        }
    }
    catch (err) {
        log.debug('Sitemap discovery failed, continuing with BFS only', { error: String(err) });
    }
    // Phase 2: BFS link traversal
    let seedError;
    while (queue.length > 0 && discovered.size < maxPages) {
        const current = queue.shift();
        if (current.depth > maxDepth)
            continue;
        try {
            const { html, finalUrl } = await fetchFn(current.url);
            // Resolve hrefs against the page they were found on, not the site root:
            // a document-relative link such as "next.html" on /docs/guide/ must become
            // /docs/guide/next.html rather than /next.html.
            const base = resolveLinkBase(origin, current.url, finalUrl);
            const links = extractLinks(html, base);
            for (const link of links) {
                if (discovered.size >= maxPages)
                    break;
                if (discovered.has(link))
                    continue;
                if (!matchesPatterns(link, input.include_patterns, input.exclude_patterns))
                    continue;
                discovered.add(link);
                // Only queue for further traversal if we haven't hit max depth
                if (current.depth + 1 <= maxDepth && !queued.has(link)) {
                    queued.add(link);
                    queue.push({ url: link, depth: current.depth + 1 });
                }
            }
        }
        catch (err) {
            // Only a failure of the seed page is surfaced to the caller; other pages are skipped.
            if (current.url === input.url && current.depth === 0) {
                seedError = String(err);
                log.warn('Seed URL fetch failed during map', { url: current.url, error: String(err) });
            }
            else {
                log.debug('Child page fetch failed during map, skipping', { url: current.url, error: String(err) });
            }
        }
    }
    const urls = Array.from(discovered);
    return {
        urls,
        total_found: urls.length,
        sitemap_found: sitemapFound,
        ...(seedError !== undefined ? { error: seedError } : {}),
    };
}
136
/**
 * Collects page URLs advertised by the site's sitemaps.
 *
 * Sitemap locations come from `robots.txt` when it can be fetched and lists any; otherwise
 * `${origin}/sitemap.xml` is tried. A sitemap index is expanded one level: each listed
 * sub-sitemap is fetched and parsed. Every fetch or parse failure is logged at debug level
 * and skipped.
 *
 * @param origin  Origin of the site being mapped.
 * @param fetchFn Callback that fetches a URL and resolves with its body in `html`.
 * @returns All URLs parsed from the reachable sitemaps (possibly empty, duplicates kept).
 */
async function discoverSitemapUrls(origin, fetchFn) {
    let locations = [];
    // robots.txt may advertise one or more Sitemap locations.
    try {
        const robots = await fetchFn(`${origin}/robots.txt`);
        locations = [...extractSitemapUrlFromRobots(robots.html)];
    }
    catch {
        log.debug('robots.txt not found, trying default sitemap location');
    }
    // Nothing advertised: fall back to the conventional location.
    if (locations.length === 0) {
        locations = [`${origin}/sitemap.xml`];
    }
    const found = [];
    for (const location of locations) {
        try {
            const { html: xml } = await fetchFn(location);
            const children = parseSitemapIndex(xml);
            const isIndex = children.length > 0;
            if (!isIndex) {
                // A plain sitemap: its entries are the page URLs.
                found.push(...parseSitemap(xml));
                continue;
            }
            // A sitemap index: pull the entries out of every sub-sitemap it lists.
            for (const child of children) {
                try {
                    const { html: childXml } = await fetchFn(child);
                    found.push(...parseSitemap(childXml));
                }
                catch {
                    log.debug('Failed to fetch sub-sitemap', { url: child });
                }
            }
        }
        catch {
            log.debug('Sitemap fetch failed', { url: location });
        }
    }
    return found;
}
178
+ //# sourceMappingURL=mapper.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"mapper.js","sourceRoot":"","sources":["../../src/crawl/mapper.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,YAAY,EAAE,iBAAiB,EAAE,2BAA2B,EAAE,MAAM,cAAc,CAAC;AAC5F,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAG5C,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;AAYlC,MAAM,iBAAiB,GAAG,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;AAEvF,MAAM,UAAU,YAAY,CAAC,IAAY,EAAE,MAAc;IACvD,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAErC,IAAI,CAAC;QACH,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAC1C,MAAM,OAAO,GAAG,GAAG,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;QAChD,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;QAC/B,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;QAE5C,KAAK,MAAM,MAAM,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;YACzC,MAAM,IAAI,GAAG,MAAM,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YACzC,IAAI,CAAC,IAAI;gBAAE,SAAS;YAEpB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;YAC5B,IAAI,CAAC,OAAO;gBAAE,SAAS;YAEvB,2BAA2B;YAC3B,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YAEtC,0BAA0B;YAC1B,IAAI,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;gBAAE,SAAS;YAEjF,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;gBAE1C,oBAAoB;gBACpB,IAAI,QAAQ,CAAC,MAAM,KAAK,YAAY;oBAAE,SAAS;gBAE/C,oCAAoC;gBACpC,QAAQ,CAAC,IAAI,GAAG,EAAE,CAAC;gBACnB,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,CAAC;gBAEjC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;oBAC1B,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;oBACrB,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBACzB,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,GAAG,CAAC,KAAK,CAAC,uBAAuB,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC;YAChE,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,GAAG,CAAC,KAAK,CAAC,0CAA0C,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAC9E,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,KAAe,EAAE,OAAq
B;IAClE,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,IAAI,CAAC,CAAC;IACtC,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,IAAI,GAAG,CAAC;IAExC,IAAI,MAAc,CAAC;IACnB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC;IACrC,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO;YACL,IAAI,EAAE,EAAE;YACR,WAAW,EAAE,CAAC;YACd,aAAa,EAAE,KAAK;YACpB,KAAK,EAAE,qBAAqB,MAAM,CAAC,GAAG,CAAC,EAAE;SAC1C,CAAC;IACJ,CAAC;IAED,MAAM,UAAU,GAAG,IAAI,GAAG,CAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;IAChD,MAAM,MAAM,GAAG,IAAI,GAAG,CAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;IAC5C,MAAM,KAAK,GAA0C,CAAC,EAAE,GAAG,EAAE,KAAK,CAAC,GAAG,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC;IACpF,IAAI,YAAY,GAAG,KAAK,CAAC;IAEzB,iEAAiE;IACjE,IAAI,CAAC;QACH,MAAM,WAAW,GAAG,MAAM,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC/D,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3B,YAAY,GAAG,IAAI,CAAC;YACpB,KAAK,MAAM,UAAU,IAAI,WAAW,EAAE,CAAC;gBACrC,IAAI,UAAU,CAAC,IAAI,IAAI,QAAQ;oBAAE,MAAM;gBACvC,IAAI,eAAe,CAAC,UAAU,EAAE,KAAK,CAAC,gBAAgB,EAAE,KAAK,CAAC,gBAAgB,CAAC,EAAE,CAAC;oBAChF,UAAU,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;oBAC3B,wEAAwE;oBACxE,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;wBAC5B,MAAM,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;wBACvB,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,UAAU,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC;oBAC5C,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,GAAG,CAAC,KAAK,CAAC,oDAAoD,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAC1F,CAAC;IAED,8BAA8B;IAC9B,IAAI,SAA6B,CAAC;IAElC,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,UAAU,CAAC,IAAI,GAAG,QAAQ,EAAE,CAAC;QACtD,MAAM,OAAO,GAAG,KAAK,CAAC,KAAK,EAAG,CAAC;QAE/B,IAAI,OAAO,CAAC,KAAK,GAAG,QAAQ;YAAE,SAAS;QAEvC,IAAI,CAAC;YACH,MAAM,EAAE,IAAI,EAAE,GAAG,MAAM,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YAC5C,MAAM,KAAK,GAAG,YAAY,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAEzC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;gBACzB,IAAI,UAAU,CAAC,IAAI,IAAI,QAAQ;oBAAE,MAAM;gBACvC,IAAI,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC;oBAAE,SAAS;gBACnC,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,KAAK,CAAC,gBAAgB,EAAE,KAAK,CAAC,gBAAgB,CAAC;oBAAE,SAAS;gBAErF,UAAU,CAAC,GAAG,CAAC
,IAAI,CAAC,CAAC;gBAErB,+DAA+D;gBAC/D,IAAI,OAAO,CAAC,KAAK,GAAG,CAAC,IAAI,QAAQ,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;oBACvD,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;oBACjB,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;gBACtD,CAAC;YACH,CAAC;QACH,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,IAAI,OAAO,CAAC,GAAG,KAAK,KAAK,CAAC,GAAG,IAAI,OAAO,CAAC,KAAK,KAAK,CAAC,EAAE,CAAC;gBACrD,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC;gBACxB,GAAG,CAAC,IAAI,CAAC,kCAAkC,EAAE,EAAE,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YACzF,CAAC;iBAAM,CAAC;gBACN,GAAG,CAAC,KAAK,CAAC,8CAA8C,EAAE,EAAE,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YACtG,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAEpC,OAAO;QACL,IAAI;QACJ,WAAW,EAAE,IAAI,CAAC,MAAM;QACxB,aAAa,EAAE,YAAY;QAC3B,GAAG,CAAC,SAAS,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KACzD,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,mBAAmB,CAAC,MAAc,EAAE,OAAqB;IACtE,MAAM,gBAAgB,GAAa,EAAE,CAAC;IAEtC,8CAA8C;IAC9C,IAAI,CAAC;QACH,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,GAAG,MAAM,OAAO,CAAC,GAAG,MAAM,aAAa,CAAC,CAAC;QAClE,MAAM,UAAU,GAAG,2BAA2B,CAAC,SAAS,CAAC,CAAC;QAC1D,gBAAgB,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;IACvC,CAAC;IAAC,MAAM,CAAC;QACP,GAAG,CAAC,KAAK,CAAC,uDAAuD,CAAC,CAAC;IACrE,CAAC;IAED,mCAAmC;IACnC,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAClC,gBAAgB,CAAC,IAAI,CAAC,GAAG,MAAM,cAAc,CAAC,CAAC;IACjD,CAAC;IAED,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,KAAK,MAAM,UAAU,IAAI,gBAAgB,EAAE,CAAC;QAC1C,IAAI,CAAC;YACH,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;YAEvD,wDAAwD;YACxD,MAAM,SAAS,GAAG,iBAAiB,CAAC,UAAU,CAAC,CAAC;YAChD,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzB,KAAK,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;oBAC/B,IAAI,CAAC;wBACH,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,MAAM,OAAO,CAAC,MAAM,CAAC,CAAC;wBAC/C,OAAO,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC;oBACxC,CAAC;oBAAC,MAAM,CAAC;wBACP,GAAG,CAAC,KAAK,CAAC,6BAA6B,EAAE,EAA
E,GAAG,EAAE,MAAM,EAAE,CAAC,CAAC;oBAC5D,CAAC;gBACH,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC;YAC5C,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,GAAG,CAAC,KAAK,CAAC,sBAAsB,EAAE,EAAE,GAAG,EAAE,UAAU,EAAE,CAAC,CAAC;QACzD,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
@@ -0,0 +1,10 @@
1
/**
 * Declaration of the crawler's rate limiter.
 *
 * Only signatures are visible here; the behavioural notes below are assumptions to be
 * confirmed against the implementation in src/crawl/rate-limiter.ts.
 */
export declare class RateLimiter {
    /** NOTE(review): presumably records a robots.txt crawl delay, in seconds, for `domain` — confirm. */
    setRobotsCrawlDelay(domain: string, delaySeconds: number): void;
    /**
     * Resolves with a zero-argument callback.
     * NOTE(review): presumably the caller invokes it once the request for `url` has
     * finished, to release its slot — confirm.
     */
    acquire(url: string): Promise<() => void>;
    // Internal state and helpers; their shapes are not exposed by the declaration.
    private domains;
    private robotsDelays;
    private getOrCreateState;
    private startRequest;
    private processQueue;
}
10
+ //# sourceMappingURL=rate-limiter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"rate-limiter.d.ts","sourceRoot":"","sources":["../../src/crawl/rate-limiter.ts"],"names":[],"mappings":"AAWA,qBAAa,WAAW;IACtB,OAAO,CAAC,OAAO,CAAkC;IACjD,OAAO,CAAC,YAAY,CAA6B;IAEjD,mBAAmB,CAAC,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,GAAG,IAAI;IAIzD,OAAO,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,IAAI,CAAC;IAoB/C,OAAO,CAAC,gBAAgB;IA6BxB,OAAO,CAAC,YAAY;IAUpB,OAAO,CAAC,YAAY;CAarB"}