@staticn0va/wigolo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215) hide show
  1. package/LICENSE +74 -0
  2. package/README.md +272 -0
  3. package/dist/cache/db.d.ts +5 -0
  4. package/dist/cache/db.d.ts.map +1 -0
  5. package/dist/cache/db.js +97 -0
  6. package/dist/cache/db.js.map +1 -0
  7. package/dist/cache/store.d.ts +26 -0
  8. package/dist/cache/store.d.ts.map +1 -0
  9. package/dist/cache/store.js +214 -0
  10. package/dist/cache/store.js.map +1 -0
  11. package/dist/cli/daemon.d.ts +2 -0
  12. package/dist/cli/daemon.d.ts.map +1 -0
  13. package/dist/cli/daemon.js +5 -0
  14. package/dist/cli/daemon.js.map +1 -0
  15. package/dist/cli/health.d.ts +2 -0
  16. package/dist/cli/health.d.ts.map +1 -0
  17. package/dist/cli/health.js +5 -0
  18. package/dist/cli/health.js.map +1 -0
  19. package/dist/cli/index.d.ts +7 -0
  20. package/dist/cli/index.d.ts.map +1 -0
  21. package/dist/cli/index.js +9 -0
  22. package/dist/cli/index.js.map +1 -0
  23. package/dist/cli/warmup.d.ts +11 -0
  24. package/dist/cli/warmup.d.ts.map +1 -0
  25. package/dist/cli/warmup.js +107 -0
  26. package/dist/cli/warmup.js.map +1 -0
  27. package/dist/config.d.ts +41 -0
  28. package/dist/config.d.ts.map +1 -0
  29. package/dist/config.js +66 -0
  30. package/dist/config.js.map +1 -0
  31. package/dist/crawl/crawler.d.ts +18 -0
  32. package/dist/crawl/crawler.d.ts.map +1 -0
  33. package/dist/crawl/crawler.js +228 -0
  34. package/dist/crawl/crawler.js.map +1 -0
  35. package/dist/crawl/dedup.d.ts +15 -0
  36. package/dist/crawl/dedup.d.ts.map +1 -0
  37. package/dist/crawl/dedup.js +93 -0
  38. package/dist/crawl/dedup.js.map +1 -0
  39. package/dist/crawl/mapper.d.ts +17 -0
  40. package/dist/crawl/mapper.d.ts.map +1 -0
  41. package/dist/crawl/mapper.js +178 -0
  42. package/dist/crawl/mapper.js.map +1 -0
  43. package/dist/crawl/rate-limiter.d.ts +10 -0
  44. package/dist/crawl/rate-limiter.d.ts.map +1 -0
  45. package/dist/crawl/rate-limiter.js +72 -0
  46. package/dist/crawl/rate-limiter.js.map +1 -0
  47. package/dist/crawl/robots.d.ts +9 -0
  48. package/dist/crawl/robots.d.ts.map +1 -0
  49. package/dist/crawl/robots.js +63 -0
  50. package/dist/crawl/robots.js.map +1 -0
  51. package/dist/crawl/sitemap.d.ts +4 -0
  52. package/dist/crawl/sitemap.d.ts.map +1 -0
  53. package/dist/crawl/sitemap.js +38 -0
  54. package/dist/crawl/sitemap.js.map +1 -0
  55. package/dist/crawl/url-utils.d.ts +3 -0
  56. package/dist/crawl/url-utils.d.ts.map +1 -0
  57. package/dist/crawl/url-utils.js +41 -0
  58. package/dist/crawl/url-utils.js.map +1 -0
  59. package/dist/extraction/defuddle.d.ts +3 -0
  60. package/dist/extraction/defuddle.d.ts.map +1 -0
  61. package/dist/extraction/defuddle.js +26 -0
  62. package/dist/extraction/defuddle.js.map +1 -0
  63. package/dist/extraction/extract.d.ts +5 -0
  64. package/dist/extraction/extract.d.ts.map +1 -0
  65. package/dist/extraction/extract.js +83 -0
  66. package/dist/extraction/extract.js.map +1 -0
  67. package/dist/extraction/jsonld.d.ts +4 -0
  68. package/dist/extraction/jsonld.d.ts.map +1 -0
  69. package/dist/extraction/jsonld.js +64 -0
  70. package/dist/extraction/jsonld.js.map +1 -0
  71. package/dist/extraction/markdown.d.ts +10 -0
  72. package/dist/extraction/markdown.d.ts.map +1 -0
  73. package/dist/extraction/markdown.js +107 -0
  74. package/dist/extraction/markdown.js.map +1 -0
  75. package/dist/extraction/pipeline.d.ts +11 -0
  76. package/dist/extraction/pipeline.d.ts.map +1 -0
  77. package/dist/extraction/pipeline.js +95 -0
  78. package/dist/extraction/pipeline.js.map +1 -0
  79. package/dist/extraction/readability.d.ts +3 -0
  80. package/dist/extraction/readability.d.ts.map +1 -0
  81. package/dist/extraction/readability.js +32 -0
  82. package/dist/extraction/readability.js.map +1 -0
  83. package/dist/extraction/schema.d.ts +7 -0
  84. package/dist/extraction/schema.d.ts.map +1 -0
  85. package/dist/extraction/schema.js +86 -0
  86. package/dist/extraction/schema.js.map +1 -0
  87. package/dist/extraction/site-extractors/docs-generic.d.ts +3 -0
  88. package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -0
  89. package/dist/extraction/site-extractors/docs-generic.js +104 -0
  90. package/dist/extraction/site-extractors/docs-generic.js.map +1 -0
  91. package/dist/extraction/site-extractors/github.d.ts +3 -0
  92. package/dist/extraction/site-extractors/github.d.ts.map +1 -0
  93. package/dist/extraction/site-extractors/github.js +107 -0
  94. package/dist/extraction/site-extractors/github.js.map +1 -0
  95. package/dist/extraction/site-extractors/mdn.d.ts +3 -0
  96. package/dist/extraction/site-extractors/mdn.d.ts.map +1 -0
  97. package/dist/extraction/site-extractors/mdn.js +58 -0
  98. package/dist/extraction/site-extractors/mdn.js.map +1 -0
  99. package/dist/extraction/site-extractors/stackoverflow.d.ts +3 -0
  100. package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -0
  101. package/dist/extraction/site-extractors/stackoverflow.js +88 -0
  102. package/dist/extraction/site-extractors/stackoverflow.js.map +1 -0
  103. package/dist/extraction/trafilatura.d.ts +6 -0
  104. package/dist/extraction/trafilatura.d.ts.map +1 -0
  105. package/dist/extraction/trafilatura.js +105 -0
  106. package/dist/extraction/trafilatura.js.map +1 -0
  107. package/dist/fetch/auth.d.ts +8 -0
  108. package/dist/fetch/auth.d.ts.map +1 -0
  109. package/dist/fetch/auth.js +32 -0
  110. package/dist/fetch/auth.js.map +1 -0
  111. package/dist/fetch/browser-pool.d.ts +28 -0
  112. package/dist/fetch/browser-pool.d.ts.map +1 -0
  113. package/dist/fetch/browser-pool.js +138 -0
  114. package/dist/fetch/browser-pool.js.map +1 -0
  115. package/dist/fetch/content-check.d.ts +2 -0
  116. package/dist/fetch/content-check.d.ts.map +1 -0
  117. package/dist/fetch/content-check.js +62 -0
  118. package/dist/fetch/content-check.js.map +1 -0
  119. package/dist/fetch/http-client.d.ts +15 -0
  120. package/dist/fetch/http-client.d.ts.map +1 -0
  121. package/dist/fetch/http-client.js +146 -0
  122. package/dist/fetch/http-client.js.map +1 -0
  123. package/dist/fetch/router.d.ts +45 -0
  124. package/dist/fetch/router.d.ts.map +1 -0
  125. package/dist/fetch/router.js +89 -0
  126. package/dist/fetch/router.js.map +1 -0
  127. package/dist/index.d.ts +3 -0
  128. package/dist/index.d.ts.map +1 -0
  129. package/dist/index.js +22 -0
  130. package/dist/index.js.map +1 -0
  131. package/dist/logger.d.ts +10 -0
  132. package/dist/logger.d.ts.map +1 -0
  133. package/dist/logger.js +39 -0
  134. package/dist/logger.js.map +1 -0
  135. package/dist/search/dedup.d.ts +10 -0
  136. package/dist/search/dedup.d.ts.map +1 -0
  137. package/dist/search/dedup.js +35 -0
  138. package/dist/search/dedup.js.map +1 -0
  139. package/dist/search/engines/bing.d.ts +7 -0
  140. package/dist/search/engines/bing.d.ts.map +1 -0
  141. package/dist/search/engines/bing.js +48 -0
  142. package/dist/search/engines/bing.js.map +1 -0
  143. package/dist/search/engines/duckduckgo.d.ts +7 -0
  144. package/dist/search/engines/duckduckgo.d.ts.map +1 -0
  145. package/dist/search/engines/duckduckgo.js +50 -0
  146. package/dist/search/engines/duckduckgo.js.map +1 -0
  147. package/dist/search/engines/startpage.d.ts +7 -0
  148. package/dist/search/engines/startpage.d.ts.map +1 -0
  149. package/dist/search/engines/startpage.js +50 -0
  150. package/dist/search/engines/startpage.js.map +1 -0
  151. package/dist/search/filters.d.ts +16 -0
  152. package/dist/search/filters.d.ts.map +1 -0
  153. package/dist/search/filters.js +63 -0
  154. package/dist/search/filters.js.map +1 -0
  155. package/dist/search/flashrank.d.ts +12 -0
  156. package/dist/search/flashrank.d.ts.map +1 -0
  157. package/dist/search/flashrank.js +63 -0
  158. package/dist/search/flashrank.js.map +1 -0
  159. package/dist/search/query.d.ts +2 -0
  160. package/dist/search/query.d.ts.map +1 -0
  161. package/dist/search/query.js +41 -0
  162. package/dist/search/query.js.map +1 -0
  163. package/dist/search/rerank.d.ts +3 -0
  164. package/dist/search/rerank.d.ts.map +1 -0
  165. package/dist/search/rerank.js +40 -0
  166. package/dist/search/rerank.js.map +1 -0
  167. package/dist/search/searxng.d.ts +8 -0
  168. package/dist/search/searxng.d.ts.map +1 -0
  169. package/dist/search/searxng.js +87 -0
  170. package/dist/search/searxng.js.map +1 -0
  171. package/dist/search/validator.d.ts +6 -0
  172. package/dist/search/validator.d.ts.map +1 -0
  173. package/dist/search/validator.js +35 -0
  174. package/dist/search/validator.js.map +1 -0
  175. package/dist/searxng/bootstrap.d.ts +18 -0
  176. package/dist/searxng/bootstrap.d.ts.map +1 -0
  177. package/dist/searxng/bootstrap.js +136 -0
  178. package/dist/searxng/bootstrap.js.map +1 -0
  179. package/dist/searxng/docker.d.ts +9 -0
  180. package/dist/searxng/docker.d.ts.map +1 -0
  181. package/dist/searxng/docker.js +67 -0
  182. package/dist/searxng/docker.js.map +1 -0
  183. package/dist/searxng/process.d.ts +23 -0
  184. package/dist/searxng/process.d.ts.map +1 -0
  185. package/dist/searxng/process.js +188 -0
  186. package/dist/searxng/process.js.map +1 -0
  187. package/dist/server.d.ts +2 -0
  188. package/dist/server.d.ts.map +1 -0
  189. package/dist/server.js +311 -0
  190. package/dist/server.js.map +1 -0
  191. package/dist/tools/cache.d.ts +3 -0
  192. package/dist/tools/cache.d.ts.map +1 -0
  193. package/dist/tools/cache.js +50 -0
  194. package/dist/tools/cache.js.map +1 -0
  195. package/dist/tools/crawl.d.ts +6 -0
  196. package/dist/tools/crawl.d.ts.map +1 -0
  197. package/dist/tools/crawl.js +97 -0
  198. package/dist/tools/crawl.js.map +1 -0
  199. package/dist/tools/extract.d.ts +4 -0
  200. package/dist/tools/extract.d.ts.map +1 -0
  201. package/dist/tools/extract.js +69 -0
  202. package/dist/tools/extract.js.map +1 -0
  203. package/dist/tools/fetch.d.ts +4 -0
  204. package/dist/tools/fetch.d.ts.map +1 -0
  205. package/dist/tools/fetch.js +76 -0
  206. package/dist/tools/fetch.js.map +1 -0
  207. package/dist/tools/search.d.ts +4 -0
  208. package/dist/tools/search.d.ts.map +1 -0
  209. package/dist/tools/search.js +160 -0
  210. package/dist/tools/search.js.map +1 -0
  211. package/dist/types.d.ts +222 -0
  212. package/dist/types.d.ts.map +1 -0
  213. package/dist/types.js +2 -0
  214. package/dist/types.js.map +1 -0
  215. package/package.json +61 -0
@@ -0,0 +1,72 @@
1
+ import { isPrivateUrl } from './url-utils.js';
2
+ import { getConfig } from '../config.js';
3
/**
 * Per-domain crawl rate limiter.
 *
 * For every hostname it enforces two limits at once:
 *  - a maximum number of in-flight requests (`maxConcurrency`), and
 *  - a minimum spacing between request starts (`delayMs`), which is the larger
 *    of the configured delay and any robots.txt Crawl-delay registered through
 *    `setRobotsCrawlDelay`.
 *
 * Slots and start times are reserved synchronously, before any waiting happens,
 * so callers that arrive while another caller is still waiting out the delay
 * cannot slip past the concurrency limit or start at the same instant.
 */
export class RateLimiter {
    /** hostname -> { activeCount, lastRequestTime, queue, maxConcurrency, delayMs } */
    domains = new Map();
    /** hostname -> robots.txt crawl delay in milliseconds */
    robotsDelays = new Map();

    /**
     * Registers the Crawl-delay announced by a domain's robots.txt.
     * @param {string} domain - Hostname the delay applies to.
     * @param {number} delaySeconds - Delay in seconds; stored in milliseconds.
     */
    setRobotsCrawlDelay(domain, delaySeconds) {
        this.robotsDelays.set(domain, delaySeconds * 1000);
    }

    /**
     * Waits until a request to `url` may start.
     * @param {string} url - Absolute URL about to be requested.
     * @returns {Promise<() => void>} Release callback; call it once the request
     *   has finished. Calling it more than once is harmless.
     * @throws {TypeError} When `url` is not a valid absolute URL.
     */
    async acquire(url) {
        const domain = new URL(url).hostname;
        const state = this.getOrCreateState(url, domain);
        if (state.activeCount < state.maxConcurrency) {
            // Reserve the slot *before* sleeping so concurrent callers see it as taken.
            const waitMs = this.reserveSlot(state);
            if (waitMs > 0) {
                await new Promise((wake) => setTimeout(wake, waitMs));
            }
            return this.createRelease(state);
        }
        // No free slot: park the caller until processQueue hands it one.
        return new Promise((resolve) => {
            state.queue.push(resolve);
        });
    }

    /**
     * Returns the limiter state for `domain`, creating it from configuration on
     * first use and raising its delay if a larger robots.txt delay was
     * registered after creation.
     * @param {string} url - URL used to decide between private and public limits.
     * @param {string} domain - Hostname used as the state key.
     */
    getOrCreateState(url, domain) {
        if (!this.domains.has(domain)) {
            const config = getConfig();
            const isPrivate = isPrivateUrl(url);
            const configDelay = isPrivate ? config.crawlPrivateDelayMs : config.crawlDelayMs;
            // Use the robots.txt delay when it is stricter than the configured one.
            const robotsDelay = this.robotsDelays.get(domain) ?? 0;
            this.domains.set(domain, {
                activeCount: 0,
                lastRequestTime: 0,
                queue: [],
                maxConcurrency: isPrivate ? config.crawlPrivateConcurrency : config.crawlConcurrency,
                delayMs: Math.max(configDelay, robotsDelay),
            });
        }
        const state = this.domains.get(domain);
        // The robots.txt delay may have been registered after the state was created.
        const robotsDelay = this.robotsDelays.get(domain);
        if (robotsDelay !== undefined && robotsDelay > state.delayMs) {
            state.delayMs = robotsDelay;
        }
        return state;
    }

    /**
     * Claims one concurrency slot and the next permitted start time.
     * `lastRequestTime` is advanced to the scheduled start (possibly in the
     * future) so that later reservations are spaced after this one.
     * @returns {number} Milliseconds the caller must wait before starting.
     */
    reserveSlot(state) {
        const now = Date.now();
        // lastRequestTime === 0 means no request has been made yet: start at once.
        const earliestStart = state.lastRequestTime > 0 ? state.lastRequestTime + state.delayMs : now;
        const startAt = Math.max(now, earliestStart);
        state.activeCount++;
        state.lastRequestTime = startAt;
        return startAt - now;
    }

    /**
     * Builds the release callback for a reserved slot. The callback frees the
     * slot exactly once and then lets the next queued caller proceed.
     */
    createRelease(state) {
        let released = false;
        return () => {
            if (released)
                return;
            released = true;
            state.activeCount--;
            this.processQueue(state);
        };
    }

    /**
     * Hands a freed slot to the oldest queued caller, honouring the delay.
     * The slot is reserved immediately so it cannot be taken by a new
     * `acquire` call while the delay timer is pending.
     */
    processQueue(state) {
        if (state.queue.length === 0 || state.activeCount >= state.maxConcurrency)
            return;
        const resolveNext = state.queue.shift();
        const waitMs = this.reserveSlot(state);
        const grant = () => resolveNext(this.createRelease(state));
        if (waitMs <= 0) {
            grant();
        }
        else {
            setTimeout(grant, waitMs);
        }
    }
}
72
+ //# sourceMappingURL=rate-limiter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"rate-limiter.js","sourceRoot":"","sources":["../../src/crawl/rate-limiter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAUzC,MAAM,OAAO,WAAW;IACd,OAAO,GAAG,IAAI,GAAG,EAAuB,CAAC;IACzC,YAAY,GAAG,IAAI,GAAG,EAAkB,CAAC;IAEjD,mBAAmB,CAAC,MAAc,EAAE,YAAoB;QACtD,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,EAAE,YAAY,GAAG,IAAI,CAAC,CAAC;IACrD,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,GAAW;QACvB,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;QACrC,MAAM,KAAK,GAAG,IAAI,CAAC,gBAAgB,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;QAEjD,IAAI,KAAK,CAAC,WAAW,GAAG,KAAK,CAAC,cAAc,EAAE,CAAC;YAC7C,kDAAkD;YAClD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,eAAe,CAAC;YACnD,MAAM,SAAS,GAAG,KAAK,CAAC,OAAO,GAAG,OAAO,CAAC;YAC1C,IAAI,SAAS,GAAG,CAAC,IAAI,KAAK,CAAC,eAAe,GAAG,CAAC,EAAE,CAAC;gBAC/C,MAAM,IAAI,OAAO,CAAO,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC,CAAC;YAC3D,CAAC;YACD,OAAO,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;QAClC,CAAC;QAED,gBAAgB;QAChB,OAAO,IAAI,OAAO,CAAa,CAAC,OAAO,EAAE,EAAE;YACzC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC5D,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,gBAAgB,CAAC,GAAW,EAAE,MAAc;QAClD,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;YAC9B,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC;YACpC,MAAM,WAAW,GAAG,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,mBAAmB,CAAC,CAAC,CAAC,MAAM,CAAC,YAAY,CAAC;YAEjF,4DAA4D;YAC5D,MAAM,WAAW,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;YACvD,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,WAAW,CAAC,CAAC;YAE1D,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE;gBACvB,WAAW,EAAE,CAAC;gBACd,eAAe,EAAE,CAAC;gBAClB,KAAK,EAAE,EAAE;gBACT,cAAc,EAAE,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,uBAAuB,CAAC,CAAC,CAAC,MAAM,CAAC,gBAAgB;gBACpF,OAAO,EAAE,cAAc;aACxB,CAAC,CAAC;QACL,CAAC;QAED,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAE,CAAC;QACxC,4DAA4D;QAC5D,MAAM,WAAW,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;Q
AClD,IAAI,WAAW,KAAK,SAAS,IAAI,WAAW,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC;YAC7D,KAAK,CAAC,OAAO,GAAG,WAAW,CAAC;QAC9B,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,YAAY,CAAC,KAAkB;QACrC,KAAK,CAAC,WAAW,EAAE,CAAC;QACpB,KAAK,CAAC,eAAe,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAEnC,OAAO,GAAG,EAAE;YACV,KAAK,CAAC,WAAW,EAAE,CAAC;YACpB,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;QAC3B,CAAC,CAAC;IACJ,CAAC;IAEO,YAAY,CAAC,KAAkB;QACrC,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,IAAI,KAAK,CAAC,WAAW,IAAI,KAAK,CAAC,cAAc;YAAE,OAAO;QAElF,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,EAAG,CAAC;QAClC,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,eAAe,CAAC;QACnD,MAAM,SAAS,GAAG,KAAK,CAAC,OAAO,GAAG,OAAO,CAAC;QAE1C,IAAI,SAAS,IAAI,CAAC,EAAE,CAAC;YACnB,IAAI,EAAE,CAAC;QACT,CAAC;aAAM,CAAC;YACN,UAAU,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,9 @@
1
+ export declare class RobotsParser {
2
+ private rules;
3
+ private crawlDelay;
4
+ constructor(robotsTxt: string);
5
+ private parse;
6
+ isAllowed(path: string): boolean;
7
+ getCrawlDelay(): number | null;
8
+ }
9
+ //# sourceMappingURL=robots.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"robots.d.ts","sourceRoot":"","sources":["../../src/crawl/robots.ts"],"names":[],"mappings":"AAKA,qBAAa,YAAY;IACvB,OAAO,CAAC,KAAK,CAAoB;IACjC,OAAO,CAAC,UAAU,CAAuB;gBAE7B,SAAS,EAAE,MAAM;IAI7B,OAAO,CAAC,KAAK;IA4Cb,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO;IAiBhC,aAAa,IAAI,MAAM,GAAG,IAAI;CAG/B"}
@@ -0,0 +1,63 @@
1
/**
 * Tests a request path against one robots.txt rule path.
 * Supports the two special characters of the Robots Exclusion Protocol:
 * `*` (any run of characters) and a trailing `$` (end of path). Matching is
 * done by scanning literal segments, so no regular expression is built from
 * untrusted robots.txt content.
 */
function matchesRobotsPattern(pattern, path) {
    const anchored = pattern.endsWith('$');
    const segments = (anchored ? pattern.slice(0, -1) : pattern).split('*');
    // Everything before the first `*` must be a literal prefix of the path.
    if (!path.startsWith(segments[0]))
        return false;
    let cursor = segments[0].length;
    if (segments.length === 1)
        return !anchored || cursor === path.length;
    for (let i = 1; i < segments.length; i++) {
        const segment = segments[i];
        if (anchored && i === segments.length - 1) {
            // The final segment of an anchored pattern must sit at the very end.
            return path.length - segment.length >= cursor && path.endsWith(segment);
        }
        const found = path.indexOf(segment, cursor);
        if (found === -1)
            return false;
        cursor = found + segment.length;
    }
    return true;
}

/**
 * Minimal robots.txt parser that keeps only the rules of groups addressed to
 * the wildcard user agent (`User-agent: *`).
 */
export class RobotsParser {
    /** Collected rules: `{ type: 'allow' | 'disallow', path: string }`. */
    rules = [];
    /** Crawl-delay in seconds from the wildcard group, or null when absent. */
    crawlDelay = null;

    /** @param {string} robotsTxt - Raw robots.txt content. */
    constructor(robotsTxt) {
        this.parse(robotsTxt);
    }

    /**
     * Parses robots.txt text into `rules` and `crawlDelay`.
     * Consecutive `User-agent` lines belong to one group, so rules following
     * `User-agent: *` + `User-agent: other` still apply to the wildcard agent.
     * Everything after `#` on a line is a comment and is ignored.
     * @param {string} text
     */
    parse(text) {
        let inWildcardAgent = false;
        let previousWasUserAgent = false;
        for (const rawLine of text.split(/\r?\n/)) {
            const hashIndex = rawLine.indexOf('#');
            const line = (hashIndex === -1 ? rawLine : rawLine.slice(0, hashIndex)).trim();
            if (!line)
                continue;
            const field = line.match(/^([a-z-]+)\s*:\s*(.*)$/i);
            if (!field)
                continue;
            const key = field[1].toLowerCase();
            const value = field[2].trim();
            if (key === 'user-agent') {
                // A user-agent line after a rule line opens a new group.
                if (!previousWasUserAgent)
                    inWildcardAgent = false;
                if (value === '*')
                    inWildcardAgent = true;
                previousWasUserAgent = true;
                continue;
            }
            previousWasUserAgent = false;
            if (!inWildcardAgent)
                continue;
            if (key === 'allow' || key === 'disallow') {
                // An empty value carries no restriction, so it is not stored.
                if (value) {
                    this.rules.push({ type: key, path: value });
                }
                continue;
            }
            if (key === 'crawl-delay') {
                const delay = value.match(/^\d+(?:\.\d+)?/);
                if (delay) {
                    this.crawlDelay = parseFloat(delay[0]);
                }
            }
        }
    }

    /**
     * Decides whether `path` may be crawled.
     * The longest matching rule wins; on equal length an allow rule wins.
     * @param {string} path - URL path to test.
     * @returns {boolean} True when no rule matches or the winning rule is an allow.
     */
    isAllowed(path) {
        let bestMatch = null;
        let bestLength = -1;
        for (const rule of this.rules) {
            if (!matchesRobotsPattern(rule.path, path))
                continue;
            const length = rule.path.length;
            if (length > bestLength || (length === bestLength && rule.type === 'allow')) {
                bestMatch = rule;
                bestLength = length;
            }
        }
        if (!bestMatch)
            return true;
        return bestMatch.type === 'allow';
    }

    /** @returns {number | null} Crawl-delay in seconds, or null when not specified. */
    getCrawlDelay() {
        return this.crawlDelay;
    }
}
63
+ //# sourceMappingURL=robots.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"robots.js","sourceRoot":"","sources":["../../src/crawl/robots.ts"],"names":[],"mappings":"AAKA,MAAM,OAAO,YAAY;IACf,KAAK,GAAiB,EAAE,CAAC;IACzB,UAAU,GAAkB,IAAI,CAAC;IAEzC,YAAY,SAAiB;QAC3B,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IACxB,CAAC;IAEO,KAAK,CAAC,IAAY;QACxB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAC/B,IAAI,eAAe,GAAG,KAAK,CAAC;QAE5B,KAAK,MAAM,OAAO,IAAI,KAAK,EAAE,CAAC;YAC5B,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;YAE5B,IAAI,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC,EAAE,CAAC;gBACrC,eAAe,GAAG,IAAI,CAAC;gBACvB,SAAS;YACX,CAAC;YAED,IAAI,IAAI,CAAC,KAAK,CAAC,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC,EAAE,CAAC;gBACrE,eAAe,GAAG,KAAK,CAAC;gBACxB,SAAS;YACX,CAAC;YAED,IAAI,CAAC,eAAe;gBAAE,SAAS;YAE/B,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC;YACvD,IAAI,aAAa,EAAE,CAAC;gBAClB,MAAM,IAAI,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACrC,IAAI,IAAI,EAAE,CAAC;oBACT,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC;gBAC9C,CAAC;gBACD,SAAS;YACX,CAAC;YAED,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC;YACjD,IAAI,UAAU,EAAE,CAAC;gBACf,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBAClC,IAAI,IAAI,EAAE,CAAC;oBACT,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;gBAC3C,CAAC;gBACD,SAAS;YACX,CAAC;YAED,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;YAClE,IAAI,UAAU,EAAE,CAAC;gBACf,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;YAC9C,CAAC;QACH,CAAC;IACH,CAAC;IAED,SAAS,CAAC,IAAY;QACpB,IAAI,SAAS,GAAsB,IAAI,CAAC;QACxC,IAAI,UAAU,GAAG,CAAC,CAAC,CAAC;QAEpB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YAC9B,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC/B,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,UAAU,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,KAAK,UAAU,IAAI,IAAI,CAAC,IAAI,KAAK,OAAO,CAAC,EAAE,CAAC;oBAChG,SAAS,GAAG,IAAI,CAAC;oBACjB,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC;gBAChC,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,CAAC,SAAS;YAAE,OAAO,IAAI,CAAC;QAC5B,OAAO,SAAS,CAAC,IAAI,KAAK,OAAO,CAAC;I
ACpC,CAAC;IAED,aAAa;QACX,OAAO,IAAI,CAAC,UAAU,CAAC;IACzB,CAAC;CACF"}
@@ -0,0 +1,4 @@
1
+ export declare function parseSitemap(xml: string): string[];
2
+ export declare function parseSitemapIndex(xml: string): string[];
3
+ export declare function extractSitemapUrlFromRobots(robotsTxt: string): string[];
4
+ //# sourceMappingURL=sitemap.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"sitemap.d.ts","sourceRoot":"","sources":["../../src/crawl/sitemap.ts"],"names":[],"mappings":"AAAA,wBAAgB,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE,CAgBlD;AAED,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE,CASvD;AAED,wBAAgB,2BAA2B,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,EAAE,CAYvE"}
@@ -0,0 +1,38 @@
1
/** The five predefined XML entities. */
const NAMED_XML_ENTITIES = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };

/**
 * Matches one `<loc>` element. Group 1 is a CDATA payload, group 2 is plain
 * (entity-escaped) text; exactly one of them is set per match.
 */
const LOC_ELEMENT_PATTERN = /<loc(?:\s[^>]*)?>(?:\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*|([^<]*))<\/loc>/g;

/**
 * Decodes predefined and numeric XML entities in a single pass, so that
 * `&amp;lt;` becomes `&lt;` rather than `<`.
 */
function decodeXmlEntities(text) {
    return text.replace(/&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (entity, body) => {
        if (!body.startsWith('#'))
            return NAMED_XML_ENTITIES[body];
        const isHex = body[1] === 'x';
        const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        // Leave out-of-range references untouched instead of throwing.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    });
}

/** Collects the trimmed, unescaped, non-empty values of every `<loc>` element. */
function extractLocValues(xml) {
    const values = [];
    for (const match of xml.matchAll(LOC_ELEMENT_PATTERN)) {
        // CDATA content is literal; plain text must be entity-decoded.
        const raw = match[1] !== undefined ? match[1] : decodeXmlEntities(match[2]);
        const value = raw.trim();
        if (value) {
            values.push(value);
        }
    }
    return values;
}

/**
 * Extracts page URLs from a `<urlset>` sitemap document.
 * @param {string} xml - Sitemap XML text.
 * @returns {string[]} URLs in document order; empty for sitemap indexes
 *   (use `parseSitemapIndex` for those) and for input without `<loc>` data.
 */
export function parseSitemap(xml) {
    // A sitemapindex document should be parsed with parseSitemapIndex, not here
    if (xml.includes('<sitemapindex'))
        return [];
    if (!xml.includes('<urlset') && !xml.includes('<loc>'))
        return [];
    return extractLocValues(xml);
}

/**
 * Extracts child sitemap URLs from a `<sitemapindex>` document.
 * @param {string} xml - Sitemap index XML text.
 * @returns {string[]} Sitemap URLs in document order; empty when `xml` is not
 *   a sitemap index.
 */
export function parseSitemapIndex(xml) {
    if (!xml.includes('<sitemapindex'))
        return [];
    return extractLocValues(xml);
}
27
/**
 * Collects the URLs announced by `Sitemap:` lines in a robots.txt file.
 * Lines are trimmed before matching (so indented or CRLF-terminated lines are
 * recognised), a trailing ` # comment` is dropped, and empty values are skipped.
 * @param {string} robotsTxt - Raw robots.txt content.
 * @returns {string[]} Sitemap URLs in file order.
 */
export function extractSitemapUrlFromRobots(robotsTxt) {
    const urls = [];
    for (const rawLine of robotsTxt.split(/\r?\n/)) {
        const match = rawLine.trim().match(/^sitemap\s*:\s*(.*)$/i);
        if (!match)
            continue;
        let value = match[1];
        // Only a `#` preceded by whitespace starts a comment; a `#` inside the URL is kept.
        const commentAt = value.search(/\s#/);
        if (commentAt !== -1) {
            value = value.slice(0, commentAt);
        }
        value = value.trim();
        if (value) {
            urls.push(value);
        }
    }
    return urls;
}
38
+ //# sourceMappingURL=sitemap.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"sitemap.js","sourceRoot":"","sources":["../../src/crawl/sitemap.ts"],"names":[],"mappings":"AAAA,MAAM,UAAU,YAAY,CAAC,GAAW;IACtC,4EAA4E;IAC5E,IAAI,GAAG,CAAC,QAAQ,CAAC,eAAe,CAAC;QAAE,OAAO,EAAE,CAAC;IAE7C,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC;QAAE,OAAO,EAAE,CAAC;IAElE,MAAM,IAAI,GAAa,EAAE,CAAC;IAC1B,MAAM,UAAU,GAAG,GAAG,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC;IAC/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE,CAAC;QAC/B,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,GAAG,EAAE,CAAC;YACR,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACjB,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,GAAW;IAC3C,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,eAAe,CAAC;QAAE,OAAO,EAAE,CAAC;IAE9C,MAAM,IAAI,GAAa,EAAE,CAAC;IAC1B,MAAM,UAAU,GAAG,GAAG,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC;IAC/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE,CAAC;QAC/B,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;IAC7B,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,UAAU,2BAA2B,CAAC,SAAiB;IAC3D,MAAM,IAAI,GAAa,EAAE,CAAC;IAC1B,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAEpC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;QAC9C,IAAI,KAAK,EAAE,CAAC;YACV,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QAC7B,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
@@ -0,0 +1,3 @@
1
+ export declare function isPrivateUrl(url: string): boolean;
2
+ export declare function matchesPatterns(url: string, includePatterns: string[] | undefined, excludePatterns: string[] | undefined): boolean;
3
+ //# sourceMappingURL=url-utils.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"url-utils.d.ts","sourceRoot":"","sources":["../../src/crawl/url-utils.ts"],"names":[],"mappings":"AAAA,wBAAgB,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAgCjD;AAED,wBAAgB,eAAe,CAC7B,GAAG,EAAE,MAAM,EACX,eAAe,EAAE,MAAM,EAAE,GAAG,SAAS,EACrC,eAAe,EAAE,MAAM,EAAE,GAAG,SAAS,GACpC,OAAO,CAYT"}
@@ -0,0 +1,41 @@
1
/**
 * Reports whether a URL points at a loopback, link-local or private-network host.
 *
 * Address ranges are only checked when the hostname really is an IP literal,
 * so public domains that merely start with digits (e.g. `10.example.com`) are
 * not misclassified. The URL parser already normalises IPv4 literals to
 * dotted-quad form and IPv6 literals to lower-case compressed form.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean} True for `localhost`, `*.local`, 0.0.0.0, 10.0.0.0/8,
 *   127.0.0.0/8, 169.254.0.0/16, 172.16.0.0/12, 192.168.0.0/16, `::1`,
 *   fc00::/7 and fe80::/10.
 * @throws {TypeError} When `url` is not a valid absolute URL.
 */
export function isPrivateUrl(url) {
    const hostname = new URL(url).hostname.replace(/^\[|\]$/g, ''); // strip IPv6 brackets
    if (hostname === 'localhost' || hostname.endsWith('.local')) {
        return true;
    }
    const ipv4 = hostname.match(/^(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/);
    if (ipv4) {
        const first = Number(ipv4[1]);
        const second = Number(ipv4[2]);
        return (hostname === '0.0.0.0' ||
            first === 10 || // 10.0.0.0/8
            first === 127 || // loopback
            (first === 169 && second === 254) || // link-local
            (first === 172 && second >= 16 && second <= 31) || // 172.16.0.0/12
            (first === 192 && second === 168)); // 192.168.0.0/16
    }
    if (hostname.includes(':')) {
        return (hostname === '::1' ||
            /^f[cd][0-9a-f]{2}:/.test(hostname) || // unique local, fc00::/7
            /^fe[89ab][0-9a-f]:/.test(hostname)); // link-local, fe80::/10
    }
    return false;
}
28
/**
 * Applies include/exclude regular-expression filters to a URL.
 *
 * @param {string} url - URL to test.
 * @param {string[] | undefined} includePatterns - Regex sources; when the list
 *   is non-empty the URL must match at least one of them.
 * @param {string[] | undefined} excludePatterns - Regex sources; the URL is
 *   rejected when it matches any of them.
 * @returns {boolean} True when the URL passes both filters.
 * @throws {SyntaxError} When a pattern that gets evaluated is not a valid regex.
 */
export function matchesPatterns(url, includePatterns, excludePatterns) {
    const hits = (source) => new RegExp(source).test(url);
    const hasIncludes = Boolean(includePatterns) && includePatterns.length > 0;
    if (hasIncludes && !includePatterns.some((source) => hits(source))) {
        return false;
    }
    const hasExcludes = Boolean(excludePatterns) && excludePatterns.length > 0;
    if (hasExcludes && excludePatterns.some((source) => hits(source))) {
        return false;
    }
    return true;
}
41
+ //# sourceMappingURL=url-utils.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"url-utils.js","sourceRoot":"","sources":["../../src/crawl/url-utils.ts"],"names":[],"mappings":"AAAA,MAAM,UAAU,YAAY,CAAC,GAAW;IACtC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,CAAC,sBAAsB;IAEhF,IAAI,QAAQ,KAAK,WAAW,IAAI,QAAQ,KAAK,WAAW,IAAI,QAAQ,KAAK,KAAK,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;QACzG,OAAO,IAAI,CAAC;IACd,CAAC;IAED,IAAI,QAAQ,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;QAChC,OAAO,IAAI,CAAC;IACd,CAAC;IAED,WAAW;IACX,IAAI,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,IAAI,CAAC;IACd,CAAC;IAED,cAAc;IACd,IAAI,QAAQ,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QACpC,OAAO,IAAI,CAAC;IACd,CAAC;IAED,0CAA0C;IAC1C,IAAI,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;QAChC,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAClC,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACtC,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,IAAI,EAAE,EAAE,CAAC;YACjC,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,UAAU,eAAe,CAC7B,GAAW,EACX,eAAqC,EACrC,eAAqC;IAErC,IAAI,eAAe,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,MAAM,OAAO,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;QACrE,IAAI,CAAC,OAAO;YAAE,OAAO,KAAK,CAAC;IAC7B,CAAC;IAED,IAAI,eAAe,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;QACtE,IAAI,QAAQ;YAAE,OAAO,KAAK,CAAC;IAC7B,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { ExtractionResult } from '../types.js';
2
+ export declare function defuddleExtract(html: string, url: string): Promise<ExtractionResult | null>;
3
+ //# sourceMappingURL=defuddle.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"defuddle.d.ts","sourceRoot":"","sources":["../../src/extraction/defuddle.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAIpD,wBAAsB,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC,CAoBjG"}
@@ -0,0 +1,26 @@
1
+ import { Defuddle } from 'defuddle/node';
2
/** Extractions whose content is shorter than this many characters are discarded. */
const MIN_CONTENT_THRESHOLD = 100;

/**
 * Runs Defuddle (markdown mode) over an HTML document.
 *
 * @param {string} html - Page HTML.
 * @param {string} url - URL of the page, passed through to Defuddle.
 * @returns {Promise<object | null>} An extraction result tagged with
 *   `extractor: 'defuddle'`, or null when Defuddle throws or yields less than
 *   MIN_CONTENT_THRESHOLD characters of content.
 */
export async function defuddleExtract(html, url) {
    try {
        const extracted = await Defuddle(html, url, { markdown: true });
        const body = extracted.content;
        const hasEnoughContent = Boolean(body) && body.length >= MIN_CONTENT_THRESHOLD;
        if (!hasEnoughContent) {
            return null;
        }
        const title = extracted.title ?? '';
        // Falsy metadata values (empty strings, null) are normalised to undefined.
        const metadata = {
            description: extracted.description || undefined,
            author: extracted.author || undefined,
            date: extracted.published || undefined,
            language: extracted.language || undefined,
        };
        return {
            title,
            markdown: body,
            metadata,
            links: [],
            images: [],
            extractor: 'defuddle',
        };
    }
    catch {
        // Best effort: any failure is reported as "no result".
        return null;
    }
}
26
+ //# sourceMappingURL=defuddle.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"defuddle.js","sourceRoot":"","sources":["../../src/extraction/defuddle.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AAGzC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAElC,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,IAAY,EAAE,GAAW;IAC7D,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;QAC7D,IAAI,CAAC,MAAM,CAAC,OAAO,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,qBAAqB;YAAE,OAAO,IAAI,CAAC;QAClF,OAAO;YACL,KAAK,EAAE,MAAM,CAAC,KAAK,IAAI,EAAE;YACzB,QAAQ,EAAE,MAAM,CAAC,OAAO;YACxB,QAAQ,EAAE;gBACR,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,SAAS;gBAC5C,MAAM,EAAE,MAAM,CAAC,MAAM,IAAI,SAAS;gBAClC,IAAI,EAAE,MAAM,CAAC,SAAS,IAAI,SAAS;gBACnC,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,SAAS;aACvC;YACD,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,UAAU;SACtB,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
@@ -0,0 +1,5 @@
1
+ import type { MetadataData, TableData } from '../types.js';
2
+ export declare function extractMetadata(html: string): MetadataData;
3
+ export declare function extractSelector(html: string, selector: string, multiple: boolean): string | string[];
4
+ export declare function extractTables(html: string): TableData[];
5
+ //# sourceMappingURL=extract.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extract.d.ts","sourceRoot":"","sources":["../../src/extraction/extract.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAS3D,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CA2B1D;AAED,wBAAgB,eAAe,CAC7B,IAAI,EAAE,MAAM,EACZ,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,OAAO,GAChB,MAAM,GAAG,MAAM,EAAE,CAUnB;AAED,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,EAAE,CA6CvD"}
@@ -0,0 +1,83 @@
1
+ import { parseHTML } from 'linkedom';
2
/**
 * Reads the `content` attribute of a `<meta>` tag, looked up first by `name`
 * and then by `property`.
 *
 * @param doc - Document (or any node exposing `querySelector`).
 * @param {string} nameOrProperty - Value of the `name` / `property` attribute.
 * @returns {string | undefined} The content, or undefined when the tag or the
 *   attribute is missing.
 */
function getMetaContent(doc, nameOrProperty) {
    let meta = doc.querySelector(`meta[name="${nameOrProperty}"]`);
    if (meta === null || meta === undefined) {
        meta = doc.querySelector(`meta[property="${nameOrProperty}"]`);
    }
    const content = meta?.getAttribute('content');
    return content ?? undefined;
}
7
/**
 * Extracts common page metadata from HTML.
 *
 * Only fields that have a non-empty value are set on the result:
 * `title`, `description` (falls back to `og:description`), `author`,
 * `date` (falls back to `article:published_time`), `keywords` (split on
 * commas, trimmed, empties dropped) and `og_image`.
 *
 * @param {string} html - Page HTML.
 * @returns {object} Metadata object containing only the fields that were found.
 */
export function extractMetadata(html) {
    const { document: doc } = parseHTML(html);
    const result = {};
    const title = doc.querySelector('title')?.textContent?.trim();
    if (title)
        result.title = title;
    // `||` rather than `??`: a tag that exists with empty content must still
    // fall through to the Open Graph alternative.
    const description = getMetaContent(doc, 'description') || getMetaContent(doc, 'og:description');
    if (description)
        result.description = description;
    const author = getMetaContent(doc, 'author');
    if (author)
        result.author = author;
    const date = getMetaContent(doc, 'date') || getMetaContent(doc, 'article:published_time');
    if (date)
        result.date = date;
    const keywords = getMetaContent(doc, 'keywords');
    if (keywords) {
        result.keywords = keywords.split(',').map((k) => k.trim()).filter(Boolean);
    }
    const ogImage = getMetaContent(doc, 'og:image');
    if (ogImage)
        result.og_image = ogImage;
    return result;
}
31
/**
 * Returns the trimmed text content of elements matching a CSS selector.
 *
 * @param {string} html - Page HTML.
 * @param {string} selector - CSS selector.
 * @param {boolean} multiple - When true, return the text of every match;
 *   otherwise only the first match.
 * @returns {string | string[]} An array of texts when `multiple` is true,
 *   else a single string ('' when nothing matches).
 */
export function extractSelector(html, selector, multiple) {
    const textOf = (node) => (node.textContent ?? '').trim();
    const { document } = parseHTML(html);
    if (!multiple) {
        const first = document.querySelector(selector);
        return first ? textOf(first) : '';
    }
    const texts = [];
    for (const node of Array.from(document.querySelectorAll(selector))) {
        texts.push(textOf(node));
    }
    return texts;
}
40
/**
 * Converts every `<table>` in the HTML into `{ caption, headers, rows }`.
 *
 * Header detection, in order:
 *  1. `<th>` cells inside `<thead>`; body rows are the `<tbody>` rows, or all
 *     rows after the first when there is no `<tbody>` row.
 *  2. `<th>` cells in the first row; body rows are the remaining rows.
 *  3. Otherwise synthetic headers `col_1 … col_n` sized by the first row's
 *     `<td>` count, and every row is a body row.
 *
 * Each body row becomes an object keyed by header. Both `<td>` and `<th>`
 * cells are read positionally, so a leading row-header `<th>` does not shift
 * the remaining values one column to the left. Missing cells yield ''.
 *
 * @param {string} html - Page HTML.
 * @returns {Array<{caption: (string|undefined), headers: string[], rows: object[]}>}
 */
export function extractTables(html) {
    const { document: doc } = parseHTML(html);
    const textOf = (node) => (node?.textContent ?? '').trim();
    return Array.from(doc.querySelectorAll('table')).map((table) => {
        const caption = table.querySelector('caption')?.textContent?.trim() || undefined;
        const theadCells = Array.from(table.querySelectorAll('thead th'));
        let headers;
        let bodyRows;
        if (theadCells.length > 0) {
            headers = theadCells.map((th) => textOf(th));
            bodyRows = Array.from(table.querySelectorAll('tbody tr'));
            if (bodyRows.length === 0) {
                // No <tbody> rows: treat everything after the header row as data.
                bodyRows = Array.from(table.querySelectorAll('tr')).slice(1);
            }
        }
        else {
            const allRows = Array.from(table.querySelectorAll('tr'));
            const firstRow = allRows[0];
            const firstRowThs = firstRow ? Array.from(firstRow.querySelectorAll('th')) : [];
            if (firstRowThs.length > 0) {
                headers = firstRowThs.map((th) => textOf(th));
                bodyRows = allRows.slice(1);
            }
            else {
                const cellCount = firstRow ? firstRow.querySelectorAll('td').length : 0;
                headers = Array.from({ length: cellCount }, (_, i) => `col_${i + 1}`);
                bodyRows = allRows;
            }
        }
        const rows = bodyRows.map((row) => {
            // Include <th> so row-header cells keep their column position.
            const cells = Array.from(row.querySelectorAll('td, th'));
            return Object.fromEntries(headers.map((header, i) => [header, textOf(cells[i])]));
        });
        return { caption, headers, rows };
    });
}
83
+ //# sourceMappingURL=extract.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extract.js","sourceRoot":"","sources":["../../src/extraction/extract.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAGrC,SAAS,cAAc,CAAC,GAAa,EAAE,cAAsB;IAC3D,MAAM,EAAE,GACN,GAAG,CAAC,aAAa,CAAC,cAAc,cAAc,IAAI,CAAC;QACnD,GAAG,CAAC,aAAa,CAAC,kBAAkB,cAAc,IAAI,CAAC,CAAC;IAC1D,OAAO,EAAE,EAAE,YAAY,CAAC,SAAS,CAAC,IAAI,SAAS,CAAC;AAClD,CAAC;AAED,MAAM,UAAU,eAAe,CAAC,IAAY;IAC1C,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,MAAM,GAAiB,EAAE,CAAC;IAEhC,MAAM,KAAK,GAAG,GAAG,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;IAC9D,IAAI,KAAK;QAAE,MAAM,CAAC,KAAK,GAAG,KAAK,CAAC;IAEhC,MAAM,WAAW,GACf,cAAc,CAAC,GAAG,EAAE,aAAa,CAAC,IAAI,cAAc,CAAC,GAAG,EAAE,gBAAgB,CAAC,CAAC;IAC9E,IAAI,WAAW;QAAE,MAAM,CAAC,WAAW,GAAG,WAAW,CAAC;IAElD,MAAM,MAAM,GAAG,cAAc,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IAC7C,IAAI,MAAM;QAAE,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC;IAEnC,MAAM,IAAI,GACR,cAAc,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,cAAc,CAAC,GAAG,EAAE,wBAAwB,CAAC,CAAC;IAC/E,IAAI,IAAI;QAAE,MAAM,CAAC,IAAI,GAAG,IAAI,CAAC;IAE7B,MAAM,QAAQ,GAAG,cAAc,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;IACjD,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,CAAC,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAC7E,CAAC;IAED,MAAM,OAAO,GAAG,cAAc,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;IAChD,IAAI,OAAO;QAAE,MAAM,CAAC,QAAQ,GAAG,OAAO,CAAC;IAEvC,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,MAAM,UAAU,eAAe,CAC7B,IAAY,EACZ,QAAgB,EAChB,QAAiB;IAEjB,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAE1C,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,QAAQ,GAAG,GAAG,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;QAChD,OAAO,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;IACzE,CAAC;IAED,MAAM,EAAE,GAAG,GAAG,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;IACvC,OAAO,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;AACjD,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,EAAE,QAAQ,EAAE,GAA
G,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,MAAM,GAAG,GAAG,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;IAC7C,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEnC,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE;QACtC,MAAM,OAAO,GAAG,KAAK,CAAC,aAAa,CAAC,SAAS,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,SAAS,CAAC;QAEjF,MAAM,UAAU,GAAG,KAAK,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC;QACtD,IAAI,OAAiB,CAAC;QACtB,IAAI,QAAmB,CAAC;QAExB,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1B,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YAC5E,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,CAAC;YAC1D,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC1B,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;gBACzD,QAAQ,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC;aAAM,CAAC;YACN,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;YACzD,MAAM,QAAQ,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;YAC5B,MAAM,WAAW,GAAG,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAEhF,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC3B,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;gBACjE,QAAQ,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YAC9B,CAAC;iBAAM,CAAC;gBACN,MAAM,SAAS,GAAG,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;gBACxE,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,SAAS,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;gBACtE,QAAQ,GAAG,OAAO,CAAC;YACrB,CAAC;QACH,CAAC;QAED,MAAM,IAAI,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;YAChC,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;YACrD,MAAM,GAAG,GAA2B,EAAE,CAAC;YACvC,OAAO,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;gBAC5B,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,WAAW,IAAI,EAAE,CAA
C,CAAC,IAAI,EAAE,CAAC;YACrD,CAAC,CAAC,CAAC;YACH,OAAO,GAAG,CAAC;QACb,CAAC,CAAC,CAAC;QAEH,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;IACpC,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,4 @@
1
+ import type { JsonSchema } from './schema.js';
2
/**
 * Parses the contents of every `<script type="application/ld+json">` element in `html`.
 * A top-level array, or a top-level object's `@graph` array, is spread into the result;
 * any other parsed value is added as a single entry. Empty scripts and scripts whose
 * contents fail to parse are skipped.
 */
+ export declare function extractJsonLd(html: string): Record<string, unknown>[];
3
/**
 * For each property name declared in `schema.properties`, picks the value stored under
 * the same key in the flattened JSON-LD blocks (keys starting with `@` are ignored while
 * flattening). Returns an empty object when the schema declares no properties or when
 * `jsonLdBlocks` is empty.
 */
+ export declare function matchJsonLdToSchema(jsonLdBlocks: Record<string, unknown>[], schema: JsonSchema): Record<string, unknown>;
4
+ //# sourceMappingURL=jsonld.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"jsonld.d.ts","sourceRoot":"","sources":["../../src/extraction/jsonld.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAI9C,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,CAyBrE;AAED,wBAAgB,mBAAmB,CACjC,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,EACvC,MAAM,EAAE,UAAU,GACjB,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAazB"}
@@ -0,0 +1,64 @@
1
+ import { parseHTML } from 'linkedom';
2
+ import { createLogger } from '../logger.js';
3
// Logger for this module, created under the 'jsonld' name; extractJsonLd uses it to
// report JSON-LD script blocks that could not be processed.
+ const log = createLogger('jsonld');
4
/**
 * Collect the JSON-LD objects embedded in a page.
 *
 * Every `<script type="application/ld+json">` element is parsed. A top-level
 * array, or the `@graph` array of a top-level object, is expanded into its
 * members; any other object is taken as a single block. Only plain objects are
 * returned: `null`, primitives and nested arrays are dropped, because callers
 * treat each entry as a record and would otherwise crash or produce garbage
 * keys on untrusted page content.
 *
 * @param {string} html - Markup to parse.
 * @returns {Record<string, unknown>[]} JSON-LD objects in document order.
 *   Scripts that are empty or fail to parse are skipped (logged at debug level).
 */
export function extractJsonLd(html) {
    const { document: doc } = parseHTML(html);
    const scripts = doc.querySelectorAll('script[type="application/ld+json"]');
    const isRecord = (value) => typeof value === 'object' && value !== null && !Array.isArray(value);
    const results = [];
    for (const script of scripts) {
        const text = script.textContent?.trim();
        if (!text) {
            continue;
        }
        let parsed;
        try {
            parsed = JSON.parse(text);
        }
        catch (err) {
            // Best effort: a malformed block must not prevent reading the others.
            log.debug('Failed to parse JSON-LD block', { error: String(err) });
            continue;
        }
        let candidates;
        if (Array.isArray(parsed)) {
            candidates = parsed;
        }
        else if (isRecord(parsed) && Array.isArray(parsed['@graph'])) {
            candidates = parsed['@graph'];
        }
        else {
            candidates = [parsed];
        }
        for (const candidate of candidates) {
            if (isRecord(candidate)) {
                results.push(candidate);
            }
        }
    }
    return results;
}
30
/**
 * Fill schema fields from JSON-LD data.
 *
 * The blocks are flattened into a single key/value map, then every property
 * name declared in `schema.properties` that has a value in that map is copied
 * into the result.
 *
 * @param {Record<string, unknown>[]} jsonLdBlocks - Parsed JSON-LD objects.
 * @param {{ properties?: Record<string, unknown> }} schema - JSON schema whose
 *   property names are looked up.
 * @returns {Record<string, unknown>} Matched fields; empty when the schema has
 *   no properties or there are no blocks.
 */
export function matchJsonLdToSchema(jsonLdBlocks, schema) {
    if (!schema?.properties || jsonLdBlocks.length === 0) {
        return {};
    }
    const flattened = flattenJsonLd(jsonLdBlocks);
    const result = {};
    for (const fieldName of Object.keys(schema.properties)) {
        // Own-property check: a schema field called e.g. "constructor" or
        // "toString" must not pick up the members inherited from Object.prototype.
        const isOwn = Object.prototype.hasOwnProperty.call(flattened, fieldName);
        if (isOwn && flattened[fieldName] !== undefined) {
            result[fieldName] = flattened[fieldName];
        }
    }
    return result;
}
42
/**
 * Merge a list of JSON-LD blocks into one flat key/value map.
 *
 * Blocks are processed in order, so keys from earlier blocks take priority.
 * Entries that are not objects (`null`, strings, numbers, ...) are skipped:
 * they can appear in JSON-LD arrays taken from arbitrary pages and carry no
 * key/value data to flatten.
 *
 * @param {unknown[]} blocks - Parsed JSON-LD blocks.
 * @returns {Record<string, unknown>} The flattened map.
 */
function flattenJsonLd(blocks) {
    const flat = {};
    for (const block of blocks) {
        if (block === null || typeof block !== 'object') {
            continue;
        }
        flattenObject(block, flat);
    }
    return flat;
}
49
/**
 * Copy the leaf values of `obj` into `target`, hoisting nested keys to the top level.
 *
 * Keys starting with `@` are ignored. Nested plain objects are not stored
 * themselves; their contents are flattened. Arrays and primitives are stored as-is.
 *
 * First-wins: a key already present in `target` is never overwritten, and
 * within `obj` shallower keys take priority over deeper ones. The walk is
 * therefore breadth-first: with a depth-first walk
 * `{ author: { name: 'Jane' }, name: 'Title' }` would yield the author's name
 * because `author` is visited before the top-level `name`.
 *
 * @param {Record<string, unknown>} obj - Object to flatten.
 * @param {Record<string, unknown>} target - Map that receives the values (mutated).
 */
function flattenObject(obj, target) {
    const pending = [obj];
    for (let head = 0; head < pending.length; head++) {
        for (const [key, value] of Object.entries(pending[head])) {
            if (key.startsWith('@')) {
                continue;
            }
            // `in` (not an own-property check) is deliberate: it also rejects
            // inherited names such as `__proto__`, so page-supplied keys cannot
            // replace the prototype of `target`.
            if (key in target) {
                continue;
            }
            if (typeof value === 'object' && value !== null && !Array.isArray(value)) {
                pending.push(value);
            }
            else {
                target[key] = value;
            }
        }
    }
}
64
+ //# sourceMappingURL=jsonld.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"jsonld.js","sourceRoot":"","sources":["../../src/extraction/jsonld.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAG5C,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;AAEnC,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,OAAO,GAAG,GAAG,CAAC,gBAAgB,CAAC,oCAAoC,CAAC,CAAC;IAC3E,MAAM,OAAO,GAA8B,EAAE,CAAC;IAE9C,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YACxC,IAAI,CAAC,IAAI;gBAAE,SAAS;YAEpB,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YAEhC,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;gBAC1B,OAAO,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;YAC1B,CAAC;iBAAM,IAAI,MAAM,CAAC,QAAQ,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;gBAC/D,OAAO,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;YACpC,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;QACH,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,GAAG,CAAC,KAAK,CAAC,+BAA+B,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACrE,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,YAAuC,EACvC,MAAkB;IAElB,IAAI,CAAC,MAAM,CAAC,UAAU,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAE/D,MAAM,MAAM,GAA4B,EAAE,CAAC;IAC3C,MAAM,SAAS,GAAG,aAAa,CAAC,YAAY,CAAC,CAAC;IAE9C,KAAK,MAAM,SAAS,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;QACvD,IAAI,SAAS,CAAC,SAAS,CAAC,KAAK,SAAS,EAAE,CAAC;YACvC,MAAM,CAAC,SAAS,CAAC,GAAG,SAAS,CAAC,SAAS,CAAC,CAAC;QAC3C,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,aAAa,CACpB,MAAiC;IAEjC,MAAM,IAAI,GAA4B,EAAE,CAAC;IAEzC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,aAAa,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IAC7B,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,aAAa,CACpB,GAA4B,EAC5B,MAA+B;IAE/B,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;QAC/C,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,SAAS;QAElC,8DAA8D;QAC9D,IAAI,CAAC,CAAC,GAAG,IAAI,MAAM,CAAC,EAAE
,CAAC;YACrB,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;gBACzE,aAAa,CAAC,KAAgC,EAAE,MAAM,CAAC,CAAC;YAC1D,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;YACtB,CAAC;QACH,CAAC;IACH,CAAC;AACH,CAAC"}
@@ -0,0 +1,10 @@
1
/**
 * Converts an HTML string to Markdown using the module's shared Turndown converter.
 * Returns an empty string when `html` is empty.
 */
+ export declare function htmlToMarkdown(html: string): string;
2
/**
 * Returns the part of `markdown` that starts at a heading matching `section` and ends
 * before the next heading of the same or a higher level. Matching is case-insensitive;
 * headings whose text equals `section` are used when `sectionIndex` falls within them,
 * otherwise headings containing `section` as a substring are used. `sectionIndex`
 * (default 0) selects among several matching headings.
 * When no heading is selected, `content` is the unmodified input and `matched` is false.
 */
+ export declare function extractSection(markdown: string, section: string, sectionIndex?: number): {
3
+ content: string;
4
+ matched: boolean;
5
+ };
6
/**
 * Scans `markdown` for inline links (`[text](destination)`) and images
 * (`![alt](destination)`) and returns their destinations, de-duplicated and in order of
 * first appearance, as separate `links` and `images` lists.
 */
+ export declare function extractLinksAndImages(markdown: string): {
7
+ links: string[];
8
+ images: string[];
9
+ };
10
+ //# sourceMappingURL=markdown.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdown.d.ts","sourceRoot":"","sources":["../../src/extraction/markdown.ts"],"names":[],"mappings":"AAkDA,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAGnD;AAmCD,wBAAgB,cAAc,CAC5B,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,MAAM,EACf,YAAY,SAAI,GACf;IAAE,OAAO,EAAE,MAAM,CAAC;IAAC,OAAO,EAAE,OAAO,CAAA;CAAE,CA2BvC;AAED,wBAAgB,qBAAqB,CAAC,QAAQ,EAAE,MAAM,GAAG;IAAE,KAAK,EAAE,MAAM,EAAE,CAAC;IAAC,MAAM,EAAE,MAAM,EAAE,CAAA;CAAE,CAoB7F"}
@@ -0,0 +1,107 @@
1
+ import TurndownService from 'turndown';
2
/**
 * Create the Turndown converter used by this module.
 *
 * Configuration: ATX headings, fenced code blocks, `<script>`/`<style>` removed,
 * and a custom rule that renders `<table>` as a pipe table. The first row is
 * always rendered as the header row (pipe tables require one), followed by a
 * `---` separator sized to that row and then the remaining rows.
 *
 * @returns {TurndownService} The configured converter.
 */
function buildTurndown() {
    const td = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
    // Remove script and style tags entirely
    td.remove(['script', 'style']);
    // Custom rule: convert <table> to markdown table
    td.addRule('table', {
        filter: 'table',
        replacement(_content, node) {
            const rows = Array.from(node.querySelectorAll('tr'));
            if (rows.length === 0) {
                return '';
            }
            // A cell must stay on one line, and a literal "|" has to be escaped
            // or it would be read as a column delimiter and corrupt the table.
            const cellText = (cell) => (cell.textContent ?? '')
                .replace(/\n/g, ' ')
                .trim()
                .replace(/\|/g, '\\|');
            const renderRow = (row) => {
                const cells = Array.from(row.querySelectorAll('th, td'));
                return `| ${cells.map(cellText).join(' | ')} |`;
            };
            const [headerRow, ...bodyRows] = rows;
            const columnCount = headerRow.querySelectorAll('th, td').length;
            const separator = `| ${Array.from({ length: columnCount }, () => '---').join(' | ')} |`;
            const lines = [renderRow(headerRow), separator, ...bodyRows.map((row) => renderRow(row))];
            return `\n\n${lines.join('\n')}\n\n`;
        },
    });
    // Pass thead/tbody/tfoot/tr/th/td content through unchanged; the table rule
    // above renders the whole table from the DOM node.
    td.addRule('tableCell', {
        filter: ['thead', 'tbody', 'tfoot', 'tr', 'th', 'td'],
        replacement(content) {
            return content;
        },
    });
    return td;
}
40
// Shared converter instance, created once when the module is evaluated and reused by
// htmlToMarkdown for every conversion.
+ const turndown = buildTurndown();
41
/**
 * Convert an HTML string to Markdown with the shared converter.
 *
 * @param {string} html - Markup to convert.
 * @returns {string} The Markdown output, or '' when `html` is falsy.
 */
export function htmlToMarkdown(html) {
    return html ? turndown.turndown(html) : '';
}
46
/**
 * Locate the ATX headings (`#` .. `######`) in a Markdown document.
 *
 * Lines inside fenced code blocks (``` or ~~~) are ignored, so a shell or
 * Python comment such as `# install` in a code sample is not mistaken for a
 * heading.
 *
 * @param {string[]} lines - The document split into lines.
 * @returns {{ level: number, text: string, lineIndex: number }[]} Headings in
 *   document order; `level` is the number of `#` characters.
 */
function parseHeadings(lines) {
    const fencePattern = /^ {0,3}(`{3,}|~{3,})(.*)/;
    const headings = [];
    // While inside a fenced block: the fence character and length that opened it.
    let openFence = null;
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i];
        const fenceMatch = fencePattern.exec(line);
        if (openFence !== null) {
            // A block is closed only by a fence of the same character, at least
            // as long as the opener, with nothing after it.
            if (fenceMatch !== null
                && fenceMatch[1][0] === openFence.char
                && fenceMatch[1].length >= openFence.length
                && fenceMatch[2].trim() === '') {
                openFence = null;
            }
            continue;
        }
        // A backtick fence cannot have backticks in its info string; such a
        // line is inline code, not the start of a block.
        if (fenceMatch !== null && !(fenceMatch[1][0] === '`' && fenceMatch[2].includes('`'))) {
            openFence = { char: fenceMatch[1][0], length: fenceMatch[1].length };
            continue;
        }
        const match = line.match(/^(#{1,6})\s+(.+)/);
        if (match) {
            headings.push({ level: match[1].length, text: match[2].trim(), lineIndex: i });
        }
    }
    return headings;
}
56
/**
 * Slice out the section that belongs to one heading.
 *
 * The section runs from the heading's own line up to (not including) the next
 * heading whose level is the same or higher (i.e. has the same number of `#`
 * or fewer), or to the end of the document when there is none.
 *
 * @param {string[]} lines - The document split into lines.
 * @param {{ level: number, lineIndex: number }[]} headings - All headings, in document order.
 * @param {number} headingIdx - Index into `headings` of the section's heading.
 * @returns {string} The section's lines joined with '\n'.
 */
function extractFromHeading(lines, headings, headingIdx) {
    const { level, lineIndex: firstLine } = headings[headingIdx];
    const boundary = headings
        .slice(headingIdx + 1)
        .find((candidate) => candidate.level <= level);
    const lastLine = boundary ? boundary.lineIndex : lines.length;
    return lines.slice(firstLine, lastLine).join('\n');
}
69
/**
 * Extract the section of a Markdown document that sits under a given heading.
 *
 * Heading text is compared case-insensitively. Headings whose text equals
 * `section` are preferred; when `sectionIndex` is beyond the exact matches the
 * lookup falls back to headings that contain `section` as a substring (a list
 * that also includes the exact ones).
 *
 * @param {string} markdown - The document.
 * @param {string} section - Heading text to look for.
 * @param {number} [sectionIndex=0] - Zero-based pick among the matching headings.
 * @returns {{ content: string, matched: boolean }} The section with
 *   `matched: true`, or the whole document with `matched: false` when no
 *   heading is selected (including a negative or non-integer `sectionIndex`).
 */
export function extractSection(markdown, section, sectionIndex = 0) {
    const unmatched = { content: markdown, matched: false };
    // A negative or fractional index cannot address a match; without this guard
    // it would index `undefined` below and throw instead of reporting "no match".
    if (!Number.isInteger(sectionIndex) || sectionIndex < 0) {
        return unmatched;
    }
    const lines = markdown.split('\n');
    const headings = parseHeadings(lines);
    if (headings.length === 0) {
        return unmatched;
    }
    const needle = section.toLowerCase();
    const titles = headings.map((heading) => heading.text.toLowerCase());
    const positionsWhere = (predicate) => titles.flatMap((title, i) => (predicate(title) ? [i] : []));

    const exact = positionsWhere((title) => title === needle);
    const candidates = sectionIndex < exact.length
        ? exact
        : positionsWhere((title) => title.includes(needle));
    if (sectionIndex >= candidates.length) {
        return unmatched;
    }
    return { content: extractFromHeading(lines, headings, candidates[sectionIndex]), matched: true };
}
91
/**
 * Collect the destinations of inline links and images in a Markdown document.
 *
 * - An optional title (`[text](url "title")`) and `<...>` wrapping are
 *   stripped, so only the URL is returned.
 * - For a linked image (`[![alt](img)](href)`) the image URL is reported under
 *   `images` and the outer `href` under `links`.
 * - Backslash-escaped brackets in link text are treated as literal text.
 *
 * @param {string} markdown - The document to scan.
 * @returns {{ links: string[], images: string[] }} Unique destinations, each
 *   list in order of first appearance.
 */
export function extractLinksAndImages(markdown) {
    const imagePattern = /!\[[^\]]*\]\(([^)]+)\)/g;
    // Link text may contain escaped characters or one inline image, but no bare
    // brackets; that keeps the match anchored on the link's own "[...]" pair.
    const linkPattern = /(?<!!)\[(?:\\.|!\[[^\]]*\]\([^)]*\)|[^\[\]\\])*\]\(([^)]+)\)/g;
    const trailingTitle = /\s+(?:"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*')$/;

    // Reduce the raw "(...)" contents to the bare destination.
    const cleanDestination = (raw) => {
        let destination = raw.trim().replace(trailingTitle, '').trim();
        if (destination.length > 1 && destination.startsWith('<') && destination.endsWith('>')) {
            destination = destination.slice(1, -1).trim();
        }
        return destination;
    };
    const collect = (pattern) => {
        const found = new Set();
        for (const match of markdown.matchAll(pattern)) {
            const destination = cleanDestination(match[1]);
            if (destination !== '') {
                found.add(destination);
            }
        }
        return Array.from(found);
    };

    const images = collect(imagePattern);
    const links = collect(linkPattern);
    return { links, images };
}
107
+ //# sourceMappingURL=markdown.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdown.js","sourceRoot":"","sources":["../../src/extraction/markdown.ts"],"names":[],"mappings":"AAAA,OAAO,eAAe,MAAM,UAAU,CAAC;AAEvC,SAAS,aAAa;IACpB,MAAM,EAAE,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;IAElF,wCAAwC;IACxC,EAAE,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;IAE/B,iDAAiD;IACjD,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE;QAClB,MAAM,EAAE,OAAO;QACf,WAAW,CAAC,QAAQ,EAAE,IAAI;YACxB,MAAM,EAAE,GAAG,IAAe,CAAC;YAC3B,MAAM,IAAI,GAAc,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;YAC9D,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;gBAAE,OAAO,EAAE,CAAC;YAEjC,MAAM,SAAS,GAAG,CAAC,GAAY,EAAU,EAAE;gBACzC,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,CAAC;gBACzD,OAAO,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC;YACnG,CAAC,CAAC;YAEF,MAAM,SAAS,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YAC1B,MAAM,WAAW,GAAG,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YAChE,MAAM,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,CAAC;YACrE,MAAM,SAAS,GAAG,IAAI,GAAG,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC;YAEzE,IAAI,WAAW,EAAE,CAAC;gBAChB,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBAC/B,MAAM,KAAK,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,SAAS,EAAE,GAAG,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC;gBAC5E,OAAO,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;YAC5C,CAAC;YAED,MAAM,KAAK,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,SAAS,EAAE,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC;YACjF,OAAO,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;QAC5C,CAAC;KACF,CAAC,CAAC;IAEH,qFAAqF;IACrF,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE;QACtB,MAAM,EAAE,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC;QACrD,WAAW,CAAC,OAAO;YACjB,OAAO,OAAO,CAAC;QACjB,CAAC;KACF,CAAC,CAAC;IAEH,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,QAAQ,GAAG,aAAa,EAAE,CAAC
;AAEjC,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAC;IACrB,OAAO,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC;AAQD,SAAS,aAAa,CAAC,KAAe;IACpC,MAAM,QAAQ,GAAc,EAAE,CAAC;IAC/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;QACjD,IAAI,KAAK,EAAE,CAAC;YACV,QAAQ,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC;QACjF,CAAC;IACH,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,SAAS,kBAAkB,CAAC,KAAe,EAAE,QAAmB,EAAE,UAAkB;IAClF,MAAM,OAAO,GAAG,QAAQ,CAAC,UAAU,CAAC,CAAC;IACrC,MAAM,KAAK,GAAG,OAAO,CAAC,SAAS,CAAC;IAEhC,0EAA0E;IAC1E,IAAI,GAAG,GAAG,KAAK,CAAC,MAAM,CAAC;IACvB,KAAK,IAAI,CAAC,GAAG,UAAU,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtD,IAAI,QAAQ,CAAC,CAAC,CAAC,CAAC,KAAK,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YACvC,GAAG,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC5B,MAAM;QACR,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,UAAU,cAAc,CAC5B,QAAgB,EAChB,OAAe,EACf,YAAY,GAAG,CAAC;IAEhB,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,QAAQ,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC;IAEtC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;IAExE,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;IACpC,MAAM,OAAO,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;IAEnD,8BAA8B;IAC9B,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,KAAK,KAAK,CAAC,CAAC;IAE/E,yDAAyD;IACzD,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,YAAY,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC;QAClE,MAAM,EAAE,CAAC,EAAE,GAAG,YAAY,CAAC,YAAY,CAAC,CAAC;QACzC,OAAO,EAAE,OAAO,EAAE,kBAAkB,CAAC,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;IAC5E,CAAC;IAED,4EAA4E;IAC5E,MAAM,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,E
AAE,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IAEzF,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,IAAI,YAAY,IAAI,gBAAgB,CAAC,MAAM,EAAE,CAAC;QAC7E,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;IAC/C,CAAC;IAED,MAAM,EAAE,CAAC,EAAE,GAAG,gBAAgB,CAAC,YAAY,CAAC,CAAC;IAC7C,OAAO,EAAE,OAAO,EAAE,kBAAkB,CAAC,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;AAC5E,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,QAAgB;IACpD,MAAM,YAAY,GAAG,yBAAyB,CAAC;IAC/C,MAAM,WAAW,GAAG,8BAA8B,CAAC;IAEnD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAU,CAAC;IACjC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;IAEhC,IAAI,KAA6B,CAAC;IAElC,uBAAuB;IACvB,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACtD,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;IAED,4BAA4B;IAC5B,OAAO,CAAC,KAAK,GAAG,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACrD,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACtB,CAAC;IAED,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;AAClE,CAAC"}