@staticn0va/wigolo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215) hide show
  1. package/LICENSE +74 -0
  2. package/README.md +272 -0
  3. package/dist/cache/db.d.ts +5 -0
  4. package/dist/cache/db.d.ts.map +1 -0
  5. package/dist/cache/db.js +97 -0
  6. package/dist/cache/db.js.map +1 -0
  7. package/dist/cache/store.d.ts +26 -0
  8. package/dist/cache/store.d.ts.map +1 -0
  9. package/dist/cache/store.js +214 -0
  10. package/dist/cache/store.js.map +1 -0
  11. package/dist/cli/daemon.d.ts +2 -0
  12. package/dist/cli/daemon.d.ts.map +1 -0
  13. package/dist/cli/daemon.js +5 -0
  14. package/dist/cli/daemon.js.map +1 -0
  15. package/dist/cli/health.d.ts +2 -0
  16. package/dist/cli/health.d.ts.map +1 -0
  17. package/dist/cli/health.js +5 -0
  18. package/dist/cli/health.js.map +1 -0
  19. package/dist/cli/index.d.ts +7 -0
  20. package/dist/cli/index.d.ts.map +1 -0
  21. package/dist/cli/index.js +9 -0
  22. package/dist/cli/index.js.map +1 -0
  23. package/dist/cli/warmup.d.ts +11 -0
  24. package/dist/cli/warmup.d.ts.map +1 -0
  25. package/dist/cli/warmup.js +107 -0
  26. package/dist/cli/warmup.js.map +1 -0
  27. package/dist/config.d.ts +41 -0
  28. package/dist/config.d.ts.map +1 -0
  29. package/dist/config.js +66 -0
  30. package/dist/config.js.map +1 -0
  31. package/dist/crawl/crawler.d.ts +18 -0
  32. package/dist/crawl/crawler.d.ts.map +1 -0
  33. package/dist/crawl/crawler.js +228 -0
  34. package/dist/crawl/crawler.js.map +1 -0
  35. package/dist/crawl/dedup.d.ts +15 -0
  36. package/dist/crawl/dedup.d.ts.map +1 -0
  37. package/dist/crawl/dedup.js +93 -0
  38. package/dist/crawl/dedup.js.map +1 -0
  39. package/dist/crawl/mapper.d.ts +17 -0
  40. package/dist/crawl/mapper.d.ts.map +1 -0
  41. package/dist/crawl/mapper.js +178 -0
  42. package/dist/crawl/mapper.js.map +1 -0
  43. package/dist/crawl/rate-limiter.d.ts +10 -0
  44. package/dist/crawl/rate-limiter.d.ts.map +1 -0
  45. package/dist/crawl/rate-limiter.js +72 -0
  46. package/dist/crawl/rate-limiter.js.map +1 -0
  47. package/dist/crawl/robots.d.ts +9 -0
  48. package/dist/crawl/robots.d.ts.map +1 -0
  49. package/dist/crawl/robots.js +63 -0
  50. package/dist/crawl/robots.js.map +1 -0
  51. package/dist/crawl/sitemap.d.ts +4 -0
  52. package/dist/crawl/sitemap.d.ts.map +1 -0
  53. package/dist/crawl/sitemap.js +38 -0
  54. package/dist/crawl/sitemap.js.map +1 -0
  55. package/dist/crawl/url-utils.d.ts +3 -0
  56. package/dist/crawl/url-utils.d.ts.map +1 -0
  57. package/dist/crawl/url-utils.js +41 -0
  58. package/dist/crawl/url-utils.js.map +1 -0
  59. package/dist/extraction/defuddle.d.ts +3 -0
  60. package/dist/extraction/defuddle.d.ts.map +1 -0
  61. package/dist/extraction/defuddle.js +26 -0
  62. package/dist/extraction/defuddle.js.map +1 -0
  63. package/dist/extraction/extract.d.ts +5 -0
  64. package/dist/extraction/extract.d.ts.map +1 -0
  65. package/dist/extraction/extract.js +83 -0
  66. package/dist/extraction/extract.js.map +1 -0
  67. package/dist/extraction/jsonld.d.ts +4 -0
  68. package/dist/extraction/jsonld.d.ts.map +1 -0
  69. package/dist/extraction/jsonld.js +64 -0
  70. package/dist/extraction/jsonld.js.map +1 -0
  71. package/dist/extraction/markdown.d.ts +10 -0
  72. package/dist/extraction/markdown.d.ts.map +1 -0
  73. package/dist/extraction/markdown.js +107 -0
  74. package/dist/extraction/markdown.js.map +1 -0
  75. package/dist/extraction/pipeline.d.ts +11 -0
  76. package/dist/extraction/pipeline.d.ts.map +1 -0
  77. package/dist/extraction/pipeline.js +95 -0
  78. package/dist/extraction/pipeline.js.map +1 -0
  79. package/dist/extraction/readability.d.ts +3 -0
  80. package/dist/extraction/readability.d.ts.map +1 -0
  81. package/dist/extraction/readability.js +32 -0
  82. package/dist/extraction/readability.js.map +1 -0
  83. package/dist/extraction/schema.d.ts +7 -0
  84. package/dist/extraction/schema.d.ts.map +1 -0
  85. package/dist/extraction/schema.js +86 -0
  86. package/dist/extraction/schema.js.map +1 -0
  87. package/dist/extraction/site-extractors/docs-generic.d.ts +3 -0
  88. package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -0
  89. package/dist/extraction/site-extractors/docs-generic.js +104 -0
  90. package/dist/extraction/site-extractors/docs-generic.js.map +1 -0
  91. package/dist/extraction/site-extractors/github.d.ts +3 -0
  92. package/dist/extraction/site-extractors/github.d.ts.map +1 -0
  93. package/dist/extraction/site-extractors/github.js +107 -0
  94. package/dist/extraction/site-extractors/github.js.map +1 -0
  95. package/dist/extraction/site-extractors/mdn.d.ts +3 -0
  96. package/dist/extraction/site-extractors/mdn.d.ts.map +1 -0
  97. package/dist/extraction/site-extractors/mdn.js +58 -0
  98. package/dist/extraction/site-extractors/mdn.js.map +1 -0
  99. package/dist/extraction/site-extractors/stackoverflow.d.ts +3 -0
  100. package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -0
  101. package/dist/extraction/site-extractors/stackoverflow.js +88 -0
  102. package/dist/extraction/site-extractors/stackoverflow.js.map +1 -0
  103. package/dist/extraction/trafilatura.d.ts +6 -0
  104. package/dist/extraction/trafilatura.d.ts.map +1 -0
  105. package/dist/extraction/trafilatura.js +105 -0
  106. package/dist/extraction/trafilatura.js.map +1 -0
  107. package/dist/fetch/auth.d.ts +8 -0
  108. package/dist/fetch/auth.d.ts.map +1 -0
  109. package/dist/fetch/auth.js +32 -0
  110. package/dist/fetch/auth.js.map +1 -0
  111. package/dist/fetch/browser-pool.d.ts +28 -0
  112. package/dist/fetch/browser-pool.d.ts.map +1 -0
  113. package/dist/fetch/browser-pool.js +138 -0
  114. package/dist/fetch/browser-pool.js.map +1 -0
  115. package/dist/fetch/content-check.d.ts +2 -0
  116. package/dist/fetch/content-check.d.ts.map +1 -0
  117. package/dist/fetch/content-check.js +62 -0
  118. package/dist/fetch/content-check.js.map +1 -0
  119. package/dist/fetch/http-client.d.ts +15 -0
  120. package/dist/fetch/http-client.d.ts.map +1 -0
  121. package/dist/fetch/http-client.js +146 -0
  122. package/dist/fetch/http-client.js.map +1 -0
  123. package/dist/fetch/router.d.ts +45 -0
  124. package/dist/fetch/router.d.ts.map +1 -0
  125. package/dist/fetch/router.js +89 -0
  126. package/dist/fetch/router.js.map +1 -0
  127. package/dist/index.d.ts +3 -0
  128. package/dist/index.d.ts.map +1 -0
  129. package/dist/index.js +22 -0
  130. package/dist/index.js.map +1 -0
  131. package/dist/logger.d.ts +10 -0
  132. package/dist/logger.d.ts.map +1 -0
  133. package/dist/logger.js +39 -0
  134. package/dist/logger.js.map +1 -0
  135. package/dist/search/dedup.d.ts +10 -0
  136. package/dist/search/dedup.d.ts.map +1 -0
  137. package/dist/search/dedup.js +35 -0
  138. package/dist/search/dedup.js.map +1 -0
  139. package/dist/search/engines/bing.d.ts +7 -0
  140. package/dist/search/engines/bing.d.ts.map +1 -0
  141. package/dist/search/engines/bing.js +48 -0
  142. package/dist/search/engines/bing.js.map +1 -0
  143. package/dist/search/engines/duckduckgo.d.ts +7 -0
  144. package/dist/search/engines/duckduckgo.d.ts.map +1 -0
  145. package/dist/search/engines/duckduckgo.js +50 -0
  146. package/dist/search/engines/duckduckgo.js.map +1 -0
  147. package/dist/search/engines/startpage.d.ts +7 -0
  148. package/dist/search/engines/startpage.d.ts.map +1 -0
  149. package/dist/search/engines/startpage.js +50 -0
  150. package/dist/search/engines/startpage.js.map +1 -0
  151. package/dist/search/filters.d.ts +16 -0
  152. package/dist/search/filters.d.ts.map +1 -0
  153. package/dist/search/filters.js +63 -0
  154. package/dist/search/filters.js.map +1 -0
  155. package/dist/search/flashrank.d.ts +12 -0
  156. package/dist/search/flashrank.d.ts.map +1 -0
  157. package/dist/search/flashrank.js +63 -0
  158. package/dist/search/flashrank.js.map +1 -0
  159. package/dist/search/query.d.ts +2 -0
  160. package/dist/search/query.d.ts.map +1 -0
  161. package/dist/search/query.js +41 -0
  162. package/dist/search/query.js.map +1 -0
  163. package/dist/search/rerank.d.ts +3 -0
  164. package/dist/search/rerank.d.ts.map +1 -0
  165. package/dist/search/rerank.js +40 -0
  166. package/dist/search/rerank.js.map +1 -0
  167. package/dist/search/searxng.d.ts +8 -0
  168. package/dist/search/searxng.d.ts.map +1 -0
  169. package/dist/search/searxng.js +87 -0
  170. package/dist/search/searxng.js.map +1 -0
  171. package/dist/search/validator.d.ts +6 -0
  172. package/dist/search/validator.d.ts.map +1 -0
  173. package/dist/search/validator.js +35 -0
  174. package/dist/search/validator.js.map +1 -0
  175. package/dist/searxng/bootstrap.d.ts +18 -0
  176. package/dist/searxng/bootstrap.d.ts.map +1 -0
  177. package/dist/searxng/bootstrap.js +136 -0
  178. package/dist/searxng/bootstrap.js.map +1 -0
  179. package/dist/searxng/docker.d.ts +9 -0
  180. package/dist/searxng/docker.d.ts.map +1 -0
  181. package/dist/searxng/docker.js +67 -0
  182. package/dist/searxng/docker.js.map +1 -0
  183. package/dist/searxng/process.d.ts +23 -0
  184. package/dist/searxng/process.d.ts.map +1 -0
  185. package/dist/searxng/process.js +188 -0
  186. package/dist/searxng/process.js.map +1 -0
  187. package/dist/server.d.ts +2 -0
  188. package/dist/server.d.ts.map +1 -0
  189. package/dist/server.js +311 -0
  190. package/dist/server.js.map +1 -0
  191. package/dist/tools/cache.d.ts +3 -0
  192. package/dist/tools/cache.d.ts.map +1 -0
  193. package/dist/tools/cache.js +50 -0
  194. package/dist/tools/cache.js.map +1 -0
  195. package/dist/tools/crawl.d.ts +6 -0
  196. package/dist/tools/crawl.d.ts.map +1 -0
  197. package/dist/tools/crawl.js +97 -0
  198. package/dist/tools/crawl.js.map +1 -0
  199. package/dist/tools/extract.d.ts +4 -0
  200. package/dist/tools/extract.d.ts.map +1 -0
  201. package/dist/tools/extract.js +69 -0
  202. package/dist/tools/extract.js.map +1 -0
  203. package/dist/tools/fetch.d.ts +4 -0
  204. package/dist/tools/fetch.d.ts.map +1 -0
  205. package/dist/tools/fetch.js +76 -0
  206. package/dist/tools/fetch.js.map +1 -0
  207. package/dist/tools/search.d.ts +4 -0
  208. package/dist/tools/search.d.ts.map +1 -0
  209. package/dist/tools/search.js +160 -0
  210. package/dist/tools/search.js.map +1 -0
  211. package/dist/types.d.ts +222 -0
  212. package/dist/types.d.ts.map +1 -0
  213. package/dist/types.js +2 -0
  214. package/dist/types.js.map +1 -0
  215. package/package.json +61 -0
@@ -0,0 +1,11 @@
1
+ import type { ExtractionResult, Extractor } from '../types.js';
2
+ export interface ExtractionOptions { // Optional settings accepted by extractContent().
3
+ maxChars?: number; // When truthy, the final markdown is cut to at most this many characters.
4
+ section?: string; // When set, the markdown is replaced by the content extractSection() returns for this value.
5
+ sectionIndex?: number; // Passed to extractSection() together with `section`; 0 is used when omitted.
6
+ contentType?: string; // The exact value 'application/pdf' selects the PDF branch instead of the HTML extractors.
7
+ pdfBuffer?: Buffer; // PDF bytes handed to pdf-parse in the PDF branch; without it the PDF markdown is empty.
8
+ }
9
+ /** Appends `extractor` to the list of site-specific extractors that extractContent() consults. */ export declare function registerExtractor(extractor: Extractor): void;
10
+ /** Converts fetched content into an ExtractionResult; `options` selects the PDF branch and the section/length post-processing. */ export declare function extractContent(html: string, url: string, options?: ExtractionOptions): Promise<ExtractionResult>;
11
+ //# sourceMappingURL=pipeline.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/extraction/pipeline.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAU/D,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AASD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,SAAS,GAAG,IAAI,CAE5D;AAED,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,gBAAgB,CAAC,CAmE3B"}
@@ -0,0 +1,95 @@
1
+ import { defuddleExtract } from './defuddle.js';
2
+ import { readabilityExtract } from './readability.js';
3
+ import { trafilaturaExtract, isTrafilaturaAvailable } from './trafilatura.js';
4
+ import { htmlToMarkdown, extractSection, extractLinksAndImages } from './markdown.js';
5
+ import { githubExtractor } from './site-extractors/github.js';
6
+ import { stackoverflowExtractor } from './site-extractors/stackoverflow.js';
7
+ import { mdnExtractor } from './site-extractors/mdn.js';
8
+ import { docsGenericExtractor } from './site-extractors/docs-generic.js';
9
+ import { createLogger } from '../logger.js';
10
+ import { getConfig } from '../config.js';
11
+ const log = createLogger('extract'); // Logger created with the 'extract' label; used below for pdf-parse failures and Trafilatura successes.
12
+ const siteExtractors = [ // Site-specific extractors; extractContent() uses the first entry whose canHandle(url, html) is truthy.
13
+ githubExtractor,
14
+ stackoverflowExtractor,
15
+ mdnExtractor,
16
+ docsGenericExtractor,
17
+ ]; // registerExtractor() pushes further entries after these four.
18
/**
 * Registers an additional site-specific extractor.
 *
 * The extractor is placed at the end of the module-level `siteExtractors`
 * list, i.e. after every extractor that is already registered.
 *
 * @param extractor Extractor object to add to the list.
 */
export function registerExtractor(extractor) {
    siteExtractors[siteExtractors.length] = extractor;
}
21
+ /** Builds an ExtractionResult by trying, in order: PDF parsing, a site-specific extractor, Defuddle, Trafilatura (config permitting), Readability, and finally a plain htmlToMarkdown() conversion. Every path returns through applyPostProcessing(). */ export async function extractContent(html, url, options = {}) {
22
+ let result = null;
23
+ if (options.contentType === 'application/pdf') { // PDF branch: `html` is not read here, only options.pdfBuffer.
24
+ let pdfText = '';
25
+ if (options.pdfBuffer) {
26
+ try {
27
+ const pdfParse = (await import('pdf-parse')).default; // Loaded on demand, only when a PDF buffer was supplied.
28
+ const parsed = await pdfParse(options.pdfBuffer);
29
+ pdfText = parsed.text ?? '';
30
+ }
31
+ catch (err) {
32
+ log.warn('pdf-parse failed', { url, error: String(err) }); // Failure is logged; the result then carries empty markdown.
33
+ }
34
+ }
35
+ result = {
36
+ title: '',
37
+ markdown: pdfText,
38
+ metadata: {},
39
+ links: [],
40
+ images: [],
41
+ extractor: 'turndown', // NOTE(review): PDF text is labelled 'turndown' although no Turndown conversion happens in this branch; confirm this is intended.
42
+ };
43
+ return applyPostProcessing(result, options);
44
+ }
45
+ const siteExtractor = siteExtractors.find((e) => e.canHandle(url, html)); // Only the first extractor that claims the page is tried.
46
+ if (siteExtractor) {
47
+ const extracted = siteExtractor.extract(html, url);
48
+ if (extracted) { // A falsy result from the site extractor falls through to the generic chain below.
49
+ result = extracted;
50
+ return applyPostProcessing(result, options);
51
+ }
52
+ }
53
+ result = await defuddleExtract(html, url);
54
+ if (!result) { // Trafilatura is attempted only when Defuddle produced nothing.
55
+ const config = getConfig();
56
+ if (config.trafilatura !== 'never') {
57
+ const trafAvailable = await isTrafilaturaAvailable();
58
+ if (trafAvailable) {
59
+ result = await trafilaturaExtract(html, url);
60
+ if (result) {
61
+ log.info('Trafilatura extraction succeeded', { url, chars: result.markdown.length });
62
+ return applyPostProcessing(result, options);
63
+ }
64
+ }
65
+ }
66
+ }
67
+ if (!result) { // Next fallback: Readability.
68
+ result = readabilityExtract(html, url);
69
+ }
70
+ if (!result) { // Last resort: convert the whole HTML document to markdown, with empty title and metadata.
71
+ const markdown = htmlToMarkdown(html);
72
+ result = {
73
+ title: '',
74
+ markdown,
75
+ metadata: {},
76
+ links: [],
77
+ images: [],
78
+ extractor: 'turndown',
79
+ };
80
+ }
81
+ return applyPostProcessing(result, options);
82
+ }
83
/**
 * Applies the caller-requested post-processing to an extraction result.
 *
 * Steps, in order:
 *  1. If `options.section` is set, the markdown is replaced by the content
 *     returned from extractSection() (using `options.sectionIndex`, default 0).
 *  2. Links and images are re-derived from the markdown via
 *     extractLinksAndImages(). This runs before truncation, so they reflect
 *     the full (section-filtered) text.
 *  3. If `options.maxChars` is truthy and the markdown is longer, it is cut to
 *     at most `maxChars` UTF-16 code units.
 *
 * @param result  Extraction result to post-process; it is not mutated.
 * @param options Extraction options (`section`, `sectionIndex`, `maxChars`).
 * @returns A copy of `result` with `markdown`, `links` and `images` replaced.
 */
function applyPostProcessing(result, options) {
    let markdown = result.markdown;
    if (options.section) {
        const { content } = extractSection(markdown, options.section, options.sectionIndex ?? 0);
        markdown = content;
    }
    const { links, images } = extractLinksAndImages(markdown);
    if (options.maxChars && markdown.length > options.maxChars) {
        let end = options.maxChars;
        // Never cut between the two halves of a surrogate pair: that would leave
        // a lone high surrogate (an invalid character) at the end of the output.
        // In that case keep one code unit less instead.
        const lastKept = markdown.charCodeAt(end - 1);
        const firstDropped = markdown.charCodeAt(end);
        const splitsPair = lastKept >= 0xd800 && lastKept <= 0xdbff &&
            firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
        if (splitsPair) {
            end -= 1;
        }
        markdown = markdown.slice(0, end);
    }
    return { ...result, markdown, links, images };
}
95
+ //# sourceMappingURL=pipeline.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/extraction/pipeline.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AAC9E,OAAO,EAAE,cAAc,EAAE,cAAc,EAAE,qBAAqB,EAAE,MAAM,eAAe,CAAC;AAEtF,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAC9D,OAAO,EAAE,sBAAsB,EAAE,MAAM,oCAAoC,CAAC;AAC5E,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AACxD,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AACzE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEzC,MAAM,GAAG,GAAG,YAAY,CAAC,SAAS,CAAC,CAAC;AAUpC,MAAM,cAAc,GAAgB;IAClC,eAAe;IACf,sBAAsB;IACtB,YAAY;IACZ,oBAAoB;CACrB,CAAC;AAEF,MAAM,UAAU,iBAAiB,CAAC,SAAoB;IACpD,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;AACjC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,IAAY,EACZ,GAAW,EACX,UAA6B,EAAE;IAE/B,IAAI,MAAM,GAA4B,IAAI,CAAC;IAE3C,IAAI,OAAO,CAAC,WAAW,KAAK,iBAAiB,EAAE,CAAC;QAC9C,IAAI,OAAO,GAAG,EAAE,CAAC;QACjB,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;YACtB,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,CAAC,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,OAAO,CAAC;gBACrD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;gBACjD,OAAO,GAAG,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;YAC9B,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,GAAG,CAAC,IAAI,CAAC,kBAAkB,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC5D,CAAC;QACH,CAAC;QACD,MAAM,GAAG;YACP,KAAK,EAAE,EAAE;YACT,QAAQ,EAAE,OAAO;YACjB,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,UAAU;SACtB,CAAC;QACF,OAAO,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC9C,CAAC;IAED,MAAM,aAAa,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC;IACzE,IAAI,aAAa,EAAE,CAAC;QAClB,MAAM,SAAS,GAAG,aAAa,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QACnD,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,GAAG,SAAS,CAAC;YACnB,OAAO,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,MAAM,GAAG,MAAM,eAAe,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IAE1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,MAAM,GAAG,SA
AS,EAAE,CAAC;QAC3B,IAAI,MAAM,CAAC,WAAW,KAAK,OAAO,EAAE,CAAC;YACnC,MAAM,aAAa,GAAG,MAAM,sBAAsB,EAAE,CAAC;YACrD,IAAI,aAAa,EAAE,CAAC;gBAClB,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;gBAC7C,IAAI,MAAM,EAAE,CAAC;oBACX,GAAG,CAAC,IAAI,CAAC,kCAAkC,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;oBACrF,OAAO,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;gBAC9C,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,GAAG,kBAAkB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IACzC,CAAC;IAED,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,QAAQ,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;QACtC,MAAM,GAAG;YACP,KAAK,EAAE,EAAE;YACT,QAAQ;YACR,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,UAAU;SACtB,CAAC;IACJ,CAAC;IAED,OAAO,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;AAC9C,CAAC;AAED,SAAS,mBAAmB,CAC1B,MAAwB,EACxB,OAA0B;IAE1B,IAAI,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;IAE/B,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;QACpB,MAAM,EAAE,OAAO,EAAE,GAAG,cAAc,CAAC,QAAQ,EAAE,OAAO,CAAC,OAAO,EAAE,OAAO,CAAC,YAAY,IAAI,CAAC,CAAC,CAAC;QACzF,QAAQ,GAAG,OAAO,CAAC;IACrB,CAAC;IAED,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAE1D,IAAI,OAAO,CAAC,QAAQ,IAAI,QAAQ,CAAC,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;QAC3D,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;IACjD,CAAC;IAED,OAAO,EAAE,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC;AAChD,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { ExtractionResult } from '../types.js';
2
+ /** Runs Mozilla Readability over `html`; the implementation returns null when no article content is found, when the resulting markdown is shorter than 100 characters, or when anything throws. */ export declare function readabilityExtract(html: string, url: string): ExtractionResult | null;
3
+ //# sourceMappingURL=readability.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"readability.d.ts","sourceRoot":"","sources":["../../src/extraction/readability.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAIpD,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI,CA0BrF"}
@@ -0,0 +1,32 @@
1
+ import { Readability } from '@mozilla/readability';
2
+ import { parseHTML } from 'linkedom';
3
+ import TurndownService from 'turndown';
4
/** Markdown shorter than this many characters is rejected as "no real content". */
const MIN_CONTENT_THRESHOLD = 100;
/**
 * Extracts the main article of a page with Mozilla Readability and converts it
 * to markdown with Turndown (ATX headings, fenced code blocks).
 *
 * @param html Raw HTML of the page.
 * @param url  Page URL; accepted for signature parity, not read by this function.
 * @returns The extraction result, or null when Readability finds no article
 *          content, the markdown is shorter than MIN_CONTENT_THRESHOLD, or any
 *          step throws (failures are deliberately swallowed so the caller can
 *          fall back to another extractor).
 */
export function readabilityExtract(html, url) {
    try {
        const dom = parseHTML(html);
        const article = new Readability(dom.document).parse();
        const articleHtml = article ? article.content : null;
        if (!articleHtml) {
            return null;
        }
        const converter = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
        const markdown = converter.turndown(articleHtml);
        if (markdown.length < MIN_CONTENT_THRESHOLD) {
            return null;
        }
        // Empty byline/lang strings are normalised to undefined.
        const author = article.byline || undefined;
        const language = article.lang || undefined;
        return {
            title: article.title ?? '',
            markdown,
            metadata: { author, language },
            links: [],
            images: [],
            extractor: 'readability',
        };
    }
    catch {
        return null;
    }
}
32
+ //# sourceMappingURL=readability.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"readability.js","sourceRoot":"","sources":["../../src/extraction/readability.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAElC,MAAM,UAAU,kBAAkB,CAAC,IAAY,EAAE,GAAW;IAC1D,IAAI,CAAC;QACH,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QACrC,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,QAAe,CAAC,CAAC;QAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAC/B,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE9C,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;QACxF,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAEpD,IAAI,QAAQ,CAAC,MAAM,GAAG,qBAAqB;YAAE,OAAO,IAAI,CAAC;QAEzD,OAAO;YACL,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,EAAE;YAC1B,QAAQ;YACR,QAAQ,EAAE;gBACR,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,SAAS;gBACnC,QAAQ,EAAE,OAAO,CAAC,IAAI,IAAI,SAAS;aACpC;YACD,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,aAAa;SACzB,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
@@ -0,0 +1,7 @@
1
+ export interface JsonSchema { // Minimal JSON-Schema-like shape accepted by extractWithSchema().
2
+ type?: string; // 'array' makes the DOM heuristics collect a list for the field; any other value yields a single value.
3
+ properties?: Record<string, JsonSchema>; // Field name -> field schema; extractWithSchema() returns {} when this is missing.
4
+ items?: JsonSchema; // Not read by the DOM heuristics in schema.js; whether jsonld.js uses it is not visible here.
5
+ }
6
+ /** Fills the fields named in `schema.properties` from JSON-LD first, then from DOM heuristics for fields JSON-LD left undefined. */ export declare function extractWithSchema(html: string, schema: JsonSchema): Record<string, unknown>;
7
+ //# sourceMappingURL=schema.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/extraction/schema.ts"],"names":[],"mappings":"AAGA,MAAM,WAAW,UAAU;IACzB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;IACxC,KAAK,CAAC,EAAE,UAAU,CAAC;CACpB;AAED,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,UAAU,GACjB,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAmBzB"}
@@ -0,0 +1,86 @@
1
+ import { parseHTML } from 'linkedom';
2
+ import { extractJsonLd, matchJsonLdToSchema } from './jsonld.js';
3
/**
 * Extracts the fields described by `schema.properties` from an HTML document.
 *
 * JSON-LD data (extractJsonLd + matchJsonLdToSchema) takes precedence. For
 * every schema field that JSON-LD left undefined, the DOM heuristics in
 * findFieldValue() are tried. Fields that neither source can fill are omitted.
 *
 * @param html   Raw HTML of the page.
 * @param schema Schema whose `properties` name the fields to extract.
 * @returns Object with the extracted fields; `{}` when `html` is empty or the
 *          schema has no `properties`.
 */
export function extractWithSchema(html, schema) {
    const properties = schema.properties;
    if (!html || !properties) {
        return {};
    }
    const jsonLdResult = matchJsonLdToSchema(extractJsonLd(html), schema);
    const doc = parseHTML(html).document;
    const heuristicResult = {};
    Object.entries(properties).forEach(([fieldName, fieldSchema]) => {
        // JSON-LD already supplied this field: no heuristic lookup needed.
        if (jsonLdResult[fieldName] === undefined) {
            const value = findFieldValue(doc, fieldName, fieldSchema);
            if (value !== undefined) {
                heuristicResult[fieldName] = value;
            }
        }
    });
    return { ...jsonLdResult, ...heuristicResult };
}
20
/**
 * Looks up one schema field in the DOM using name-based heuristics.
 *
 * Three spellings of the field name are tried, in this order: the name as
 * given, a lower-cased form with `_` replaced by `-`, and a lower-cased form
 * with `_` removed. Identical spellings are collapsed (e.g. `title` yields a
 * single variant) so the same DOM queries are not repeated; this does not
 * change the result, because repeated identical queries return the same thing.
 *
 * @param doc       Parsed document to search.
 * @param fieldName Field name from the schema.
 * @param schema    Field schema; `type === 'array'` selects list extraction.
 * @returns The value found (string, or string[] for array fields), or
 *          undefined when no heuristic matches.
 */
function findFieldValue(doc, fieldName, schema) {
    const normalizedName = fieldName.toLowerCase().replace(/_/g, '-');
    const compactName = fieldName.replace(/_/g, '').toLowerCase();
    // A Set keeps first-seen order, so the lookup priority is unchanged.
    const variants = [...new Set([fieldName, normalizedName, compactName])];
    return schema.type === 'array'
        ? findArrayValues(doc, variants)
        : findSingleValue(doc, variants);
}
29
+ function cssEscape(value) {
30
+ return value.replace(/([^\w-])/g, '\\$1');
31
+ }
32
+ function findSingleValue(doc, variants) {
33
+ for (const name of variants) {
34
+ const byItemprop = doc.querySelector(`[itemprop="${name}"]`);
35
+ if (byItemprop) {
36
+ const text = byItemprop.getAttribute('content') ?? byItemprop.textContent?.trim();
37
+ if (text)
38
+ return text;
39
+ }
40
+ // Substring match is intentional — heuristic best-effort for partial class names
41
+ const byClass = doc.querySelector(`[class*="${name}"]`);
42
+ if (byClass) {
43
+ const text = byClass.textContent?.trim();
44
+ if (text)
45
+ return text;
46
+ }
47
+ const allWithAria = doc.querySelectorAll('[aria-label]');
48
+ for (const el of allWithAria) {
49
+ const label = el.getAttribute('aria-label')?.toLowerCase().replace(/\s+/g, '-') ?? '';
50
+ if (label === name.toLowerCase()) {
51
+ const text = el.textContent?.trim();
52
+ if (text)
53
+ return text;
54
+ }
55
+ }
56
+ const byId = doc.querySelector(`#${cssEscape(name)}`);
57
+ if (byId) {
58
+ const text = byId.textContent?.trim();
59
+ if (text)
60
+ return text;
61
+ }
62
+ const byData = doc.querySelector(`[data-${name}]`);
63
+ if (byData) {
64
+ return byData.getAttribute(`data-${name}`) ?? byData.textContent?.trim() ?? undefined;
65
+ }
66
+ }
67
+ return undefined;
68
+ }
69
+ function findArrayValues(doc, variants) {
70
+ for (const name of variants) {
71
+ const container = doc.querySelector(`[class*="${name}"]`);
72
+ if (container) {
73
+ const items = container.querySelectorAll('li, [class*="item"]');
74
+ if (items.length > 0) {
75
+ return Array.from(items).map((el) => (el.textContent ?? '').trim()).filter(Boolean);
76
+ }
77
+ }
78
+ const singular = name.replace(/s$/, '');
79
+ const elements = doc.querySelectorAll(`[class*="${singular}"]`);
80
+ if (elements.length > 1) {
81
+ return Array.from(elements).map((el) => (el.textContent ?? '').trim()).filter(Boolean);
82
+ }
83
+ }
84
+ return undefined;
85
+ }
86
+ //# sourceMappingURL=schema.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"schema.js","sourceRoot":"","sources":["../../src/extraction/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,aAAa,CAAC;AAQjE,MAAM,UAAU,iBAAiB,CAC/B,IAAY,EACZ,MAAkB;IAElB,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,UAAU;QAAE,OAAO,EAAE,CAAC;IAE3C,MAAM,YAAY,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;IACzC,MAAM,YAAY,GAAG,mBAAmB,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC;IAE/D,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,eAAe,GAA4B,EAAE,CAAC;IAEpD,KAAK,MAAM,CAAC,SAAS,EAAE,WAAW,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;QACzE,IAAI,YAAY,CAAC,SAAS,CAAC,KAAK,SAAS;YAAE,SAAS;QAEpD,MAAM,KAAK,GAAG,cAAc,CAAC,GAAG,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;QAC1D,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACxB,eAAe,CAAC,SAAS,CAAC,GAAG,KAAK,CAAC;QACrC,CAAC;IACH,CAAC;IAED,OAAO,EAAE,GAAG,YAAY,EAAE,GAAG,eAAe,EAAE,CAAC;AACjD,CAAC;AAED,SAAS,cAAc,CACrB,GAAa,EACb,SAAiB,EACjB,MAAkB;IAElB,MAAM,cAAc,GAAG,SAAS,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IAClE,MAAM,WAAW,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IAC9D,MAAM,QAAQ,GAAG,CAAC,SAAS,EAAE,cAAc,EAAE,WAAW,CAAC,CAAC;IAE1D,IAAI,MAAM,CAAC,IAAI,KAAK,OAAO,EAAE,CAAC;QAC5B,OAAO,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IACxC,CAAC;IAED,OAAO,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;AACxC,CAAC;AAED,SAAS,SAAS,CAAC,KAAa;IAC9B,OAAO,KAAK,CAAC,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;AAC5C,CAAC;AAED,SAAS,eAAe,CAAC,GAAa,EAAE,QAAkB;IACxD,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC5B,MAAM,UAAU,GAAG,GAAG,CAAC,aAAa,CAAC,cAAc,IAAI,IAAI,CAAC,CAAC;QAC7D,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,UAAU,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,UAAU,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YAClF,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;QAED,iFAAiF;QACjF,MAAM,OAAO,GAAG,GAAG,CAAC,aAAa,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;QACxD,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,IAAI,GAAG,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YACzC,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;QAED,MAAM,WAAW,GAAG,GAAG,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;QACzD,KAAK
,MAAM,EAAE,IAAI,WAAW,EAAE,CAAC;YAC7B,MAAM,KAAK,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,CAAC,EAAE,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC;YACtF,IAAI,KAAK,KAAK,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;gBACjC,MAAM,IAAI,GAAG,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;gBACpC,IAAI,IAAI;oBAAE,OAAO,IAAI,CAAC;YACxB,CAAC;QACH,CAAC;QAED,MAAM,IAAI,GAAG,GAAG,CAAC,aAAa,CAAC,IAAI,SAAS,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACtD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YACtC,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;QAED,MAAM,MAAM,GAAG,GAAG,CAAC,aAAa,CAAC,SAAS,IAAI,GAAG,CAAC,CAAC;QACnD,IAAI,MAAM,EAAE,CAAC;YACX,OAAO,MAAM,CAAC,YAAY,CAAC,QAAQ,IAAI,EAAE,CAAC,IAAI,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,SAAS,CAAC;QACxF,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,eAAe,CAAC,GAAa,EAAE,QAAkB;IACxD,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC5B,MAAM,SAAS,GAAG,GAAG,CAAC,aAAa,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;QAC1D,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,KAAK,GAAG,SAAS,CAAC,gBAAgB,CAAC,qBAAqB,CAAC,CAAC;YAChE,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACrB,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;YACtF,CAAC;QACH,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACxC,MAAM,QAAQ,GAAG,GAAG,CAAC,gBAAgB,CAAC,YAAY,QAAQ,IAAI,CAAC,CAAC;QAChE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxB,OAAO,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACzF,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { Extractor } from '../../types.js';
2
+ /** Extractor for documentation sites; its implementation claims a page when detectFramework() recognises Docusaurus, MkDocs, Sphinx or GitBook markers in the HTML. */ export declare const docsGenericExtractor: Extractor;
3
+ //# sourceMappingURL=docs-generic.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"docs-generic.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/docs-generic.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;AAqFlE,eAAO,MAAM,oBAAoB,EAAE,SAqClC,CAAC"}
@@ -0,0 +1,104 @@
1
+ import { parseHTML } from 'linkedom';
2
+ import TurndownService from 'turndown';
3
+ const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' }); // Shared HTML-to-markdown converter (ATX headings, fenced code blocks) used by buildResult().
4
+ const STRIP_SELECTORS = [ // Selectors for elements that buildResult() removes from the content root before converting it to markdown.
5
+ 'nav',
6
+ '.docs-sidebar',
7
+ '.sidebar',
8
+ '.toc-wrapper',
9
+ '.table-of-contents',
10
+ '.version-picker',
11
+ '.pagination-nav',
12
+ 'header',
13
+ 'footer',
14
+ ];
15
/**
 * Guesses which documentation framework produced a page by looking for marker
 * substrings in the raw HTML. Rules are checked in order and the first match
 * wins: Docusaurus, then MkDocs, then Sphinx, then GitBook.
 *
 * @param html Raw HTML of the page.
 * @returns 'docusaurus' | 'mkdocs' | 'sphinx' | 'gitbook', or null when no
 *          marker is present.
 */
function detectFramework(html) {
    const has = (marker) => html.includes(marker);
    // Ordered rule table: [framework, predicate]. Order is significant.
    const rules = [
        ['docusaurus', () => has('docs-sidebar') || has('data-docusaurus-page')],
        ['mkdocs', () => has('md-content')],
        ['sphinx', () => has('class="document"') || has("class='document'") ||
                (has('class="body"') && has('highlight'))],
        ['gitbook', () => has('page-body')],
    ];
    for (const [framework, matches] of rules) {
        if (matches()) {
            return framework;
        }
    }
    return null;
}
31
+ function extractWithSelectors(document, contentSelectors) {
32
+ for (const selector of contentSelectors) {
33
+ const el = document.querySelector(selector);
34
+ if (el)
35
+ return el;
36
+ }
37
+ return null;
38
+ }
39
+ function stripElements(root, selectors) {
40
+ for (const selector of selectors) {
41
+ for (const el of Array.from(root.querySelectorAll(selector))) {
42
+ el.parentNode?.removeChild(el);
43
+ }
44
+ }
45
+ }
46
/**
 * Turns the chosen content element of a documentation page into an extraction
 * result.
 *
 * The content element is first cleaned with STRIP_SELECTORS (it is mutated).
 * The title is taken from the first `h1` inside the content, else the first
 * `h1` in the document, else the `<title>` element. If the title contains `|`,
 * only the part before the first `|` is kept.
 *
 * @param document  Parsed document the content element belongs to.
 * @param contentEl Root element of the page's main content.
 * @returns The extraction result, or null when no title can be determined or
 *          the markdown is empty.
 */
function buildResult(document, contentEl) {
    stripElements(contentEl, STRIP_SELECTORS);
    let titleEl = contentEl.querySelector('h1');
    if (titleEl === null || titleEl === undefined) {
        titleEl = document.querySelector('h1');
    }
    if (titleEl === null || titleEl === undefined) {
        titleEl = document.querySelector('title');
    }
    const rawTitle = titleEl?.textContent?.trim() ?? '';
    let title = rawTitle;
    const barAt = rawTitle.indexOf('|');
    if (barAt !== -1) {
        // Drop everything from the first "|" onward.
        title = rawTitle.slice(0, barAt).trim();
    }
    if (!title) {
        return null;
    }
    const markdown = turndown.turndown(contentEl.innerHTML).trim();
    if (!markdown) {
        return null;
    }
    return {
        title,
        markdown,
        metadata: {},
        links: [],
        images: [],
        extractor: 'site-specific',
    };
}
69
/**
 * Extractor for documentation sites generated by Docusaurus, MkDocs, Sphinx
 * or GitBook, as recognised by detectFramework().
 */
export const docsGenericExtractor = {
    name: 'docs-generic',
    /**
     * Claims a page when its HTML is non-empty and a framework is detected.
     * The URL is not consulted.
     */
    canHandle(_url, html) {
        return html ? detectFramework(html) !== null : false;
    },
    /**
     * Locates the framework-specific content container and converts it via
     * buildResult().
     *
     * @returns The extraction result, or null when the HTML is empty, no
     *          framework is detected, no content container is found, or
     *          buildResult() rejects the content.
     */
    extract(html, _url) {
        const framework = html ? detectFramework(html) : null;
        if (!framework) {
            return null;
        }
        const { document } = parseHTML(html);
        // Content-container selectors per framework, highest priority first.
        const selectorsByFramework = {
            docusaurus: ['.markdown', 'article', 'main'],
            mkdocs: ['.md-content'],
            sphinx: ['.document', '.body'],
            gitbook: ['.page-body'],
        };
        const contentEl = extractWithSelectors(document, selectorsByFramework[framework]);
        return contentEl ? buildResult(document, contentEl) : null;
    },
};
104
+ //# sourceMappingURL=docs-generic.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"docs-generic.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/docs-generic.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;AAIxF,MAAM,eAAe,GAAG;IACtB,KAAK;IACL,eAAe;IACf,UAAU;IACV,cAAc;IACd,oBAAoB;IACpB,iBAAiB;IACjB,iBAAiB;IACjB,QAAQ;IACR,QAAQ;CACT,CAAC;AAEF,SAAS,eAAe,CAAC,IAAY;IACnC,IAAI,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,sBAAsB,CAAC,EAAE,CAAC;QAC3E,OAAO,YAAY,CAAC;IACtB,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;QAChC,OAAO,QAAQ,CAAC;IAClB,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,kBAAkB,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,kBAAkB,CAAC;QACtE,CAAC,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,EAAE,CAAC;QAClE,OAAO,QAAQ,CAAC;IAClB,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;QAC/B,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,oBAAoB,CAC3B,QAAkB,EAClB,gBAA0B;IAE1B,KAAK,MAAM,QAAQ,IAAI,gBAAgB,EAAE,CAAC;QACxC,MAAM,EAAE,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;QAC5C,IAAI,EAAE;YAAE,OAAO,EAAa,CAAC;IAC/B,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,aAAa,CAAC,IAAa,EAAE,SAAmB;IACvD,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,KAAK,MAAM,EAAE,IAAI,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;YAC7D,EAAE,CAAC,UAAU,EAAE,WAAW,CAAC,EAAE,CAAC,CAAC;QACjC,CAAC;IACH,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAClB,QAAkB,EAClB,SAAkB;IAElB,aAAa,CAAC,SAAS,EAAE,eAAe,CAAC,CAAC;IAE1C,MAAM,OAAO,GACX,SAAS,CAAC,aAAa,CAAC,IAAI,CAAC;QAC7B,QAAQ,CAAC,aAAa,CAAC,IAAI,CAAC;QAC5B,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAElC,MAAM,QAAQ,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IACpD,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC;QAClC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE;QAChC,CAAC,CAAC,QAAQ,CAAC;IAEb,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC
;IAC/D,IAAI,CAAC,QAAQ;QAAE,OAAO,IAAI,CAAC;IAE3B,OAAO;QACL,KAAK;QACL,QAAQ;QACR,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,SAAS,EAAE,eAAe;KAC3B,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,MAAM,oBAAoB,GAAc;IAC7C,IAAI,EAAE,cAAc;IAEpB,SAAS,CAAC,IAAY,EAAE,IAAa;QACnC,IAAI,CAAC,IAAI;YAAE,OAAO,KAAK,CAAC;QACxB,OAAO,eAAe,CAAC,IAAI,CAAC,KAAK,IAAI,CAAC;IACxC,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,IAAY;QAChC,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QAEvB,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;QACxC,IAAI,CAAC,SAAS;YAAE,OAAO,IAAI,CAAC;QAE5B,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAErC,IAAI,gBAA0B,CAAC;QAC/B,QAAQ,SAAS,EAAE,CAAC;YAClB,KAAK,YAAY;gBACf,gBAAgB,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;gBACpD,MAAM;YACR,KAAK,QAAQ;gBACX,gBAAgB,GAAG,CAAC,aAAa,CAAC,CAAC;gBACnC,MAAM;YACR,KAAK,QAAQ;gBACX,gBAAgB,GAAG,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;gBAC1C,MAAM;YACR,KAAK,SAAS;gBACZ,gBAAgB,GAAG,CAAC,YAAY,CAAC,CAAC;gBAClC,MAAM;QACV,CAAC;QAED,MAAM,SAAS,GAAG,oBAAoB,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;QACnE,IAAI,CAAC,SAAS;YAAE,OAAO,IAAI,CAAC;QAE5B,OAAO,WAAW,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IAC1C,CAAC;CACF,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { Extractor } from '../../types.js';
2
+ export declare const githubExtractor: Extractor;
3
+ //# sourceMappingURL=github.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"github.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/github.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;AAoGlE,eAAO,MAAM,eAAe,EAAE,SA2B7B,CAAC"}
@@ -0,0 +1,107 @@
1
+ import { parseHTML } from 'linkedom';
2
+ import TurndownService from 'turndown';
3
// Module-wide HTML -> Markdown converter shared by every GitHub page handler below.
const turndown = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
});
4
/**
 * Reports whether `url` addresses a single GitHub issue or pull request.
 *
 * Only the path component is inspected and the match is anchored to
 * `/<owner>/<repo>/issues/<n>` or `/<owner>/<repo>/pull/<n>`. Testing the whole
 * URL string unanchored also accepted file paths such as
 * `/o/r/blob/main/issues/12/notes.md` and query strings that merely contain
 * `/issues/3`.
 *
 * @param {string} url Page URL; a bare path is accepted as well.
 * @returns {boolean} `true` for issue / pull-request pages, `false` otherwise.
 */
function isIssueOrPR(url) {
    let pathname;
    try {
        ({ pathname } = new URL(url));
    }
    catch {
        // Not an absolute URL: treat the input as a path and drop any query/fragment.
        pathname = String(url).split(/[?#]/)[0];
    }
    return /^\/[^/]+\/[^/]+\/(?:issues|pull)\/\d+(?:\/|$)/.test(pathname);
}
7
/**
 * Reports whether `url` addresses a file ("blob") view of a repository.
 *
 * The check is anchored to the `/<owner>/<repo>/blob/...` path shape. Matching
 * `/blob/` anywhere in the raw URL also accepted query strings, fragments and
 * directories that merely happen to be named `blob`.
 *
 * @param {string} url Page URL; a bare path is accepted as well.
 * @returns {boolean} `true` for blob pages, `false` otherwise.
 */
function isBlob(url) {
    let pathname;
    try {
        ({ pathname } = new URL(url));
    }
    catch {
        // Not an absolute URL: treat the input as a path and drop any query/fragment.
        pathname = String(url).split(/[?#]/)[0];
    }
    return /^\/[^/]+\/[^/]+\/blob\//.test(pathname);
}
10
/**
 * Renders an issue / pull-request page as Markdown: an optional `**Labels:**`
 * line followed by every non-empty comment body, separated by `---` rules.
 *
 * Compared with the previous revision:
 *  - the separator is inserted *between* emitted comments, so an empty first
 *    comment no longer leaves a leading `---`;
 *  - a page whose comment bodies all convert to empty Markdown yields `null`,
 *    matching the other extractors in this module which return `null` rather
 *    than a result with no content.
 *
 * @param document Parsed DOM document of the page.
 * @param {string} url Page URL. Currently unused; kept so existing call sites stay valid.
 * @returns The extraction result, or `null` when the title element, the comment
 *   bodies, or any convertible comment content is missing.
 */
function extractIssue(document, url) {
    const titleEl = document.querySelector('.js-issue-title') ?? document.querySelector('.gh-header-title');
    if (!titleEl)
        return null;
    const title = titleEl.textContent?.trim() ?? '';

    const commentBodies = Array.from(document.querySelectorAll('.d-block.comment-body'));
    if (commentBodies.length === 0)
        return null;

    // Convert first, then drop empties, so separators depend only on what is actually emitted.
    const comments = commentBodies
        .map((body) => turndown.turndown(body.innerHTML).trim())
        .filter(Boolean);
    if (comments.length === 0)
        return null;

    const labels = Array.from(document.querySelectorAll('.IssueLabel'))
        .map((el) => el.textContent?.trim() ?? '')
        .filter(Boolean);

    const sections = [];
    if (labels.length > 0) {
        sections.push(`**Labels:** ${labels.join(', ')}\n`);
    }
    sections.push(comments.join('\n\n---\n\n'));

    return {
        title,
        markdown: sections.join('\n\n'),
        metadata: {},
        links: [],
        images: [],
        extractor: 'site-specific',
    };
}
43
/**
 * Renders the README (or first `.markdown-body`) of a page as Markdown.
 *
 * The title is the part of `<title>` before the first `:`. `split` always
 * returns at least one element, so the former `?? rawTitle` fallback could never
 * run; `||` is used instead so that a title whose leading segment is empty
 * (for example one starting with `:`) falls back to the full title text.
 *
 * @param document Parsed DOM document of the page.
 * @returns The extraction result, or `null` when no README container is found
 *   or it converts to empty Markdown.
 */
function extractReadme(document) {
    const rawTitle = document.querySelector('title')?.textContent?.trim() ?? '';
    const title = rawTitle.split(':')[0].trim() || rawTitle;

    // Prefer the dedicated README container; fall back to any rendered Markdown body.
    const readmeBody = document.querySelector('#readme .markdown-body') ??
        document.querySelector('.markdown-body');
    if (!readmeBody)
        return null;

    const markdown = turndown.turndown(readmeBody.innerHTML).trim();
    if (!markdown)
        return null;

    return {
        title,
        markdown,
        metadata: {},
        links: [],
        images: [],
        extractor: 'site-specific',
    };
}
63
/**
 * Renders a file ("blob") page as Markdown.
 *
 * The page `<title>` text is used verbatim (trimmed) as the title. The content
 * comes from the first container found among `.blob-code-content`,
 * `.highlight` and `.markdown-body`, tried in that order.
 *
 * @param document Parsed DOM document of the page.
 * @returns The extraction result, or `null` when no container matches or the
 *   container converts to empty Markdown.
 */
function extractBlob(document) {
    const pageTitle = document.querySelector('title')?.textContent?.trim() ?? '';

    let container = null;
    for (const selector of ['.blob-code-content', '.highlight', '.markdown-body']) {
        container = document.querySelector(selector);
        if (container != null)
            break;
    }
    if (!container)
        return null;

    const converted = turndown.turndown(container.innerHTML).trim();
    if (converted) {
        return {
            title: pageTitle,
            markdown: converted,
            metadata: {},
            links: [],
            images: [],
            extractor: 'site-specific',
        };
    }
    return null;
}
83
/**
 * Site-specific extractor for GitHub pages.
 *
 * `canHandle` accepts `github.com` and any of its subdomains. `extract` parses
 * the HTML and dispatches on the URL: issue / pull-request pages first, then
 * file ("blob") pages, and everything else through the README handler.
 */
export const githubExtractor = {
    name: 'github',
    canHandle(url) {
        let host;
        try {
            host = new URL(url).hostname;
        }
        catch {
            // Unparseable URLs are simply not ours.
            return false;
        }
        return host === 'github.com' || host.endsWith('.github.com');
    },
    extract(html, url) {
        if (!html) {
            return null;
        }
        const { document } = parseHTML(html);
        if (isIssueOrPR(url)) {
            return extractIssue(document, url);
        }
        return isBlob(url) ? extractBlob(document) : extractReadme(document);
    },
};
107
+ //# sourceMappingURL=github.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"github.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/github.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;AAExF,SAAS,WAAW,CAAC,GAAW;IAC9B,OAAO,2BAA2B,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC/C,CAAC;AAED,SAAS,MAAM,CAAC,GAAW;IACzB,OAAO,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC9B,CAAC;AAED,SAAS,YAAY,CAAC,QAAkB,EAAE,GAAW;IACnD,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,iBAAiB,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,kBAAkB,CAAC,CAAC;IACxG,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAC;IAE1B,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAEhD,MAAM,QAAQ,GAAG,QAAQ,CAAC,gBAAgB,CAAC,aAAa,CAAC,CAAC;IAC1D,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC;SAChC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;SACzC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEnB,MAAM,aAAa,GAAG,QAAQ,CAAC,gBAAgB,CAAC,uBAAuB,CAAC,CAAC;IACzE,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAE5C,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,QAAQ,CAAC,IAAI,CAAC,eAAe,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACtD,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE;QAC5C,MAAM,IAAI,GAAI,IAAgB,CAAC,SAAS,CAAC;QACzC,MAAM,EAAE,GAAG,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QAC1C,IAAI,EAAE,EAAE,CAAC;YACP,QAAQ,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAEvC,OAAO;QACL,KAAK;QACL,QAAQ;QACR,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,SAAS,EAAE,eAAe;KAC3B,CAAC;AACJ,CAAC;AAED,SAAS,aAAa,CAAC,QAAkB;IACvC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAChD,MAAM,QAAQ,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IACpD,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,QAAQ,CAAC;IAEzD,MAA
M,UAAU,GACd,QAAQ,CAAC,aAAa,CAAC,wBAAwB,CAAC;QAChD,QAAQ,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC;IAE3C,IAAI,CAAC,UAAU;QAAE,OAAO,IAAI,CAAC;IAE7B,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAE,UAAsB,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;IAC7E,IAAI,CAAC,QAAQ;QAAE,OAAO,IAAI,CAAC;IAE3B,OAAO;QACL,KAAK;QACL,QAAQ;QACR,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,SAAS,EAAE,eAAe;KAC3B,CAAC;AACJ,CAAC;AAED,SAAS,WAAW,CAAC,QAAkB;IACrC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAChD,MAAM,KAAK,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAEjD,MAAM,SAAS,GACb,QAAQ,CAAC,aAAa,CAAC,oBAAoB,CAAC;QAC5C,QAAQ,CAAC,aAAa,CAAC,YAAY,CAAC;QACpC,QAAQ,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC;IAE3C,IAAI,CAAC,SAAS;QAAE,OAAO,IAAI,CAAC;IAE5B,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAE,SAAqB,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;IAC5E,IAAI,CAAC,QAAQ;QAAE,OAAO,IAAI,CAAC;IAE3B,OAAO;QACL,KAAK;QACL,QAAQ;QACR,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,SAAS,EAAE,eAAe;KAC3B,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,MAAM,eAAe,GAAc;IACxC,IAAI,EAAE,QAAQ;IAEd,SAAS,CAAC,GAAW;QACnB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACvC,OAAO,QAAQ,KAAK,YAAY,IAAI,QAAQ,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC;QACvE,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,GAAW;QAC/B,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QAEvB,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAErC,IAAI,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC;YACrB,OAAO,YAAY,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QACrC,CAAC;QAED,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC;YAChB,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC;QAC/B,CAAC;QAED,OAAO,aAAa,CAAC,QAAQ,CAAC,CAAC;IACjC,CAAC;CACF,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { Extractor } from '../../types.js';
2
+ export declare const mdnExtractor: Extractor;
3
+ //# sourceMappingURL=mdn.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"mdn.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/mdn.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;AAalE,eAAO,MAAM,YAAY,EAAE,SAqD1B,CAAC"}
@@ -0,0 +1,58 @@
1
+ import { parseHTML } from 'linkedom';
2
+ import TurndownService from 'turndown';
3
// Module-wide HTML -> Markdown converter used by the MDN extractor below.
const turndown = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
});
4
// CSS selectors whose matches are removed from the article before it is converted to Markdown.
const STRIP_SELECTORS = ['nav', '.sidebar', 'header', 'footer', '.bc-head', '.metadata'];
12
/**
 * Site-specific extractor for `developer.mozilla.org` pages.
 *
 * `extract` locates the main article, resolves the page title, removes every
 * element matching `STRIP_SELECTORS` from the article and converts what is
 * left to Markdown.
 *
 * The title is resolved *before* stripping. `STRIP_SELECTORS` contains
 * `header`, so when the article's `<h1>` sits inside a `<header>` element the
 * old order removed the heading first and then fell back to the document
 * `<title>`, producing a title that still carried the site's suffix segments.
 */
export const mdnExtractor = {
    name: 'mdn',
    /**
     * @param {string} url Candidate page URL.
     * @returns {boolean} `true` only for host `developer.mozilla.org`; `false`
     *   for any other host or an unparseable URL.
     */
    canHandle(url) {
        try {
            return new URL(url).hostname === 'developer.mozilla.org';
        }
        catch {
            return false;
        }
    },
    /**
     * @param {string} html Raw page HTML.
     * @param {string} url Page URL (not consulted by this extractor).
     * @returns The extraction result, or `null` when the HTML is empty, no
     *   article container is found, no title can be determined, or the article
     *   converts to empty Markdown.
     */
    extract(html, url) {
        if (!html)
            return null;
        const { document } = parseHTML(html);
        const article = document.querySelector('article.main-page-content') ??
            document.querySelector('.section-content') ??
            document.querySelector('article');
        if (!article)
            return null;

        // Read the heading while it is still attached; stripping below may remove it.
        const titleEl = article.querySelector('h1') ??
            document.querySelector('title');
        const rawTitle = titleEl?.textContent?.trim() ?? '';
        // Keep only the segment before the first `|`.
        const title = rawTitle.includes('|')
            ? rawTitle.split('|')[0].trim()
            : rawTitle;
        if (!title)
            return null;

        for (const selector of STRIP_SELECTORS) {
            for (const el of Array.from(article.querySelectorAll(selector))) {
                el.parentNode?.removeChild(el);
            }
        }

        const markdown = turndown.turndown(article.innerHTML).trim();
        if (!markdown)
            return null;
        return {
            title,
            markdown,
            metadata: {},
            links: [],
            images: [],
            extractor: 'site-specific',
        };
    },
};
58
+ //# sourceMappingURL=mdn.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"mdn.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/mdn.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;AAExF,MAAM,eAAe,GAAG;IACtB,KAAK;IACL,UAAU;IACV,QAAQ;IACR,QAAQ;IACR,UAAU;IACV,WAAW;CACZ,CAAC;AAEF,MAAM,CAAC,MAAM,YAAY,GAAc;IACrC,IAAI,EAAE,KAAK;IAEX,SAAS,CAAC,GAAW;QACnB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACvC,OAAO,QAAQ,KAAK,uBAAuB,CAAC;QAC9C,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,GAAW;QAC/B,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QAEvB,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAErC,MAAM,OAAO,GACX,QAAQ,CAAC,aAAa,CAAC,2BAA2B,CAAC;YACnD,QAAQ,CAAC,aAAa,CAAC,kBAAkB,CAAC;YAC1C,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;QAEpC,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE1B,KAAK,MAAM,QAAQ,IAAI,eAAe,EAAE,CAAC;YACvC,KAAK,MAAM,EAAE,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;gBAChE,EAAE,CAAC,UAAU,EAAE,WAAW,CAAC,EAAE,CAAC,CAAC;YACjC,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GACX,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC;YAC3B,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;QAElC,MAAM,QAAQ,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACpD,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC;YAClC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE;YAChC,CAAC,CAAC,QAAQ,CAAC;QAEb,IAAI,CAAC,KAAK;YAAE,OAAO,IAAI,CAAC;QAExB,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAE,OAAmB,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;QAC1E,IAAI,CAAC,QAAQ;YAAE,OAAO,IAAI,CAAC;QAE3B,OAAO;YACL,KAAK;YACL,QAAQ;YACR,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,eAAe;SAC3B,CAAC;IACJ,CAAC;CACF,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { Extractor } from '../../types.js';
2
+ export declare const stackoverflowExtractor: Extractor;
3
+ //# sourceMappingURL=stackoverflow.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"stackoverflow.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/stackoverflow.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;AAgElE,eAAO,MAAM,sBAAsB,EAAE,SAkDpC,CAAC"}