@staticn0va/wigolo 0.6.6 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cache/store.d.ts +9 -1
- package/dist/cache/store.d.ts.map +1 -1
- package/dist/cache/store.js +30 -4
- package/dist/cache/store.js.map +1 -1
- package/dist/cli/doctor.d.ts.map +1 -1
- package/dist/cli/doctor.js +56 -2
- package/dist/cli/doctor.js.map +1 -1
- package/dist/cli/status.js +1 -1
- package/dist/cli/status.js.map +1 -1
- package/dist/cli/tui/hooks/useInstall.js +1 -1
- package/dist/cli/tui/hooks/useInstall.js.map +1 -1
- package/dist/cli/tui/hooks/useVerify.js +1 -1
- package/dist/cli/tui/hooks/useVerify.js.map +1 -1
- package/dist/cli/tui/status-format.d.ts +1 -1
- package/dist/cli/tui/status-format.d.ts.map +1 -1
- package/dist/cli/tui/status-format.js +1 -1
- package/dist/cli/tui/status-format.js.map +1 -1
- package/dist/cli/tui/status-python.d.ts +1 -1
- package/dist/cli/tui/status-python.d.ts.map +1 -1
- package/dist/cli/tui/status-python.js +17 -1
- package/dist/cli/tui/status-python.js.map +1 -1
- package/dist/cli/tui/verify-suggestions.d.ts +1 -1
- package/dist/cli/tui/verify-suggestions.d.ts.map +1 -1
- package/dist/cli/tui/verify-suggestions.js +3 -3
- package/dist/cli/tui/verify-suggestions.js.map +1 -1
- package/dist/cli/tui/verify.d.ts +2 -2
- package/dist/cli/tui/verify.d.ts.map +1 -1
- package/dist/cli/tui/verify.js +32 -6
- package/dist/cli/tui/verify.js.map +1 -1
- package/dist/cli/warmup.d.ts.map +1 -1
- package/dist/cli/warmup.js +16 -12
- package/dist/cli/warmup.js.map +1 -1
- package/dist/config.d.ts +6 -1
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +15 -2
- package/dist/config.js.map +1 -1
- package/dist/crawl/dedup.d.ts +1 -0
- package/dist/crawl/dedup.d.ts.map +1 -1
- package/dist/crawl/dedup.js +47 -1
- package/dist/crawl/dedup.js.map +1 -1
- package/dist/extraction/boilerplate.d.ts +15 -0
- package/dist/extraction/boilerplate.d.ts.map +1 -0
- package/dist/extraction/boilerplate.js +49 -0
- package/dist/extraction/boilerplate.js.map +1 -0
- package/dist/extraction/defuddle.d.ts.map +1 -1
- package/dist/extraction/defuddle.js +7 -3
- package/dist/extraction/defuddle.js.map +1 -1
- package/dist/extraction/jsonld.js +1 -1
- package/dist/extraction/jsonld.js.map +1 -1
- package/dist/extraction/lang-hints.d.ts +2 -0
- package/dist/extraction/lang-hints.d.ts.map +1 -0
- package/dist/extraction/lang-hints.js +28 -0
- package/dist/extraction/lang-hints.js.map +1 -0
- package/dist/extraction/llm/anthropic.d.ts +3 -0
- package/dist/extraction/llm/anthropic.d.ts.map +1 -0
- package/dist/extraction/llm/anthropic.js +33 -0
- package/dist/extraction/llm/anthropic.js.map +1 -0
- package/dist/extraction/llm/cache.d.ts +5 -0
- package/dist/extraction/llm/cache.d.ts.map +1 -0
- package/dist/extraction/llm/cache.js +35 -0
- package/dist/extraction/llm/cache.js.map +1 -0
- package/dist/extraction/llm/gemini.d.ts +3 -0
- package/dist/extraction/llm/gemini.d.ts.map +1 -0
- package/dist/extraction/llm/gemini.js +35 -0
- package/dist/extraction/llm/gemini.js.map +1 -0
- package/dist/extraction/llm/groq.d.ts +3 -0
- package/dist/extraction/llm/groq.d.ts.map +1 -0
- package/dist/extraction/llm/groq.js +63 -0
- package/dist/extraction/llm/groq.js.map +1 -0
- package/dist/extraction/llm/hash.d.ts +3 -0
- package/dist/extraction/llm/hash.d.ts.map +1 -0
- package/dist/extraction/llm/hash.js +22 -0
- package/dist/extraction/llm/hash.js.map +1 -0
- package/dist/extraction/llm/openai.d.ts +3 -0
- package/dist/extraction/llm/openai.d.ts.map +1 -0
- package/dist/extraction/llm/openai.js +38 -0
- package/dist/extraction/llm/openai.js.map +1 -0
- package/dist/extraction/llm/select.d.ts +5 -0
- package/dist/extraction/llm/select.d.ts.map +1 -0
- package/dist/extraction/llm/select.js +27 -0
- package/dist/extraction/llm/select.js.map +1 -0
- package/dist/extraction/llm/types.d.ts +24 -0
- package/dist/extraction/llm/types.d.ts.map +1 -0
- package/dist/extraction/llm/types.js +2 -0
- package/dist/extraction/llm/types.js.map +1 -0
- package/dist/extraction/llm/validate.d.ts +6 -0
- package/dist/extraction/llm/validate.d.ts.map +1 -0
- package/dist/extraction/llm/validate.js +63 -0
- package/dist/extraction/llm/validate.js.map +1 -0
- package/dist/extraction/llm-fallback.d.ts +17 -0
- package/dist/extraction/llm-fallback.d.ts.map +1 -0
- package/dist/extraction/llm-fallback.js +129 -0
- package/dist/extraction/llm-fallback.js.map +1 -0
- package/dist/extraction/markdown.d.ts +9 -0
- package/dist/extraction/markdown.d.ts.map +1 -1
- package/dist/extraction/markdown.js +52 -3
- package/dist/extraction/markdown.js.map +1 -1
- package/dist/extraction/pipeline.d.ts.map +1 -1
- package/dist/extraction/pipeline.js +17 -5
- package/dist/extraction/pipeline.js.map +1 -1
- package/dist/extraction/readability.d.ts.map +1 -1
- package/dist/extraction/readability.js +2 -3
- package/dist/extraction/readability.js.map +1 -1
- package/dist/extraction/schema.d.ts +12 -0
- package/dist/extraction/schema.d.ts.map +1 -1
- package/dist/extraction/schema.js +81 -11
- package/dist/extraction/schema.js.map +1 -1
- package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -1
- package/dist/extraction/site-extractors/docs-generic.js +2 -3
- package/dist/extraction/site-extractors/docs-generic.js.map +1 -1
- package/dist/extraction/site-extractors/github.d.ts.map +1 -1
- package/dist/extraction/site-extractors/github.js +4 -5
- package/dist/extraction/site-extractors/github.js.map +1 -1
- package/dist/extraction/site-extractors/mdn.d.ts.map +1 -1
- package/dist/extraction/site-extractors/mdn.js +2 -3
- package/dist/extraction/site-extractors/mdn.js.map +1 -1
- package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -1
- package/dist/extraction/site-extractors/stackoverflow.js +3 -4
- package/dist/extraction/site-extractors/stackoverflow.js.map +1 -1
- package/dist/extraction/structured-data.d.ts +4 -0
- package/dist/extraction/structured-data.d.ts.map +1 -0
- package/dist/extraction/structured-data.js +203 -0
- package/dist/extraction/structured-data.js.map +1 -0
- package/dist/fetch/router.d.ts +2 -1
- package/dist/fetch/router.d.ts.map +1 -1
- package/dist/fetch/router.js +19 -1
- package/dist/fetch/router.js.map +1 -1
- package/dist/instructions.d.ts +7 -7
- package/dist/instructions.d.ts.map +1 -1
- package/dist/instructions.js +43 -36
- package/dist/instructions.js.map +1 -1
- package/dist/logger.d.ts +1 -1
- package/dist/logger.d.ts.map +1 -1
- package/dist/research/brief.js +1 -1
- package/dist/research/brief.js.map +1 -1
- package/dist/search/evidence.d.ts +25 -0
- package/dist/search/evidence.d.ts.map +1 -0
- package/dist/search/evidence.js +260 -0
- package/dist/search/evidence.js.map +1 -0
- package/dist/search/highlights.d.ts +11 -2
- package/dist/search/highlights.d.ts.map +1 -1
- package/dist/search/highlights.js +131 -48
- package/dist/search/highlights.js.map +1 -1
- package/dist/search/multi-query.d.ts +1 -0
- package/dist/search/multi-query.d.ts.map +1 -1
- package/dist/search/multi-query.js +13 -0
- package/dist/search/multi-query.js.map +1 -1
- package/dist/search/rerank.d.ts +3 -2
- package/dist/search/rerank.d.ts.map +1 -1
- package/dist/search/rerank.js +16 -44
- package/dist/search/rerank.js.map +1 -1
- package/dist/search/reranker/download.d.ts +9 -0
- package/dist/search/reranker/download.d.ts.map +1 -0
- package/dist/search/reranker/download.js +77 -0
- package/dist/search/reranker/download.js.map +1 -0
- package/dist/search/reranker/models.d.ts +14 -0
- package/dist/search/reranker/models.d.ts.map +1 -0
- package/dist/search/reranker/models.js +37 -0
- package/dist/search/reranker/models.js.map +1 -0
- package/dist/search/reranker/onnx.d.ts +13 -0
- package/dist/search/reranker/onnx.d.ts.map +1 -0
- package/dist/search/reranker/onnx.js +70 -0
- package/dist/search/reranker/onnx.js.map +1 -0
- package/dist/search/reranker/recency-boost.d.ts +3 -0
- package/dist/search/reranker/recency-boost.d.ts.map +1 -0
- package/dist/search/reranker/recency-boost.js +12 -0
- package/dist/search/reranker/recency-boost.js.map +1 -0
- package/dist/search/reranker/recency.d.ts +3 -0
- package/dist/search/reranker/recency.d.ts.map +1 -0
- package/dist/search/reranker/recency.js +26 -0
- package/dist/search/reranker/recency.js.map +1 -0
- package/dist/search/reranker/tokenizer.d.ts +30 -0
- package/dist/search/reranker/tokenizer.d.ts.map +1 -0
- package/dist/search/reranker/tokenizer.js +49 -0
- package/dist/search/reranker/tokenizer.js.map +1 -0
- package/dist/search/tokens.d.ts +3 -0
- package/dist/search/tokens.d.ts.map +1 -0
- package/dist/search/tokens.js +38 -0
- package/dist/search/tokens.js.map +1 -0
- package/dist/search/truncate.d.ts +4 -0
- package/dist/search/truncate.d.ts.map +1 -1
- package/dist/search/truncate.js +13 -0
- package/dist/search/truncate.js.map +1 -1
- package/dist/server/tool-schemas.d.ts +503 -0
- package/dist/server/tool-schemas.d.ts.map +1 -0
- package/dist/server/tool-schemas.js +425 -0
- package/dist/server/tool-schemas.js.map +1 -0
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +1 -326
- package/dist/server.js.map +1 -1
- package/dist/tools/agent.d.ts.map +1 -1
- package/dist/tools/agent.js +36 -0
- package/dist/tools/agent.js.map +1 -1
- package/dist/tools/crawl.d.ts.map +1 -1
- package/dist/tools/crawl.js +37 -2
- package/dist/tools/crawl.js.map +1 -1
- package/dist/tools/extract.d.ts.map +1 -1
- package/dist/tools/extract.js +19 -3
- package/dist/tools/extract.js.map +1 -1
- package/dist/tools/fetch.d.ts.map +1 -1
- package/dist/tools/fetch.js +44 -7
- package/dist/tools/fetch.js.map +1 -1
- package/dist/tools/find-similar.d.ts.map +1 -1
- package/dist/tools/find-similar.js +32 -1
- package/dist/tools/find-similar.js.map +1 -1
- package/dist/tools/research.d.ts.map +1 -1
- package/dist/tools/research.js +34 -1
- package/dist/tools/research.js.map +1 -1
- package/dist/tools/search.d.ts.map +1 -1
- package/dist/tools/search.js +97 -53
- package/dist/tools/search.js.map +1 -1
- package/dist/types.d.ts +65 -1
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +1 -1
- package/dist/types.js.map +1 -1
- package/dist/util/mode.d.ts +4 -0
- package/dist/util/mode.d.ts.map +1 -0
- package/dist/util/mode.js +13 -0
- package/dist/util/mode.js.map +1 -0
- package/package.json +9 -1
- package/dist/search/flashrank.d.ts +0 -12
- package/dist/search/flashrank.d.ts.map +0 -1
- package/dist/search/flashrank.js +0 -64
- package/dist/search/flashrank.js.map +0 -1
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"markdown.js","sourceRoot":"","sources":["../../src/extraction/markdown.ts"],"names":[],"mappings":"AAAA,OAAO,eAAe,MAAM,UAAU,CAAC;
|
|
1
|
+
{"version":3,"file":"markdown.js","sourceRoot":"","sources":["../../src/extraction/markdown.ts"],"names":[],"mappings":"AAAA,OAAO,eAAe,MAAM,UAAU,CAAC;AACvC,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AAErD,SAAS,kBAAkB,CAAC,CAAS;IACnC,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,IAAI,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC;YAC3B,GAAG,EAAE,CAAC;YACN,IAAI,GAAG,GAAG,GAAG;gBAAE,GAAG,GAAG,GAAG,CAAC;QAC3B,CAAC;aAAM,CAAC;YACN,GAAG,GAAG,CAAC,CAAC;QACV,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,UAAU,aAAa;IAC3B,MAAM,EAAE,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;IAElF,wCAAwC;IACxC,EAAE,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;IAE/B,iDAAiD;IACjD,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE;QAClB,MAAM,EAAE,OAAO;QACf,WAAW,CAAC,QAAQ,EAAE,IAAI;YACxB,MAAM,EAAE,GAAG,IAAe,CAAC;YAC3B,MAAM,IAAI,GAAc,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;YAC9D,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;gBAAE,OAAO,EAAE,CAAC;YAEjC,MAAM,SAAS,GAAG,CAAC,GAAY,EAAU,EAAE;gBACzC,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,CAAC;gBACzD,OAAO,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC;YACnG,CAAC,CAAC;YAEF,MAAM,SAAS,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YAC1B,MAAM,WAAW,GAAG,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YAChE,MAAM,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,CAAC;YACrE,MAAM,SAAS,GAAG,IAAI,GAAG,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC;YAEzE,IAAI,WAAW,EAAE,CAAC;gBAChB,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBAC/B,MAAM,KAAK,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,SAAS,EAAE,GAAG,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC;gBAC5E,OAAO,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;YAC5C,CAAC;YAED,MAAM,KAAK,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,SAAS,EAAE,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC;YACjF,OAAO,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;QAC5C,CAAC;KACF,CAAC,CAAC;IAEH,qFAAqF;IACrF,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE;QACtB,MAAM,EAAE,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC;QACrD,WAAW,CAAC,OAAO;YACjB,OAAO,OAAO,CAAC;QACjB,CAAC;KACF,CAAC,CAAC;IAEH,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE;QAC1B,MAAM,CAAC,IAAI;YACT,OAAO,IAAI,CAAC,QAAQ,KAAK,KAAK,IAAK,IAAgB,CAAC,aAAa,CAAC,MAAM,CAAC,KAAK,IAAI,CAAC;QACrF,CAAC;QACD,WAAW,CAAC,QAAQ,EAAE,IAAI;YACxB,MAAM,GAAG,GAAG,IAAe,CAAC;YAC5B,MAAM,IAAI,GAAG,GAAG,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YACvC,MAAM,GAAG,GAAG,IAAI,EAAE,YAAY,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;YAC3E,MAAM,IAAI,GAAG,kBAAkB,CAAC,GAAG,CAAC,CAAC;YACrC,MAAM,IAAI,GAAG,IAAI,EAAE,WAAW,IAAI,GAAG,CAAC,WAAW,IAAI,EAAE,CAAC;YACxD,MAAM,KAAK,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,kBAAkB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YACpE,OAAO,OAAO,KAAK,GAAG,IAAI,IAAI,EAAE,KAAK,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,KAAK,KAAK,MAAM,CAAC;QAChF,CAAC;KACF,CAAC,CAAC;IAEH,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,QAAQ,GAAG,aAAa,EAAE,CAAC;AAEjC,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAC;IACrB,OAAO,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC;AAQD,MAAM,UAAU,aAAa,CAAC,KAAe;IAC3C,MAAM,QAAQ,GAAc,EAAE,CAAC;IAC/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;QACjD,IAAI,KAAK,EAAE,CAAC;YACV,QAAQ,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC;QACjF,CAAC;IACH,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,+DAA+D;AAC/D,+CAA+C;AAC/C,MAAM,UAAU,oBAAoB,CAAC,KAAe;IAClD,MAAM,OAAO,GAAG,IAAI,KAAK,CAAS,KAAK,CAAC,MAAM,CAAC,CAAC;IAChD,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,OAAO,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC;QACjB,GAAG,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,4BAA4B;IAC1D,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,kBAAkB,CAAC,KAAe,EAAE,QAAmB,EAAE,UAAkB;IAClF,MAAM,OAAO,GAAG,QAAQ,CAAC,UAAU,CAAC,CAAC;IACrC,MAAM,KAAK,GAAG,OAAO,CAAC,SAAS,CAAC;IAEhC,0EAA0E;IAC1E,IAAI,GAAG,GAAG,KAAK,CAAC,MAAM,CAAC;IACvB,KAAK,IAAI,CAAC,GAAG,UAAU,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtD,IAAI,QAAQ,CAAC,CAAC,CAAC,CAAC,KAAK,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YACvC,GAAG,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC5B,MAAM;QACR,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,UAAU,cAAc,CAC5B,QAAgB,EAChB,OAAe,EACf,YAAY,GAAG,CAAC;IAEhB,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,QAAQ,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC;IAEtC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;IAExE,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;IACpC,MAAM,OAAO,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;IAEnD,8BAA8B;IAC9B,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,KAAK,KAAK,CAAC,CAAC;IAE/E,yDAAyD;IACzD,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,YAAY,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC;QAClE,MAAM,EAAE,CAAC,EAAE,GAAG,YAAY,CAAC,YAAY,CAAC,CAAC;QACzC,OAAO,EAAE,OAAO,EAAE,kBAAkB,CAAC,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;IAC5E,CAAC;IAED,4EAA4E;IAC5E,MAAM,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IAEzF,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,IAAI,YAAY,IAAI,gBAAgB,CAAC,MAAM,EAAE,CAAC;QAC7E,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;IAC/C,CAAC;IAED,MAAM,EAAE,CAAC,EAAE,GAAG,gBAAgB,CAAC,YAAY,CAAC,CAAC;IAC7C,OAAO,EAAE,OAAO,EAAE,kBAAkB,CAAC,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;AAC5E,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,QAAgB;IACpD,MAAM,YAAY,GAAG,yBAAyB,CAAC;IAC/C,MAAM,WAAW,GAAG,8BAA8B,CAAC;IAEnD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAU,CAAC;IACjC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;IAEhC,IAAI,KAA6B,CAAC;IAElC,uBAAuB;IACvB,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACtD,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;IAED,4BAA4B;IAC5B,OAAO,CAAC,KAAK,GAAG,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACrD,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACtB,CAAC;IAED,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;AAClE,CAAC;AAED,MAAM,sBAAsB,GAAG;IAC7B,QAAQ;IACR,MAAM;IACN,MAAM;IACN,OAAO;IACP,QAAQ;IACR,UAAU;IACV,OAAO;IACP,QAAQ;IACR,OAAO;IACP,SAAS;CACV,CAAC;AAEF,yEAAyE;AACzE,0EAA0E;AAC1E,wEAAwE;AACxE,MAAM,UAAU,sBAAsB,CAAC,QAAgB;IACrD,IAAI,CAAC,QAAQ;QAAE,OAAO,QAAQ,CAAC;IAC/B,OAAO,QAAQ,CAAC,OAAO,CAAC,2BAA2B,EAAE,CAAC,KAAK,EAAE,GAAW,EAAE,GAAW,EAAE,EAAE;QACvF,MAAM,UAAU,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;QAC9B,MAAM,QAAQ,GAAG,GAAG,CAAC,WAAW,EAAE,CAAC;QAEnC,iDAAiD;QACjD,IAAI,QAAQ,CAAC,UAAU,CAAC,wBAAwB,CAAC;YAAE,OAAO,EAAE,CAAC;QAE7D,oEAAoE;QACpE,IAAI,QAAQ,CAAC,UAAU,CAAC,oBAAoB,CAAC,IAAI,GAAG,CAAC,MAAM,GAAG,GAAG;YAAE,OAAO,EAAE,CAAC;QAE7E,+CAA+C;QAC/C,KAAK,MAAM,MAAM,IAAI,sBAAsB,EAAE,CAAC;YAC5C,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAAE,OAAO,EAAE,CAAC;QAC3C,CAAC;QAED,sCAAsC;QACtC,IAAI,CAAC,UAAU;YAAE,OAAO,EAAE,CAAC;QAE3B,OAAO,KAAK,CAAC;IACf,CAAC,CAAC,CAAC;AACL,CAAC;AAED,8EAA8E;AAC9E,8EAA8E;AAC9E,MAAM,UAAU,mBAAmB,CAAC,QAAgB,EAAE,OAAe;IACnE,IAAI,CAAC,QAAQ,IAAI,CAAC,OAAO;QAAE,OAAO,QAAQ,CAAC;IAE3C,MAAM,OAAO,GAAG,CAAC,IAAY,EAAU,EAAE;QACvC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAC1B,IAAI,8CAA8C,CAAC,IAAI,CAAC,OAAO,CAAC;YAAE,OAAO,IAAI,CAAC;QAC9E,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YAC5B,IAAI,CAAC;gBACH,OAAO,IAAI,GAAG,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;YACxC,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QACD,IAAI,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;YAC7B,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,CAAC;gBAC9B,OAAO,GAAG,IAAI,CAAC,QAAQ,GAAG,OAAO,EAAE,CAAC;YACtC,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QACD,IAAI,CAAC;YACH,OAAO,IAAI,GAAG,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;QACxC,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC,CAAC;IAEF,0EAA0E;IAC1E,IAAI,MAAM,GAAG,QAAQ,CAAC,OAAO,CAC3B,8CAA8C,EAC9C,CAAC,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,KAAK,EAAE,CAC7D,CAAC;IAEF,MAAM,GAAG,MAAM,CAAC,OAAO,CACrB,qDAAqD,EACrD,CAAC,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,GAAG,GAAG,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,KAAK,EAAE,CACxE,CAAC;IAEF,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/extraction/pipeline.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/extraction/pipeline.ts"],"names":[],"mappings":"AAaA,OAAO,KAAK,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAU/D,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AASD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,SAAS,GAAG,IAAI,CAE5D;AAED,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,gBAAgB,CAAC,CA4E3B"}
|
|
@@ -1,8 +1,10 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
1
2
|
import { defuddleExtract } from './defuddle.js';
|
|
2
3
|
import { readabilityExtract } from './readability.js';
|
|
3
4
|
import { trafilaturaExtract, isTrafilaturaAvailable } from './trafilatura.js';
|
|
4
5
|
import { htmlToMarkdown, extractSection, extractLinksAndImages, filterDecorativeImages, resolveRelativeUrls, } from './markdown.js';
|
|
5
6
|
import { extractMetadata } from './extract.js';
|
|
7
|
+
import { stripBoilerplateDom, stripBoilerplateMarkdown } from './boilerplate.js';
|
|
6
8
|
import { githubExtractor } from './site-extractors/github.js';
|
|
7
9
|
import { stackoverflowExtractor } from './site-extractors/stackoverflow.js';
|
|
8
10
|
import { mdnExtractor } from './site-extractors/mdn.js';
|
|
@@ -43,21 +45,30 @@ export async function extractContent(html, url, options = {}) {
|
|
|
43
45
|
};
|
|
44
46
|
return applyPostProcessing(result, url, html, options);
|
|
45
47
|
}
|
|
48
|
+
let cleanedHtml = html;
|
|
49
|
+
try {
|
|
50
|
+
const { document } = parseHTML(html);
|
|
51
|
+
stripBoilerplateDom(document);
|
|
52
|
+
cleanedHtml = document.toString();
|
|
53
|
+
}
|
|
54
|
+
catch (err) {
|
|
55
|
+
log.warn('boilerplate DOM pre-pass failed', { url, error: String(err) });
|
|
56
|
+
}
|
|
46
57
|
const siteExtractor = siteExtractors.find((e) => e.canHandle(url, html));
|
|
47
58
|
if (siteExtractor) {
|
|
48
|
-
const extracted = siteExtractor.extract(
|
|
59
|
+
const extracted = siteExtractor.extract(cleanedHtml, url);
|
|
49
60
|
if (extracted) {
|
|
50
61
|
result = extracted;
|
|
51
62
|
return applyPostProcessing(result, url, html, options);
|
|
52
63
|
}
|
|
53
64
|
}
|
|
54
|
-
result = await defuddleExtract(
|
|
65
|
+
result = await defuddleExtract(cleanedHtml, url);
|
|
55
66
|
if (!result) {
|
|
56
67
|
const config = getConfig();
|
|
57
68
|
if (config.trafilatura !== 'never') {
|
|
58
69
|
const trafAvailable = await isTrafilaturaAvailable();
|
|
59
70
|
if (trafAvailable) {
|
|
60
|
-
result = await trafilaturaExtract(
|
|
71
|
+
result = await trafilaturaExtract(cleanedHtml, url);
|
|
61
72
|
if (result) {
|
|
62
73
|
log.info('Trafilatura extraction succeeded', { url, chars: result.markdown.length });
|
|
63
74
|
return applyPostProcessing(result, url, html, options);
|
|
@@ -66,10 +77,10 @@ export async function extractContent(html, url, options = {}) {
|
|
|
66
77
|
}
|
|
67
78
|
}
|
|
68
79
|
if (!result) {
|
|
69
|
-
result = readabilityExtract(
|
|
80
|
+
result = readabilityExtract(cleanedHtml, url);
|
|
70
81
|
}
|
|
71
82
|
if (!result) {
|
|
72
|
-
const markdown = htmlToMarkdown(
|
|
83
|
+
const markdown = htmlToMarkdown(cleanedHtml);
|
|
73
84
|
result = {
|
|
74
85
|
title: '',
|
|
75
86
|
markdown,
|
|
@@ -105,6 +116,7 @@ function applyPostProcessing(result, url, html, options) {
|
|
|
105
116
|
let markdown = result.markdown;
|
|
106
117
|
// Resolve relative links/images before slicing so downstream consumers get absolute URLs.
|
|
107
118
|
markdown = resolveRelativeUrls(markdown, url);
|
|
119
|
+
markdown = stripBoilerplateMarkdown(markdown);
|
|
108
120
|
markdown = filterDecorativeImages(markdown);
|
|
109
121
|
if (options.section) {
|
|
110
122
|
const { content } = extractSection(markdown, options.section, options.sectionIndex ?? 0);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/extraction/pipeline.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AAC9E,OAAO,EACL,cAAc,EACd,cAAc,EACd,qBAAqB,EACrB,sBAAsB,EACtB,mBAAmB,GACpB,MAAM,eAAe,CAAC;AACvB,OAAO,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;
|
|
1
|
+
{"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/extraction/pipeline.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AAC9E,OAAO,EACL,cAAc,EACd,cAAc,EACd,qBAAqB,EACrB,sBAAsB,EACtB,mBAAmB,GACpB,MAAM,eAAe,CAAC;AACvB,OAAO,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAC/C,OAAO,EAAE,mBAAmB,EAAE,wBAAwB,EAAE,MAAM,kBAAkB,CAAC;AAEjF,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAC9D,OAAO,EAAE,sBAAsB,EAAE,MAAM,oCAAoC,CAAC;AAC5E,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AACxD,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AACzE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEzC,MAAM,GAAG,GAAG,YAAY,CAAC,SAAS,CAAC,CAAC;AAUpC,MAAM,cAAc,GAAgB;IAClC,eAAe;IACf,sBAAsB;IACtB,YAAY;IACZ,oBAAoB;CACrB,CAAC;AAEF,MAAM,UAAU,iBAAiB,CAAC,SAAoB;IACpD,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;AACjC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,IAAY,EACZ,GAAW,EACX,UAA6B,EAAE;IAE/B,IAAI,MAAM,GAA4B,IAAI,CAAC;IAE3C,IAAI,OAAO,CAAC,WAAW,KAAK,iBAAiB,EAAE,CAAC;QAC9C,IAAI,OAAO,GAAG,EAAE,CAAC;QACjB,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;YACtB,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,CAAC,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,OAAO,CAAC;gBACrD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;gBACjD,OAAO,GAAG,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;YAC9B,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,GAAG,CAAC,IAAI,CAAC,kBAAkB,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC5D,CAAC;QACH,CAAC;QACD,MAAM,GAAG;YACP,KAAK,EAAE,EAAE;YACT,QAAQ,EAAE,OAAO;YACjB,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,UAAU;SACtB,CAAC;QACF,OAAO,mBAAmB,CAAC,MAAM,EAAE,GAAG,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;IACzD,CAAC;IAED,IAAI,WAAW,GAAG,IAAI,CAAC;IACvB,IAAI,CAAC;QACH,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QACrC,mBAAmB,CAAC,QAAQ,CAAC,CAAC;QAC9B,WAAW,GAAG,QAAQ,CAAC,QAAQ,EAAE,CAAC;IACpC,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,GAAG,CAAC,IAAI,CAAC,iCAAiC,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAC3E,CAAC;IAED,MAAM,aAAa,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC;IACzE,IAAI,aAAa,EAAE,CAAC;QAClB,MAAM,SAAS,GAAG,aAAa,CAAC,OAAO,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC;QAC1D,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,GAAG,SAAS,CAAC;YACnB,OAAO,mBAAmB,CAAC,MAAM,EAAE,GAAG,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;QACzD,CAAC;IACH,CAAC;IAED,MAAM,GAAG,MAAM,eAAe,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC;IAEjD,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;QAC3B,IAAI,MAAM,CAAC,WAAW,KAAK,OAAO,EAAE,CAAC;YACnC,MAAM,aAAa,GAAG,MAAM,sBAAsB,EAAE,CAAC;YACrD,IAAI,aAAa,EAAE,CAAC;gBAClB,MAAM,GAAG,MAAM,kBAAkB,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC;gBACpD,IAAI,MAAM,EAAE,CAAC;oBACX,GAAG,CAAC,IAAI,CAAC,kCAAkC,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;oBACrF,OAAO,mBAAmB,CAAC,MAAM,EAAE,GAAG,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;gBACzD,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,GAAG,kBAAkB,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC;IAChD,CAAC;IAED,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,QAAQ,GAAG,cAAc,CAAC,WAAW,CAAC,CAAC;QAC7C,MAAM,GAAG;YACP,KAAK,EAAE,EAAE;YACT,QAAQ;YACR,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,UAAU;SACtB,CAAC;IACJ,CAAC;IAED,OAAO,mBAAmB,CAAC,MAAM,EAAE,GAAG,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;AACzD,CAAC;AAED,SAAS,aAAa,CACpB,IAAkC,EAClC,IAAY;IAEZ,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;QACnC,OAAO;YACL,GAAG,IAAI;YACP,oFAAoF;YACpF,WAAW,EAAE,IAAI,CAAC,WAAW,IAAI,IAAI,CAAC,WAAW;YACjD,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM;YAClC,IAAI,EAAE,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI;YAC5B,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,QAAQ,EAAE,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,QAAQ;YACxC,OAAO,EAAE,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO;YACrC,aAAa,EAAE,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,aAAa;YACvD,QAAQ,EAAE,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,QAAQ;SACzC,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAS,mBAAmB,CAC1B,MAAwB,EACxB,GAAW,EACX,IAAY,EACZ,OAA0B;IAE1B,IAAI,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;IAE/B,0FAA0F;IAC1F,QAAQ,GAAG,mBAAmB,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;IAC9C,QAAQ,GAAG,wBAAwB,CAAC,QAAQ,CAAC,CAAC;IAC9C,QAAQ,GAAG,sBAAsB,CAAC,QAAQ,CAAC,CAAC;IAE5C,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;QACpB,MAAM,EAAE,OAAO,EAAE,GAAG,cAAc,CAAC,QAAQ,EAAE,OAAO,CAAC,OAAO,EAAE,OAAO,CAAC,YAAY,IAAI,CAAC,CAAC,CAAC;QACzF,QAAQ,GAAG,OAAO,CAAC;IACrB,CAAC;IAED,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAC1D,MAAM,QAAQ,GAAG,aAAa,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC;IAEtD,IAAI,OAAO,CAAC,QAAQ,IAAI,QAAQ,CAAC,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;QAC3D,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;IACjD,CAAC;IAED,OAAO,EAAE,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAC;AAC1D,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"readability.d.ts","sourceRoot":"","sources":["../../src/extraction/readability.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAIpD,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI,
|
|
1
|
+
{"version":3,"file":"readability.d.ts","sourceRoot":"","sources":["../../src/extraction/readability.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAIpD,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI,CAyBtF"}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { Readability } from '@mozilla/readability';
|
|
2
2
|
import { parseHTML } from 'linkedom';
|
|
3
|
-
import
|
|
3
|
+
import { htmlToMarkdown } from './markdown.js';
|
|
4
4
|
const MIN_CONTENT_THRESHOLD = 100;
|
|
5
5
|
export function readabilityExtract(html, _url) {
|
|
6
6
|
try {
|
|
@@ -9,8 +9,7 @@ export function readabilityExtract(html, _url) {
|
|
|
9
9
|
const article = reader.parse();
|
|
10
10
|
if (!article || !article.content)
|
|
11
11
|
return null;
|
|
12
|
-
const
|
|
13
|
-
const markdown = turndown.turndown(article.content);
|
|
12
|
+
const markdown = htmlToMarkdown(article.content);
|
|
14
13
|
if (markdown.length < MIN_CONTENT_THRESHOLD)
|
|
15
14
|
return null;
|
|
16
15
|
return {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"readability.js","sourceRoot":"","sources":["../../src/extraction/readability.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,
|
|
1
|
+
{"version":3,"file":"readability.js","sourceRoot":"","sources":["../../src/extraction/readability.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAG/C,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAElC,MAAM,UAAU,kBAAkB,CAAC,IAAY,EAAE,IAAY;IAC3D,IAAI,CAAC;QACH,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QACrC,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,QAAe,CAAC,CAAC;QAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAC/B,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE9C,MAAM,QAAQ,GAAG,cAAc,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAEjD,IAAI,QAAQ,CAAC,MAAM,GAAG,qBAAqB;YAAE,OAAO,IAAI,CAAC;QAEzD,OAAO;YACL,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,EAAE;YAC1B,QAAQ;YACR,QAAQ,EAAE;gBACR,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,SAAS;gBACnC,QAAQ,EAAE,OAAO,CAAC,IAAI,IAAI,SAAS;aACpC;YACD,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,aAAa;SACzB,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
|
|
@@ -1,7 +1,19 @@
|
|
|
1
|
+
import { type LLMFallbackBudget } from './llm-fallback.js';
|
|
2
|
+
import type { SchemaExtractionResult } from '../types.js';
|
|
1
3
|
export interface JsonSchema {
|
|
2
4
|
type?: string;
|
|
3
5
|
properties?: Record<string, JsonSchema>;
|
|
4
6
|
items?: JsonSchema;
|
|
7
|
+
required?: string[];
|
|
8
|
+
}
|
|
9
|
+
export interface SchemaExtractionOpts {
|
|
10
|
+
signal?: AbortSignal;
|
|
11
|
+
budget?: LLMFallbackBudget;
|
|
5
12
|
}
|
|
6
13
|
export declare function extractWithSchema(html: string, schema: JsonSchema): Record<string, unknown>;
|
|
14
|
+
export declare function extractWithSchemaDetailed(html: string, schema: JsonSchema): SchemaExtractionResult;
|
|
15
|
+
export interface SchemaExtractionAsyncResult extends SchemaExtractionResult {
|
|
16
|
+
warnings: string[];
|
|
17
|
+
}
|
|
18
|
+
export declare function extractWithSchemaDetailedAsync(html: string, schema: JsonSchema, opts?: SchemaExtractionOpts): Promise<SchemaExtractionAsyncResult>;
|
|
7
19
|
//# sourceMappingURL=schema.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/extraction/schema.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/extraction/schema.ts"],"names":[],"mappings":"AAEA,OAAO,EAAkB,KAAK,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AAC3E,OAAO,KAAK,EAEV,sBAAsB,EAEvB,MAAM,aAAa,CAAC;AAErB,MAAM,WAAW,UAAU;IACzB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;IACxC,KAAK,CAAC,EAAE,UAAU,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;CACrB;AAED,MAAM,WAAW,oBAAoB;IACnC,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,MAAM,CAAC,EAAE,iBAAiB,CAAC;CAC5B;AAQD,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,UAAU,GACjB,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAEzB;AAED,wBAAgB,yBAAyB,CACvC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,UAAU,GACjB,sBAAsB,CAsCxB;AAED,MAAM,WAAW,2BAA4B,SAAQ,sBAAsB;IACzE,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,wBAAsB,8BAA8B,CAClD,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,UAAU,EAClB,IAAI,GAAE,oBAAyB,GAC9B,OAAO,CAAC,2BAA2B,CAAC,CA+BtC"}
|
|
@@ -1,22 +1,93 @@
|
|
|
1
1
|
import { parseHTML } from 'linkedom';
|
|
2
|
-
import {
|
|
2
|
+
import { extractStructuredData } from './structured-data.js';
|
|
3
|
+
import { extractWithLLM } from './llm-fallback.js';
|
|
4
|
+
const PROVENANCE_PRIORITY = [
|
|
5
|
+
'json-ld',
|
|
6
|
+
'microdata',
|
|
7
|
+
'rdfa',
|
|
8
|
+
];
|
|
3
9
|
export function extractWithSchema(html, schema) {
|
|
10
|
+
return extractWithSchemaDetailed(html, schema).values;
|
|
11
|
+
}
|
|
12
|
+
export function extractWithSchemaDetailed(html, schema) {
|
|
13
|
+
const values = {};
|
|
14
|
+
const provenance = {};
|
|
4
15
|
if (!html || !schema.properties)
|
|
5
|
-
return {};
|
|
6
|
-
const
|
|
7
|
-
const
|
|
16
|
+
return { values, provenance };
|
|
17
|
+
const blocks = extractStructuredData(html);
|
|
18
|
+
for (const source of PROVENANCE_PRIORITY) {
|
|
19
|
+
for (const block of blocks) {
|
|
20
|
+
if (block.provenance !== source)
|
|
21
|
+
continue;
|
|
22
|
+
for (const fieldName of Object.keys(schema.properties)) {
|
|
23
|
+
if (values[fieldName] !== undefined)
|
|
24
|
+
continue;
|
|
25
|
+
const v = pickField(block.fields, fieldName);
|
|
26
|
+
if (v !== undefined) {
|
|
27
|
+
values[fieldName] = v;
|
|
28
|
+
provenance[fieldName] = source;
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
const allCovered = Object.keys(schema.properties).every((k) => values[k] !== undefined);
|
|
34
|
+
if (allCovered)
|
|
35
|
+
return { values, provenance };
|
|
36
|
+
// Heuristic fallback only for fields still missing
|
|
8
37
|
const { document: doc } = parseHTML(html);
|
|
9
|
-
const heuristicResult = {};
|
|
10
38
|
for (const [fieldName, fieldSchema] of Object.entries(schema.properties)) {
|
|
11
|
-
if (
|
|
39
|
+
if (values[fieldName] !== undefined)
|
|
12
40
|
continue;
|
|
13
|
-
const
|
|
14
|
-
if (
|
|
15
|
-
|
|
41
|
+
const v = findFieldValue(doc, fieldName, fieldSchema);
|
|
42
|
+
if (v !== undefined) {
|
|
43
|
+
values[fieldName] = v;
|
|
44
|
+
provenance[fieldName] = 'heuristic';
|
|
16
45
|
}
|
|
17
46
|
}
|
|
18
|
-
return {
|
|
47
|
+
return { values, provenance };
|
|
48
|
+
}
|
|
49
|
+
export async function extractWithSchemaDetailedAsync(html, schema, opts = {}) {
|
|
50
|
+
const det = extractWithSchemaDetailed(html, schema);
|
|
51
|
+
const warnings = [];
|
|
52
|
+
if (!schema.required || schema.required.length === 0) {
|
|
53
|
+
return { ...det, warnings };
|
|
54
|
+
}
|
|
55
|
+
const missing = schema.required.filter((k) => det.values[k] === undefined);
|
|
56
|
+
if (missing.length === 0) {
|
|
57
|
+
return { ...det, warnings };
|
|
58
|
+
}
|
|
59
|
+
const llm = await extractWithLLM({
|
|
60
|
+
html,
|
|
61
|
+
jsonSchema: schema,
|
|
62
|
+
partial: det.values,
|
|
63
|
+
missing,
|
|
64
|
+
signal: opts.signal,
|
|
65
|
+
budget: opts.budget,
|
|
66
|
+
});
|
|
67
|
+
const values = { ...det.values };
|
|
68
|
+
const provenance = { ...det.provenance };
|
|
69
|
+
for (const key of missing) {
|
|
70
|
+
if (llm.values[key] !== undefined && values[key] === undefined) {
|
|
71
|
+
values[key] = llm.values[key];
|
|
72
|
+
provenance[key] = 'llm';
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
return { values, provenance, warnings: llm.warnings };
|
|
76
|
+
}
|
|
77
|
+
function pickField(fields, name) {
|
|
78
|
+
if (fields[name] !== undefined)
|
|
79
|
+
return fields[name];
|
|
80
|
+
// Shallow nested — e.g. JSON-LD Product.offers.price
|
|
81
|
+
for (const v of Object.values(fields)) {
|
|
82
|
+
if (v && typeof v === 'object' && !Array.isArray(v)) {
|
|
83
|
+
const nested = v[name];
|
|
84
|
+
if (nested !== undefined)
|
|
85
|
+
return nested;
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
return undefined;
|
|
19
89
|
}
|
|
90
|
+
// ---------- heuristic helpers (preserved from prior schema.ts) ----------
|
|
20
91
|
function findFieldValue(doc, fieldName, schema) {
|
|
21
92
|
const normalizedName = fieldName.toLowerCase().replace(/_/g, '-');
|
|
22
93
|
const compactName = fieldName.replace(/_/g, '').toLowerCase();
|
|
@@ -37,7 +108,6 @@ function findSingleValue(doc, variants) {
|
|
|
37
108
|
if (text)
|
|
38
109
|
return text;
|
|
39
110
|
}
|
|
40
|
-
// Substring match is intentional — heuristic best-effort for partial class names
|
|
41
111
|
const byClass = doc.querySelector(`[class*="${name}"]`);
|
|
42
112
|
if (byClass) {
|
|
43
113
|
const text = byClass.textContent?.trim();
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"schema.js","sourceRoot":"","sources":["../../src/extraction/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,
|
|
1
|
+
{"version":3,"file":"schema.js","sourceRoot":"","sources":["../../src/extraction/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,qBAAqB,EAAE,MAAM,sBAAsB,CAAC;AAC7D,OAAO,EAAE,cAAc,EAA0B,MAAM,mBAAmB,CAAC;AAmB3E,MAAM,mBAAmB,GAAyC;IAChE,SAAS;IACT,WAAW;IACX,MAAM;CACP,CAAC;AAEF,MAAM,UAAU,iBAAiB,CAC/B,IAAY,EACZ,MAAkB;IAElB,OAAO,yBAAyB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,MAAM,CAAC;AACxD,CAAC;AAED,MAAM,UAAU,yBAAyB,CACvC,IAAY,EACZ,MAAkB;IAElB,MAAM,MAAM,GAA4B,EAAE,CAAC;IAC3C,MAAM,UAAU,GAAoC,EAAE,CAAC;IACvD,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,UAAU;QAAE,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC;IAE/D,MAAM,MAAM,GAAG,qBAAqB,CAAC,IAAI,CAAC,CAAC;IAE3C,KAAK,MAAM,MAAM,IAAI,mBAAmB,EAAE,CAAC;QACzC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,IAAI,KAAK,CAAC,UAAU,KAAK,MAAM;gBAAE,SAAS;YAC1C,KAAK,MAAM,SAAS,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;gBACvD,IAAI,MAAM,CAAC,SAAS,CAAC,KAAK,SAAS;oBAAE,SAAS;gBAC9C,MAAM,CAAC,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;gBAC7C,IAAI,CAAC,KAAK,SAAS,EAAE,CAAC;oBACpB,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;oBACtB,UAAU,CAAC,SAAS,CAAC,GAAG,MAAM,CAAC;gBACjC,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,KAAK,CACrD,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,SAAS,CAC/B,CAAC;IACF,IAAI,UAAU;QAAE,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC;IAE9C,mDAAmD;IACnD,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC1C,KAAK,MAAM,CAAC,SAAS,EAAE,WAAW,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;QACzE,IAAI,MAAM,CAAC,SAAS,CAAC,KAAK,SAAS;YAAE,SAAS;QAC9C,MAAM,CAAC,GAAG,cAAc,CAAC,GAAG,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;QACtD,IAAI,CAAC,KAAK,SAAS,EAAE,CAAC;YACpB,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YACtB,UAAU,CAAC,SAAS,CAAC,GAAG,WAAW,CAAC;QACtC,CAAC;IACH,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC;AAChC,CAAC;AAMD,MAAM,CAAC,KAAK,UAAU,8BAA8B,CAClD,IAAY,EACZ,MAAkB,EAClB,OAA6B,EAAE;IAE/B,MAAM,GAAG,GAAG,yBAAyB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACpD,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,IAAI,CAAC,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACrD,OAAO,EAAE,GAAG,GAAG,EAAE,QAAQ,EAAE,CAAC;IAC9B,CAAC;IAED,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,SAAS,CAAC,CAAC;IAC3E,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,EAAE,GAAG,GAAG,EAAE,QAAQ,EAAE,CAAC;IAC9B,CAAC;IAED,MAAM,GAAG,GAAG,MAAM,cAAc,CAAC;QAC/B,IAAI;QACJ,UAAU,EAAE,MAA4C;QACxD,OAAO,EAAE,GAAG,CAAC,MAAM;QACnB,OAAO;QACP,MAAM,EAAE,IAAI,CAAC,MAAM;QACnB,MAAM,EAAE,IAAI,CAAC,MAAM;KACpB,CAAC,CAAC;IAEH,MAAM,MAAM,GAAG,EAAE,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC;IACjC,MAAM,UAAU,GAAoC,EAAE,GAAG,GAAG,CAAC,UAAU,EAAE,CAAC;IAC1E,KAAK,MAAM,GAAG,IAAI,OAAO,EAAE,CAAC;QAC1B,IAAI,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,SAAS,IAAI,MAAM,CAAC,GAAG,CAAC,KAAK,SAAS,EAAE,CAAC;YAC/D,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YAC9B,UAAU,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;QAC1B,CAAC;IACH,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,QAAQ,EAAE,GAAG,CAAC,QAAQ,EAAE,CAAC;AACxD,CAAC;AAED,SAAS,SAAS,CAAC,MAA+B,EAAE,IAAY;IAC9D,IAAI,MAAM,CAAC,IAAI,CAAC,KAAK,SAAS;QAAE,OAAO,MAAM,CAAC,IAAI,CAAC,CAAC;IACpD,qDAAqD;IACrD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC;QACtC,IAAI,CAAC,IAAI,OAAO,CAAC,KAAK,QAAQ,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC;YACpD,MAAM,MAAM,GAAI,CAA6B,CAAC,IAAI,CAAC,CAAC;YACpD,IAAI,MAAM,KAAK,SAAS;gBAAE,OAAO,MAAM,CAAC;QAC1C,CAAC;IACH,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,2EAA2E;AAE3E,SAAS,cAAc,CACrB,GAAa,EACb,SAAiB,EACjB,MAAkB;IAElB,MAAM,cAAc,GAAG,SAAS,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IAClE,MAAM,WAAW,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IAC9D,MAAM,QAAQ,GAAG,CAAC,SAAS,EAAE,cAAc,EAAE,WAAW,CAAC,CAAC;IAE1D,IAAI,MAAM,CAAC,IAAI,KAAK,OAAO,EAAE,CAAC;QAC5B,OAAO,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IACxC,CAAC;IAED,OAAO,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;AACxC,CAAC;AAED,SAAS,SAAS,CAAC,KAAa;IAC9B,OAAO,KAAK,CAAC,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;AAC5C,CAAC;AAED,SAAS,eAAe,CAAC,GAAa,EAAE,QAAkB;IACxD,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC5B,MAAM,UAAU,GAAG,GAAG,CAAC,aAAa,CAAC,cAAc,IAAI,IAAI,CAAC,CAAC;QAC7D,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,UAAU,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,UAAU,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YAClF,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;QAED,MAAM,OAAO,GAAG,GAAG,CAAC,aAAa,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;QACxD,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,IAAI,GAAG,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YACzC,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;QAED,MAAM,WAAW,GAAG,GAAG,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;QACzD,KAAK,MAAM,EAAE,IAAI,WAAW,EAAE,CAAC;YAC7B,MAAM,KAAK,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,CAAC,EAAE,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC;YACtF,IAAI,KAAK,KAAK,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;gBACjC,MAAM,IAAI,GAAG,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;gBACpC,IAAI,IAAI;oBAAE,OAAO,IAAI,CAAC;YACxB,CAAC;QACH,CAAC;QAED,MAAM,IAAI,GAAG,GAAG,CAAC,aAAa,CAAC,IAAI,SAAS,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACtD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YACtC,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;QAED,MAAM,MAAM,GAAG,GAAG,CAAC,aAAa,CAAC,SAAS,IAAI,GAAG,CAAC,CAAC;QACnD,IAAI,MAAM,EAAE,CAAC;YACX,OAAO,MAAM,CAAC,YAAY,CAAC,QAAQ,IAAI,EAAE,CAAC,IAAI,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,SAAS,CAAC;QACxF,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,eAAe,CAAC,GAAa,EAAE,QAAkB;IACxD,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC5B,MAAM,SAAS,GAAG,GAAG,CAAC,aAAa,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;QAC1D,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,KAAK,GAAG,SAAS,CAAC,gBAAgB,CAAC,qBAAqB,CAAC,CAAC;YAChE,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACrB,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;YACtF,CAAC;QACH,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACxC,MAAM,QAAQ,GAAG,GAAG,CAAC,gBAAgB,CAAC,YAAY,QAAQ,IAAI,CAAC,CAAC;QAChE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxB,OAAO,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACzF,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"docs-generic.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/docs-generic.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;
|
|
1
|
+
{"version":3,"file":"docs-generic.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/docs-generic.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;AAmFlE,eAAO,MAAM,oBAAoB,EAAE,SAqClC,CAAC"}
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { parseHTML } from 'linkedom';
|
|
2
|
-
import
|
|
3
|
-
const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
|
|
2
|
+
import { htmlToMarkdown } from '../markdown.js';
|
|
4
3
|
const STRIP_SELECTORS = [
|
|
5
4
|
'nav',
|
|
6
5
|
'.docs-sidebar',
|
|
@@ -54,7 +53,7 @@ function buildResult(document, contentEl) {
|
|
|
54
53
|
: rawTitle;
|
|
55
54
|
if (!title)
|
|
56
55
|
return null;
|
|
57
|
-
const markdown =
|
|
56
|
+
const markdown = htmlToMarkdown(contentEl.innerHTML).trim();
|
|
58
57
|
if (!markdown)
|
|
59
58
|
return null;
|
|
60
59
|
return {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"docs-generic.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/docs-generic.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,
|
|
1
|
+
{"version":3,"file":"docs-generic.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/docs-generic.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAKhD,MAAM,eAAe,GAAG;IACtB,KAAK;IACL,eAAe;IACf,UAAU;IACV,cAAc;IACd,oBAAoB;IACpB,iBAAiB;IACjB,iBAAiB;IACjB,QAAQ;IACR,QAAQ;CACT,CAAC;AAEF,SAAS,eAAe,CAAC,IAAY;IACnC,IAAI,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,sBAAsB,CAAC,EAAE,CAAC;QAC3E,OAAO,YAAY,CAAC;IACtB,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;QAChC,OAAO,QAAQ,CAAC;IAClB,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,kBAAkB,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,kBAAkB,CAAC;QACtE,CAAC,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,EAAE,CAAC;QAClE,OAAO,QAAQ,CAAC;IAClB,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;QAC/B,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,oBAAoB,CAC3B,QAAkB,EAClB,gBAA0B;IAE1B,KAAK,MAAM,QAAQ,IAAI,gBAAgB,EAAE,CAAC;QACxC,MAAM,EAAE,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;QAC5C,IAAI,EAAE;YAAE,OAAO,EAAa,CAAC;IAC/B,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,aAAa,CAAC,IAAa,EAAE,SAAmB;IACvD,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,KAAK,MAAM,EAAE,IAAI,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;YAC7D,EAAE,CAAC,UAAU,EAAE,WAAW,CAAC,EAAE,CAAC,CAAC;QACjC,CAAC;IACH,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAClB,QAAkB,EAClB,SAAkB;IAElB,aAAa,CAAC,SAAS,EAAE,eAAe,CAAC,CAAC;IAE1C,MAAM,OAAO,GACX,SAAS,CAAC,aAAa,CAAC,IAAI,CAAC;QAC7B,QAAQ,CAAC,aAAa,CAAC,IAAI,CAAC;QAC5B,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAElC,MAAM,QAAQ,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IACpD,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC;QAClC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE;QAChC,CAAC,CAAC,QAAQ,CAAC;IAEb,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,MAAM,QAAQ,GAAG,cAAc,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;IAC5D,IAAI,CAAC,QAAQ;QAAE,OAAO,IAAI,CAAC;IAE3B,OAAO;QACL,KAAK;QACL,QAAQ;QACR,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,SAAS,EAAE,eAAe;KAC3B,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,MAAM,oBAAoB,GAAc;IAC7C,IAAI,EAAE,cAAc;IAEpB,SAAS,CAAC,IAAY,EAAE,IAAa;QACnC,IAAI,CAAC,IAAI;YAAE,OAAO,KAAK,CAAC;QACxB,OAAO,eAAe,CAAC,IAAI,CAAC,KAAK,IAAI,CAAC;IACxC,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,IAAY;QAChC,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QAEvB,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;QACxC,IAAI,CAAC,SAAS;YAAE,OAAO,IAAI,CAAC;QAE5B,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAErC,IAAI,gBAA0B,CAAC;QAC/B,QAAQ,SAAS,EAAE,CAAC;YAClB,KAAK,YAAY;gBACf,gBAAgB,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;gBACpD,MAAM;YACR,KAAK,QAAQ;gBACX,gBAAgB,GAAG,CAAC,aAAa,CAAC,CAAC;gBACnC,MAAM;YACR,KAAK,QAAQ;gBACX,gBAAgB,GAAG,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;gBAC1C,MAAM;YACR,KAAK,SAAS;gBACZ,gBAAgB,GAAG,CAAC,YAAY,CAAC,CAAC;gBAClC,MAAM;QACV,CAAC;QAED,MAAM,SAAS,GAAG,oBAAoB,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;QACnE,IAAI,CAAC,SAAS;YAAE,OAAO,IAAI,CAAC;QAE5B,OAAO,WAAW,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IAC1C,CAAC;CACF,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"github.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/github.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;
|
|
1
|
+
{"version":3,"file":"github.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/github.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;AAkGlE,eAAO,MAAM,eAAe,EAAE,SA2B7B,CAAC"}
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { parseHTML } from 'linkedom';
|
|
2
|
-
import
|
|
3
|
-
const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
|
|
2
|
+
import { htmlToMarkdown } from '../markdown.js';
|
|
4
3
|
function isIssueOrPR(url) {
|
|
5
4
|
return /\/issues\/\d+|\/pull\/\d+/.test(url);
|
|
6
5
|
}
|
|
@@ -25,7 +24,7 @@ function extractIssue(document, _url) {
|
|
|
25
24
|
}
|
|
26
25
|
Array.from(commentBodies).forEach((body, i) => {
|
|
27
26
|
const html = body.innerHTML;
|
|
28
|
-
const md =
|
|
27
|
+
const md = htmlToMarkdown(html).trim();
|
|
29
28
|
if (md) {
|
|
30
29
|
sections.push(i === 0 ? md : `---\n\n${md}`);
|
|
31
30
|
}
|
|
@@ -48,7 +47,7 @@ function extractReadme(document) {
|
|
|
48
47
|
document.querySelector('.markdown-body');
|
|
49
48
|
if (!readmeBody)
|
|
50
49
|
return null;
|
|
51
|
-
const markdown =
|
|
50
|
+
const markdown = htmlToMarkdown(readmeBody.innerHTML).trim();
|
|
52
51
|
if (!markdown)
|
|
53
52
|
return null;
|
|
54
53
|
return {
|
|
@@ -68,7 +67,7 @@ function extractBlob(document) {
|
|
|
68
67
|
document.querySelector('.markdown-body');
|
|
69
68
|
if (!codeBlock)
|
|
70
69
|
return null;
|
|
71
|
-
const markdown =
|
|
70
|
+
const markdown = htmlToMarkdown(codeBlock.innerHTML).trim();
|
|
72
71
|
if (!markdown)
|
|
73
72
|
return null;
|
|
74
73
|
return {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"github.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/github.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,
|
|
1
|
+
{"version":3,"file":"github.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/github.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAGhD,SAAS,WAAW,CAAC,GAAW;IAC9B,OAAO,2BAA2B,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC/C,CAAC;AAED,SAAS,MAAM,CAAC,GAAW;IACzB,OAAO,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC9B,CAAC;AAED,SAAS,YAAY,CAAC,QAAkB,EAAE,IAAY;IACpD,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,iBAAiB,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,kBAAkB,CAAC,CAAC;IACxG,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAC;IAE1B,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAEhD,MAAM,QAAQ,GAAG,QAAQ,CAAC,gBAAgB,CAAC,aAAa,CAAC,CAAC;IAC1D,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC;SAChC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;SACzC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEnB,MAAM,aAAa,GAAG,QAAQ,CAAC,gBAAgB,CAAC,uBAAuB,CAAC,CAAC;IACzE,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAE5C,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,QAAQ,CAAC,IAAI,CAAC,eAAe,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACtD,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE;QAC5C,MAAM,IAAI,GAAI,IAAgB,CAAC,SAAS,CAAC;QACzC,MAAM,EAAE,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QACvC,IAAI,EAAE,EAAE,CAAC;YACP,QAAQ,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAEvC,OAAO;QACL,KAAK;QACL,QAAQ;QACR,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,SAAS,EAAE,eAAe;KAC3B,CAAC;AACJ,CAAC;AAED,SAAS,aAAa,CAAC,QAAkB;IACvC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAChD,MAAM,QAAQ,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IACpD,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,QAAQ,CAAC;IAEzD,MAAM,UAAU,GACd,QAAQ,CAAC,aAAa,CAAC,wBAAwB,CAAC;QAChD,QAAQ,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC;IAE3C,IAAI,CAAC,UAAU;QAAE,OAAO,IAAI,CAAC;IAE7B,MAAM,QAAQ,GAAG,cAAc,CAAE,UAAsB,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1E,IAAI,CAAC,QAAQ;QAAE,OAAO,IAAI,CAAC;IAE3B,OAAO;QACL,KAAK;QACL,QAAQ;QACR,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,SAAS,EAAE,eAAe;KAC3B,CAAC;AACJ,CAAC;AAED,SAAS,WAAW,CAAC,QAAkB;IACrC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAChD,MAAM,KAAK,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAEjD,MAAM,SAAS,GACb,QAAQ,CAAC,aAAa,CAAC,oBAAoB,CAAC;QAC5C,QAAQ,CAAC,aAAa,CAAC,YAAY,CAAC;QACpC,QAAQ,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC;IAE3C,IAAI,CAAC,SAAS;QAAE,OAAO,IAAI,CAAC;IAE5B,MAAM,QAAQ,GAAG,cAAc,CAAE,SAAqB,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;IACzE,IAAI,CAAC,QAAQ;QAAE,OAAO,IAAI,CAAC;IAE3B,OAAO;QACL,KAAK;QACL,QAAQ;QACR,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,SAAS,EAAE,eAAe;KAC3B,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,MAAM,eAAe,GAAc;IACxC,IAAI,EAAE,QAAQ;IAEd,SAAS,CAAC,GAAW;QACnB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACvC,OAAO,QAAQ,KAAK,YAAY,IAAI,QAAQ,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC;QACvE,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,GAAW;QAC/B,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QAEvB,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAErC,IAAI,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC;YACrB,OAAO,YAAY,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QACrC,CAAC;QAED,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC;YAChB,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC;QAC/B,CAAC;QAED,OAAO,aAAa,CAAC,QAAQ,CAAC,CAAC;IACjC,CAAC;CACF,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"mdn.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/mdn.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;
|
|
1
|
+
{"version":3,"file":"mdn.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/mdn.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;AAWlE,eAAO,MAAM,YAAY,EAAE,SAqD1B,CAAC"}
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { parseHTML } from 'linkedom';
|
|
2
|
-
import
|
|
3
|
-
const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
|
|
2
|
+
import { htmlToMarkdown } from '../markdown.js';
|
|
4
3
|
const STRIP_SELECTORS = [
|
|
5
4
|
'nav',
|
|
6
5
|
'.sidebar',
|
|
@@ -42,7 +41,7 @@ export const mdnExtractor = {
|
|
|
42
41
|
: rawTitle;
|
|
43
42
|
if (!title)
|
|
44
43
|
return null;
|
|
45
|
-
const markdown =
|
|
44
|
+
const markdown = htmlToMarkdown(article.innerHTML).trim();
|
|
46
45
|
if (!markdown)
|
|
47
46
|
return null;
|
|
48
47
|
return {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"mdn.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/mdn.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,
|
|
1
|
+
{"version":3,"file":"mdn.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/mdn.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAGhD,MAAM,eAAe,GAAG;IACtB,KAAK;IACL,UAAU;IACV,QAAQ;IACR,QAAQ;IACR,UAAU;IACV,WAAW;CACZ,CAAC;AAEF,MAAM,CAAC,MAAM,YAAY,GAAc;IACrC,IAAI,EAAE,KAAK;IAEX,SAAS,CAAC,GAAW;QACnB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACvC,OAAO,QAAQ,KAAK,uBAAuB,CAAC;QAC9C,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,IAAY;QAChC,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QAEvB,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAErC,MAAM,OAAO,GACX,QAAQ,CAAC,aAAa,CAAC,2BAA2B,CAAC;YACnD,QAAQ,CAAC,aAAa,CAAC,kBAAkB,CAAC;YAC1C,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;QAEpC,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE1B,KAAK,MAAM,QAAQ,IAAI,eAAe,EAAE,CAAC;YACvC,KAAK,MAAM,EAAE,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;gBAChE,EAAE,CAAC,UAAU,EAAE,WAAW,CAAC,EAAE,CAAC,CAAC;YACjC,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GACX,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC;YAC3B,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;QAElC,MAAM,QAAQ,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACpD,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC;YAClC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE;YAChC,CAAC,CAAC,QAAQ,CAAC;QAEb,IAAI,CAAC,KAAK;YAAE,OAAO,IAAI,CAAC;QAExB,MAAM,QAAQ,GAAG,cAAc,CAAE,OAAmB,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;QACvE,IAAI,CAAC,QAAQ;YAAE,OAAO,IAAI,CAAC;QAE3B,OAAO;YACL,KAAK;YACL,QAAQ;YACR,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,eAAe;SAC3B,CAAC;IACJ,CAAC;CACF,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stackoverflow.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/stackoverflow.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;
|
|
1
|
+
{"version":3,"file":"stackoverflow.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/stackoverflow.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;AA8DlE,eAAO,MAAM,sBAAsB,EAAE,SAkDpC,CAAC"}
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { parseHTML } from 'linkedom';
|
|
2
|
-
import
|
|
3
|
-
const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
|
|
2
|
+
import { htmlToMarkdown } from '../markdown.js';
|
|
4
3
|
function parseVotes(el) {
|
|
5
4
|
if (!el)
|
|
6
5
|
return 0;
|
|
@@ -22,7 +21,7 @@ function parseAnswers(document) {
|
|
|
22
21
|
}
|
|
23
22
|
function buildMarkdown(title, tags, votes, questionHtml, answers) {
|
|
24
23
|
const tagLine = `Tags: ${tags.join(', ')} | Votes: ${votes}`;
|
|
25
|
-
const questionMd =
|
|
24
|
+
const questionMd = htmlToMarkdown(questionHtml).trim();
|
|
26
25
|
const sections = [
|
|
27
26
|
`# ${title}`,
|
|
28
27
|
tagLine,
|
|
@@ -36,7 +35,7 @@ function buildMarkdown(title, tags, votes, questionHtml, answers) {
|
|
|
36
35
|
const heading = answer.accepted
|
|
37
36
|
? `## Accepted Answer (Votes: ${answer.votes})`
|
|
38
37
|
: `## Answer (Votes: ${answer.votes})`;
|
|
39
|
-
const bodyMd =
|
|
38
|
+
const bodyMd = htmlToMarkdown(answer.bodyHtml).trim();
|
|
40
39
|
sections.push('---', '', heading, '', bodyMd);
|
|
41
40
|
}
|
|
42
41
|
return sections.join('\n\n');
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stackoverflow.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/stackoverflow.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,
|
|
1
|
+
{"version":3,"file":"stackoverflow.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/stackoverflow.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAShD,SAAS,UAAU,CAAC,EAAkB;IACpC,IAAI,CAAC,EAAE;QAAE,OAAO,CAAC,CAAC;IAClB,MAAM,MAAM,GAAG,EAAE,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC;IAClD,MAAM,GAAG,GAAG,MAAM,EAAE,YAAY,CAAC,YAAY,CAAC,IAAI,MAAM,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,GAAG,CAAC;IACrF,OAAO,QAAQ,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;AAChC,CAAC;AAED,SAAS,YAAY,CAAC,QAAkB;IACtC,MAAM,SAAS,GAAG,QAAQ,CAAC,gBAAgB,CAAC,kBAAkB,CAAC,CAAC;IAChE,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,KAAK,MAAM,EAAE,IAAI,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACvC,MAAM,QAAQ,GAAG,EAAE,CAAC,SAAS,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;QAC1D,MAAM,KAAK,GAAG,UAAU,CAAC,EAAa,CAAC,CAAC;QACxC,MAAM,MAAM,GAAG,EAAE,CAAC,aAAa,CAAC,qCAAqC,CAAC,CAAC;QACvE,MAAM,QAAQ,GAAG,MAAM,CAAC,CAAC,CAAE,MAAkB,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7D,OAAO,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;IAC9C,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,aAAa,CACpB,KAAa,EACb,IAAc,EACd,KAAa,EACb,YAAoB,EACpB,OAAiB;IAEjB,MAAM,OAAO,GAAG,SAAS,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,KAAK,EAAE,CAAC;IAC7D,MAAM,UAAU,GAAG,cAAc,CAAC,YAAY,CAAC,CAAC,IAAI,EAAE,CAAC;IAEvD,MAAM,QAAQ,GAAa;QACzB,KAAK,KAAK,EAAE;QACZ,OAAO;QACP,EAAE;QACF,UAAU;KACX,CAAC;IAEF,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IACnD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IACpF,MAAM,OAAO,GAAG,CAAC,GAAG,QAAQ,EAAE,GAAG,MAAM,CAAC,CAAC;IAEzC,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ;YAC7B,CAAC,CAAC,8BAA8B,MAAM,CAAC,KAAK,GAAG;YAC/C,CAAC,CAAC,qBAAqB,MAAM,CAAC,KAAK,GAAG,CAAC;QACzC,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,IAAI,EAAE,CAAC;QACtD,QAAQ,CAAC,IAAI,CAAC,KAAK,EAAE,EAAE,EAAE,OAAO,EAAE,EAAE,EAAE,MAAM,CAAC,CAAC;IAChD,CAAC;IAED,OAAO,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC/B,CAAC;AAED,MAAM,CAAC,MAAM,sBAAsB,GAAc;IAC/C,IAAI,EAAE,eAAe;IAErB,SAAS,CAAC,GAAW;QACnB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACvC,OAAO,QAAQ,KAAK,mBAAmB;gBACrC,QAAQ,CAAC,QAAQ,CAAC,oBAAoB,CAAC;gBACvC,QAAQ,KAAK,mBAAmB;gBAChC,QAAQ,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC;QAC5C,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,IAAY;QAChC,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QAEvB,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAErC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,qBAAqB,CAAC,CAAC;QAC9D,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE1B,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAChD,IAAI,CAAC,KAAK;YAAE,OAAO,IAAI,CAAC;QAExB,MAAM,cAAc,GAAG,QAAQ,CAAC,aAAa,CAAC,mEAAmE,CAAC,CAAC;QACnH,IAAI,CAAC,cAAc;YAAE,OAAO,IAAI,CAAC;QAEjC,MAAM,YAAY,GAAI,cAA0B,CAAC,SAAS,CAAC;QAE3D,MAAM,MAAM,GAAG,QAAQ,CAAC,gBAAgB,CAAC,8DAA8D,CAAC,CAAC;QACzG,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAE1F,MAAM,UAAU,GAAG,QAAQ,CAAC,aAAa,CAAC,WAAW,CAAC,CAAC;QACvD,MAAM,KAAK,GAAG,UAAU,CAAC,UAA4B,CAAC,CAAC;QAEvD,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;QAEvC,MAAM,QAAQ,GAAG,aAAa,CAAC,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,YAAY,EAAE,OAAO,CAAC,CAAC;QAE1E,OAAO;YACL,KAAK;YACL,QAAQ;YACR,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,eAAe;SAC3B,CAAC;IACJ,CAAC;CACF,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structured-data.d.ts","sourceRoot":"","sources":["../../src/extraction/structured-data.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAaxD,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,oBAAoB,EAAE,CAQ1E;AA8KD,eAAO,MAAM,kBAAkB,EAAE,WAAW,CAAC,MAAM,CAAe,CAAC"}
|