@plurnk/plurnk-mimetypes-text-html 0.6.2 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/TextHtml.d.ts +2 -0
- package/dist/TextHtml.d.ts.map +1 -1
- package/dist/TextHtml.js +28 -0
- package/dist/TextHtml.js.map +1 -1
- package/dist/htmlToMarkdown.d.ts +2 -0
- package/dist/htmlToMarkdown.d.ts.map +1 -0
- package/dist/htmlToMarkdown.js +53 -0
- package/dist/htmlToMarkdown.js.map +1 -0
- package/package.json +8 -4
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# @plurnk/plurnk-mimetypes-text-html
|
|
2
2
|
|
|
3
|
-
`text/html` AND `application/xhtml+xml` mimetype handler for the [plurnk](https://github.com/plurnk) ecosystem.
|
|
3
|
+
`text/html` AND `application/xhtml+xml` mimetype handler for the [plurnk](https://github.com/plurnk) ecosystem. Two faces: **structural** extraction via [parse5](https://www.npmjs.com/package/parse5) (symbols, deep-json/deep-xml, real-DOM xpath via [@xmldom/xmldom](https://www.npmjs.com/package/@xmldom/xmldom) + [xpath](https://www.npmjs.com/package/xpath)) and **readable** projection — the page's main content as clean reading markdown via [@mozilla/readability](https://www.npmjs.com/package/@mozilla/readability) + [turndown](https://www.npmjs.com/package/turndown) over a [linkedom](https://www.npmjs.com/package/linkedom) DOM.
|
|
4
4
|
|
|
5
5
|
## install
|
|
6
6
|
|
|
@@ -10,15 +10,15 @@ npm i @plurnk/plurnk-mimetypes-text-html
|
|
|
10
10
|
|
|
11
11
|
## what it does
|
|
12
12
|
|
|
13
|
+
- `content(content)` — the **content channel** (SPEC §18): the page's markup-free reading markdown. Main-content extraction via Readability strips nav, ads, and chrome; turndown renders the article body as markdown. Non-article pages (apps, forms, fragments, very short HTML) degrade to best-effort markdown of the `<body>` — never raw HTML, never a throw. Empty/whitespace input → absent. This is also the embed-source: an HTML entry's embedding reflects the article, not `<div class>` noise. HTML is the only mimetype that populates this channel.
|
|
13
14
|
- `extractRaw(content)` — h1–h6 headings as `heading` symbols (with `level`), `<title>` as an h1 fallback when no headings exist, and code blocks as `module` symbols. Source line numbers come from parse5's location info.
|
|
14
|
-
- `preview(content)` — hybrid per SPEC §1: a `SymbolPreview` when structural signals were found, otherwise a head-oriented `TextPreview` over the raw HTML (the framework truncates and marks it).
|
|
15
15
|
- `deepJson(content)` — the parse5 DOM as a nested node tree, with source-algebra attributes under the `attrs` convention (framework projects this to the deep-xml channel).
|
|
16
|
-
- `query(content, dialect, pattern)` — overrides xpath to dispatch against the real parsed DOM (XPath 1.0) instead of the projected deep-xml.
|
|
16
|
+
- `query(content, dialect, pattern)` — overrides xpath to dispatch against the real parsed DOM (XPath 1.0) instead of the projected deep-xml. regex/glob run against the same readable markdown the content channel produces (one projection, shared by `toText`).
|
|
17
17
|
- `validate(content)` — no-op (HTML is forgiving).
|
|
18
18
|
|
|
19
|
-
##
|
|
19
|
+
## two faces, one handler
|
|
20
20
|
|
|
21
|
-
|
|
21
|
+
The structural channels (`extractRaw`/`deepJson`/`query` xpath) stay parse5-based with source positions — they answer "where is this tag, on what line." The content channel answers a different question — "what does this page *say*" — and for that the raw markup is noise. Readability + turndown denoise it into reading markdown. Web-page denoising used to be deferred to the fetcher layer; SPEC §18 moved it here, because the readable projection is a pure function of the HTML bytes (whatever a browser scheme rendered and serialized, or a file on disk) and belongs with the mimetype that owns HTML.
|
|
22
22
|
|
|
23
23
|
## license
|
|
24
24
|
|
package/dist/TextHtml.d.ts
CHANGED
|
@@ -3,6 +3,8 @@ import type { HandlerContent, MimeSymbol, QueryDialect, QueryMatch } from "@plur
|
|
|
3
3
|
export default class TextHtml extends BaseHandler {
|
|
4
4
|
extractRaw(content: string | Uint8Array): MimeSymbol[];
|
|
5
5
|
deepJson(content: HandlerContent): unknown;
|
|
6
|
+
content(content: HandlerContent): string | undefined;
|
|
7
|
+
protected toText(content: HandlerContent): string;
|
|
6
8
|
query(content: HandlerContent, dialect: QueryDialect, pattern: string, flags?: string): Promise<QueryMatch[]>;
|
|
7
9
|
}
|
|
8
10
|
//# sourceMappingURL=TextHtml.d.ts.map
|
package/dist/TextHtml.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"TextHtml.d.ts","sourceRoot":"","sources":["../src/TextHtml.ts"],"names":[],"mappings":"AAAA,OAAO,EACH,WAAW,EAGd,MAAM,0BAA0B,CAAC;AAClC,OAAO,KAAK,EACR,cAAc,EACd,UAAU,EACV,YAAY,EACZ,UAAU,EACb,MAAM,0BAA0B,CAAC;
|
|
1
|
+
{"version":3,"file":"TextHtml.d.ts","sourceRoot":"","sources":["../src/TextHtml.ts"],"names":[],"mappings":"AAAA,OAAO,EACH,WAAW,EAGd,MAAM,0BAA0B,CAAC;AAClC,OAAO,KAAK,EACR,cAAc,EACd,UAAU,EACV,YAAY,EACZ,UAAU,EACb,MAAM,0BAA0B,CAAC;AAgClC,MAAM,CAAC,OAAO,OAAO,QAAS,SAAQ,WAAW;IACpC,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,UAAU,GAAG,UAAU,EAAE;IAkCtD,QAAQ,CAAC,OAAO,EAAE,cAAc,GAAG,OAAO;IA4B1C,OAAO,CAAC,OAAO,EAAE,cAAc,GAAG,MAAM,GAAG,SAAS;cAa1C,MAAM,CAAC,OAAO,EAAE,cAAc,GAAG,MAAM;IAa3C,KAAK,CAChB,OAAO,EAAE,cAAc,EACvB,OAAO,EAAE,YAAY,EACrB,OAAO,EAAE,MAAM,EACf,KAAK,CAAC,EAAE,MAAM,GACf,OAAO,CAAC,UAAU,EAAE,CAAC;CAiC3B"}
|
package/dist/TextHtml.js
CHANGED
|
@@ -2,6 +2,7 @@ import { BaseHandler, InvalidExpressionError, QueryParseFailureError, } from "@p
|
|
|
2
2
|
import { parse } from "parse5";
|
|
3
3
|
import { DOMParser } from "@xmldom/xmldom";
|
|
4
4
|
import * as xpath from "xpath";
|
|
5
|
+
import { htmlToMarkdown } from "./htmlToMarkdown.js";
|
|
5
6
|
const HEADING_TAGS = new Set(["h1", "h2", "h3", "h4", "h5", "h6"]);
|
|
6
7
|
export default class TextHtml extends BaseHandler {
|
|
7
8
|
extractRaw(content) {
|
|
@@ -53,6 +54,33 @@ export default class TextHtml extends BaseHandler {
|
|
|
53
54
|
};
|
|
54
55
|
return root;
|
|
55
56
|
}
|
|
57
|
+
// Content channel (SPEC §18) — the model-facing readable markdown. HTML
|
|
58
|
+
// is the only mimetype that populates this channel: an already-textual but
|
|
59
|
+
// markup-noisy body projected to clean reading markdown via Readability
|
|
60
|
+
// (main-content extraction, strips nav/ads/chrome) + turndown. Absent
|
|
61
|
+
// (undefined) for empty/whitespace input so the channel stays absent when
|
|
62
|
+
// there is no readable content. Also the embed-source — the framework
|
|
63
|
+
// embeds content() over the raw bytes, so HTML embeddings carry the
|
|
64
|
+
// article, not the chrome. Binary content is decoded utf-8 first, mirroring
|
|
65
|
+
// extractRaw/deepJson.
|
|
66
|
+
content(content) {
|
|
67
|
+
const html = typeof content === "string"
|
|
68
|
+
? content
|
|
69
|
+
: new TextDecoder("utf-8").decode(content);
|
|
70
|
+
return htmlToMarkdown(html);
|
|
71
|
+
}
|
|
72
|
+
// Route the regex/glob query surface (and, transitively, the framework's
|
|
73
|
+
// content()??toText() embed-source) through the SAME markdown projection,
|
|
74
|
+
// so body matchers scan the readable text, not raw `<div class>` markup.
|
|
75
|
+
// xpath is unaffected — query() overrides it to hit the real DOM. When the
|
|
76
|
+
// page has no readable content, fall back to the raw body so regex/glob
|
|
77
|
+
// still have something to match rather than throwing.
|
|
78
|
+
toText(content) {
|
|
79
|
+
const html = typeof content === "string"
|
|
80
|
+
? content
|
|
81
|
+
: new TextDecoder("utf-8").decode(content);
|
|
82
|
+
return htmlToMarkdown(html) ?? html;
|
|
83
|
+
}
|
|
56
84
|
// Override xpath dispatch. parse5's tree isn't xpath-traversable, so we
|
|
57
85
|
// re-parse via @xmldom/xmldom (which produces a real DOM that the `xpath`
|
|
58
86
|
// package can walk). Line numbers default to 1 because xmldom doesn't
|
package/dist/TextHtml.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"TextHtml.js","sourceRoot":"","sources":["../src/TextHtml.ts"],"names":[],"mappings":"AAAA,OAAO,EACH,WAAW,EACX,sBAAsB,EACtB,sBAAsB,GACzB,MAAM,0BAA0B,CAAC;AAOlC,OAAO,EAAE,KAAK,EAAE,MAAM,QAAQ,CAAC;AAE/B,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,KAAK,KAAK,MAAM,OAAO,CAAC;
|
|
1
|
+
{"version":3,"file":"TextHtml.js","sourceRoot":"","sources":["../src/TextHtml.ts"],"names":[],"mappings":"AAAA,OAAO,EACH,WAAW,EACX,sBAAsB,EACtB,sBAAsB,GACzB,MAAM,0BAA0B,CAAC;AAOlC,OAAO,EAAE,KAAK,EAAE,MAAM,QAAQ,CAAC;AAE/B,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,KAAK,KAAK,MAAM,OAAO,CAAC;AAC/B,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAyBrD,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC;AAEnE,MAAM,CAAC,OAAO,OAAO,QAAS,SAAQ,WAAW;IACpC,UAAU,CAAC,OAA4B;QAC5C,MAAM,IAAI,GAAG,OAAO,OAAO,KAAK,QAAQ;YACpC,CAAC,CAAC,OAAO;YACT,CAAC,CAAC,IAAI,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAE/C,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,EAAE,EAAE,sBAAsB,EAAE,IAAI,EAAE,CAAC,CAAC;QAC1D,MAAM,OAAO,GAAiB,EAAE,CAAC;QACjC,MAAM,SAAS,GAA0C,iBAAiB,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;QAEzF,wEAAwE;QACxE,qEAAqE;QACrE,oEAAoE;QACpE,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,SAAS,IAAI,CAAC,CAAC,KAAK,KAAK,CAAC,CAAC,CAAC;QACzE,IAAI,SAAS,KAAK,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YAC/B,OAAO,CAAC,OAAO,CAAC;gBACZ,IAAI,EAAE,SAAS,CAAC,IAAI;gBACpB,IAAI,EAAE,SAAS;gBACf,KAAK,EAAE,CAAC;gBACR,IAAI,EAAE,SAAS,CAAC,IAAI;gBACpB,OAAO,EAAE,SAAS,CAAC,IAAI;aAC1B,CAAC,CAAC;QACP,CAAC;QAED,OAAO,OAAO,CAAC;IACnB,CAAC;IAED,yEAAyE;IACzE,mEAAmE;IACnE,wEAAwE;IACxE,sEAAsE;IACtE,iEAAiE;IACjE,oEAAoE;IACpE,qEAAqE;IACrE,0CAA0C;IACjC,QAAQ,CAAC,OAAuB;QACrC,MAAM,IAAI,GAAG,OAAO,OAAO,KAAK,QAAQ;YACpC,CAAC,CAAC,OAAO;YACT,CAAC,CAAC,IAAI,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,IAAI,GAAG,CAAC;QACR,IAAI,CAAC;YACD,GAAG,GAAG,KAAK,CAAC,IAAI,EAAE,EAAE,sBAAsB,EAAE,IAAI,EAAE,CAAC,CAAC;QACxD,CAAC;QAAC,MAAM,CAAC;YACL,OAAO,IAAI,CAAC;QAChB,CAAC;QACD,MAAM,IAAI,GAA4B;YAClC,IAAI,EAAE,UAAU;YAChB,IAAI,EAAE,CAAC;YACP,OAAO,EAAE,CAAC;YACV,QAAQ,EAAE,eAAe,CAAC,GAAG,CAAC;SACjC,CAAC;QACF,OAAO,IAAI,CAAC;IAChB,CAAC;IAED,wEAAwE;IACxE,2EAA2E;IAC3E,wEAAwE;IACxE,sEAAsE;IACtE,0EAA0E;IAC1E,sEAAsE;IACtE,oEAAoE;IACpE,4EAA4E;IAC5E,uBAAuB;IACd,OAAO,CAAC,OAAuB;QACpC,MAAM,IAAI,GAAG,OAAO,O
AAO,KAAK,QAAQ;YACpC,CAAC,CAAC,OAAO;YACT,CAAC,CAAC,IAAI,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;IAChC,CAAC;IAED,yEAAyE;IACzE,0EAA0E;IAC1E,yEAAyE;IACzE,2EAA2E;IAC3E,wEAAwE;IACxE,sDAAsD;IACnC,MAAM,CAAC,OAAuB;QAC7C,MAAM,IAAI,GAAG,OAAO,OAAO,KAAK,QAAQ;YACpC,CAAC,CAAC,OAAO;YACT,CAAC,CAAC,IAAI,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,OAAO,cAAc,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC;IACxC,CAAC;IAED,wEAAwE;IACxE,0EAA0E;IAC1E,sEAAsE;IACtE,uEAAuE;IACvE,yEAAyE;IACzE,yEAAyE;IAChE,KAAK,CAAC,KAAK,CAChB,OAAuB,EACvB,OAAqB,EACrB,OAAe,EACf,KAAc;QAEd,IAAI,OAAO,KAAK,OAAO,EAAE,CAAC;YACtB,MAAM,IAAI,GAAG,OAAO,OAAO,KAAK,QAAQ;gBACpC,CAAC,CAAC,OAAO;gBACT,CAAC,CAAC,IAAI,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;YAE/C,2DAA2D;YAC3D,kEAAkE;YAClE,iEAAiE;YACjE,6DAA6D;YAC7D,6DAA6D;YAC7D,+DAA+D;YAC/D,6DAA6D;YAC7D,+DAA+D;YAC/D,qDAAqD;YACrD,IAAI,GAAG,CAAC;YACR,IAAI,CAAC;gBACD,GAAG,GAAG,IAAI,SAAS,EAAE,CAAC,eAAe,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;YAC5D,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACb,MAAM,IAAI,sBAAsB,CAAC,EAAE,QAAQ,EAAE,IAAI,CAAC,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;YACzE,CAAC;YAED,IAAI,MAA8B,CAAC;YACnC,IAAI,CAAC;gBACD,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC,OAAO,EAAE,GAAsB,CAAC,CAAC;YAC3D,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACb,MAAM,IAAI,sBAAsB,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC;YACvF,CAAC;YAED,OAAO,gBAAgB,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;QAC7C,CAAC;QACD,OAAO,KAAK,CAAC,KAAK,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC;IACzD,CAAC;CACJ;AAED,0EAA0E;AAC1E,SAAS,gBAAgB,CAAC,OAAe,EAAE,MAA8B;IACrE,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QACxB,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAc,EAAE,CAAC,CAAC;YACxC,IAAI,EAAE,CAAC;YACP,OAAO,EAAE,aAAa,CAAC,IAAI,CAAC;YAC5B,QAAQ,EAAE,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,OAAO,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS;SACrE,CAAC,CAAC,CAAC;IACR,CAAC;IACD,IAAI,MAAM,KAAK,IAAI,IAAI,MAAM,KAAK,SAAS;QAAE,OAAO,EAAE,CAAC;IACvD,0EAA0E;IAC1E,0CAA0C;IAC1C,OAAO,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,OAA
O,EAAE,OAAO,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;AACxF,CAAC;AAED,4EAA4E;AAC5E,4EAA4E;AAC5E,6EAA6E;AAC7E,wEAAwE;AACxE,yCAAyC;AACzC,MAAM,cAAc,GAAG,CAAC,CAAC;AACzB,MAAM,SAAS,GAAG,CAAC,CAAC;AACpB,MAAM,kBAAkB,GAAG,CAAC,CAAC;AAC7B,MAAM,2BAA2B,GAAG,CAAC,CAAC;AACtC,MAAM,YAAY,GAAG,CAAC,CAAC;AACvB,SAAS,aAAa,CAAC,IAAU;IAC7B,MAAM,EAAE,GAAG,IAAI,CAAC,QAAQ,CAAC;IACzB,IAAI,EAAE,KAAK,cAAc;QAAE,OAAQ,IAAa,CAAC,KAAK,CAAC;IACvD,IAAI,EAAE,KAAK,SAAS,IAAI,EAAE,KAAK,kBAAkB;QAAE,OAAQ,IAAa,CAAC,IAAI,CAAC;IAC9E,IAAI,EAAE,KAAK,YAAY;QAAE,OAAQ,IAAgB,CAAC,IAAI,CAAC;IACvD,IAAI,EAAE,KAAK,2BAA2B;QAAE,OAAQ,IAA8B,CAAC,IAAI,CAAC;IACpF,OAAQ,IAA8C,CAAC,QAAQ,EAAE,CAAC;AACtE,CAAC;AAED,4EAA4E;AAC5E,uEAAuE;AACvE,wBAAwB;AACxB,SAAS,iBAAiB,CACtB,IAAgB,EAChB,GAAiB;IAEjB,IAAI,KAAK,GAA0C,IAAI,CAAC;IACxD,sEAAsE;IACtE,yEAAyE;IACzE,iEAAiE;IACjE,MAAM,IAAI,GAA2C,EAAE,CAAC;IAExD,SAAS,IAAI,CAAC,IAA4B;QACtC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,EAAE,CAAC;YACnB,IAAI,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC;gBACtB,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,UAAU;oBAAE,IAAI,CAAC,KAAK,CAAC,CAAC;YACrD,CAAC;YACD,OAAO;QACX,CAAC;QAED,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC;QACzB,IAAI,GAAG,KAAK,OAAO,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;YACpC,MAAM,IAAI,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;YACtC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAClB,KAAK,GAAG;oBACJ,IAAI,EAAE,IAAI;oBACV,IAAI,EAAE,IAAI,CAAC,kBAAkB,EAAE,SAAS,IAAI,CAAC;iBAChD,CAAC;YACN,CAAC;QACL,CAAC;aAAM,IAAI,YAAY,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;YAC/B,MAAM,IAAI,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;YACtC,MAAM,GAAG,GAAG,IAAI,CAAC,kBAAkB,CAAC;YACpC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAClB,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC7B,OAAO,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,KAAK,IAAI,KAAK,EAAE,CAAC;oBAC7D,IAAI,CAAC,GAAG,EAAE,CAAC;gBACf,CAAC;gBACD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACpD,GAAG,CAAC,IAAI,CAAC;oBACL,IAAI,EAAE,I
AAI;oBACV,IAAI,EAAE,SAAS;oBACf,KAAK;oBACL,IAAI,EAAE,GAAG,EAAE,SAAS,IAAI,CAAC;oBACzB,OAAO,EAAE,GAAG,EAAE,OAAO,IAAI,GAAG,EAAE,SAAS,IAAI,CAAC;oBAC5C,GAAG,CAAC,GAAG,IAAI,EAAE,MAAM,EAAE,GAAG,CAAC,QAAQ,EAAE,SAAS,EAAE,GAAG,CAAC,MAAM,EAAE,CAAC;oBAC3D,GAAG,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,SAAS,EAAE,CAAC;iBAC7C,CAAC,CAAC;gBACH,IAAI,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;YACrC,CAAC;QACL,CAAC;aAAM,IAAI,GAAG,KAAK,KAAK,EAAE,CAAC;YACvB,MAAM,SAAS,GAAG,gBAAgB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YACjD,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;YACjE,MAAM,GAAG,GAAG,IAAI,CAAC,kBAAkB,CAAC;YACpC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACpD,GAAG,CAAC,IAAI,CAAC;gBACL,IAAI,EAAE,QAAQ;gBACd,IAAI,EAAE,QAAQ;gBACd,IAAI,EAAE,GAAG,EAAE,SAAS,IAAI,CAAC;gBACzB,OAAO,EAAE,GAAG,EAAE,OAAO,IAAI,GAAG,EAAE,SAAS,IAAI,CAAC;gBAC5C,GAAG,CAAC,GAAG,IAAI,EAAE,MAAM,EAAE,GAAG,CAAC,QAAQ,EAAE,SAAS,EAAE,GAAG,CAAC,MAAM,EAAE,CAAC;gBAC3D,GAAG,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,SAAS,EAAE,CAAC;aAC7C,CAAC,CAAC;YACH,OAAO;QACX,CAAC;QAED,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,UAAU;YAAE,IAAI,CAAC,KAAK,CAAC,CAAC;IACrD,CAAC;IAED,IAAI,CAAC,IAAI,CAAC,CAAC;IACX,OAAO,KAAK,CAAC;AACjB,CAAC;AAED,SAAS,eAAe,CAAC,MAAe;IACpC,MAAM,SAAS,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,OAAO,CAAC,CAAC;IAC/D,IAAI,CAAC,SAAS;QAAE,OAAO,MAAM,CAAC;IAC9B,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC;IAC9D,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;AACrC,CAAC;AAED,SAAS,gBAAgB,CAAC,MAAkB,EAAE,OAAe;IACzD,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACpC,IAAI,SAAS,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,OAAO,KAAK,OAAO;YAAE,OAAO,KAAK,CAAC;IACpE,CAAC;IACD,OAAO,IAAI,CAAC;AAChB,CAAC;AAED,SAAS,WAAW,CAAC,IAA4B;IAC7C,IAAI,UAAU,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC,KAAK,CAAC;IACxC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC;QAAE,OAAO,EAAE,CAAC;IACpC,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,KAAK,MAAM,KAAK
,IAAI,IAAI,CAAC,UAAU;QAAE,GAAG,IAAI,WAAW,CAAC,KAAK,CAAC,CAAC;IAC/D,OAAO,GAAG,CAAC;AACf,CAAC;AAED,SAAS,SAAS,CAAC,IAA4B;IAC3C,OAAQ,IAAgB,CAAC,OAAO,KAAK,SAAS,IAAK,IAAgB,CAAC,KAAK,KAAK,SAAS,CAAC;AAC5F,CAAC;AAED,SAAS,UAAU,CAAC,IAA4B;IAC5C,OAAQ,IAA8B,CAAC,QAAQ,KAAK,OAAO,CAAC;AAChE,CAAC;AAED,SAAS,aAAa,CAAC,IAAa;IAChC,OAAO,KAAK,CAAC,OAAO,CAAE,IAAiC,CAAC,UAAU,CAAC,CAAC;AACxE,CAAC;AAED,6EAA6E;AAC7E,8EAA8E;AAC9E,8EAA8E;AAC9E,+EAA+E;AAC/E,qCAAqC;AACrC,SAAS,eAAe,CAAC,MAAkB;IACvC,MAAM,GAAG,GAAc,EAAE,CAAC;IAC1B,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC;QAAE,OAAO,GAAG,CAAC;IACvC,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACpC,IAAI,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;YACpB,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC;YACzB,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAS;YAChC,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;YAClC,SAAS;QACb,CAAC;QACD,IAAI,SAAS,CAAC,KAAK,CAAC,EAAE,CAAC;YACnB,GAAG,CAAC,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC,CAAC;YAC/B,SAAS;QACb,CAAC;QACD,2DAA2D;IAC/D,CAAC;IACD,OAAO,GAAG,CAAC;AACf,CAAC;AAED,SAAS,aAAa,CAAC,EAAW;IAC9B,MAAM,GAAG,GAAG,EAAE,CAAC,kBAAkB,CAAC;IAClC,MAAM,IAAI,GAA4B;QAClC,IAAI,EAAE,EAAE,CAAC,OAAO;QAChB,IAAI,EAAE,GAAG,EAAE,SAAS,IAAI,CAAC;QACzB,OAAO,EAAE,GAAG,EAAE,OAAO,IAAI,GAAG,EAAE,SAAS,IAAI,CAAC;KAC/C,CAAC;IACF,IAAI,EAAE,CAAC,KAAK,IAAI,EAAE,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClC,MAAM,KAAK,GAA2B,EAAE,CAAC;QACzC,KAAK,MAAM,CAAC,IAAI,EAAE,CAAC,KAAK;YAAE,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC;QAClD,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACvB,CAAC;IACD,MAAM,QAAQ,GAAG,eAAe,CAAC,EAAE,CAAC,CAAC;IACrC,uEAAuE;IACvE,sBAAsB;IACtB,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;WAClB,OAAO,QAAQ,CAAC,CAAC,CAAC,KAAK,QAAQ;WAC/B,QAAQ,CAAC,CAAC,CAAC,KAAK,IAAI;WACnB,QAAQ,CAAC,CAAC,CAAsB,CAAC,IAAI,KAAK,OAAO,EAAE,CAAC;QACxD,IAAI,CAAC,IAAI,GAAI,QAAQ,CAAC,CAAC,CAAsB,CAAC,IAAI,CAAC;IACvD,CAAC;SAAM,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC7B,CAAC;IACD,OAAO,IAAI,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"htmlToMarkdown.d.ts","sourceRoot":"","sources":["../src/htmlToMarkdown.ts"],"names":[],"mappings":"AAuBA,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAuB/D"}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { parseHTML } from "linkedom";
|
|
2
|
+
import { Readability } from "@mozilla/readability";
|
|
3
|
+
import TurndownService from "turndown";
|
|
4
|
+
// The single readable-text projection backing both content() (the content
|
|
5
|
+
// channel) and toText() (the regex/glob query surface + the framework's
|
|
6
|
+
// embed-source). One implementation, so the markdown a model READs, the text
|
|
7
|
+
// a regex/glob body-matcher scans, and the bytes the embedder vectorizes are
|
|
8
|
+
// all the same denoised markdown — never raw HTML.
|
|
9
|
+
//
|
|
10
|
+
// Pipeline (SPEC §18): main-content extraction via @mozilla/readability over a
|
|
11
|
+
// linkedom DOM, then HTML→markdown via turndown. Readability strips nav, ads,
|
|
12
|
+
// and chrome and returns the article body; turndown renders it as markdown.
|
|
13
|
+
// When Readability finds no article (apps, forms, fragments, very short HTML)
|
|
14
|
+
// it returns null — we degrade to best-effort markdown of the <body> (or the
|
|
15
|
+
// whole document for unwrapped fragments). Never raw HTML, never a throw.
|
|
16
|
+
const turndown = new TurndownService({
|
|
17
|
+
headingStyle: "atx",
|
|
18
|
+
bulletListMarker: "-",
|
|
19
|
+
codeBlockStyle: "fenced",
|
|
20
|
+
});
|
|
21
|
+
export function htmlToMarkdown(html) {
|
|
22
|
+
if (html.trim().length === 0)
|
|
23
|
+
return undefined;
|
|
24
|
+
const { document } = parseHTML(html);
|
|
25
|
+
// Whitespace/degenerate input can leave linkedom without a root element.
|
|
26
|
+
if (document.documentElement === null)
|
|
27
|
+
return undefined;
|
|
28
|
+
let articleHtml;
|
|
29
|
+
try {
|
|
30
|
+
// Readability mutates the document it walks; hand it a clone so the
|
|
31
|
+
// original stays intact for the body fallback below.
|
|
32
|
+
articleHtml = new Readability(document.cloneNode(true)).parse()?.content;
|
|
33
|
+
}
|
|
34
|
+
catch {
|
|
35
|
+
// Readability is best-effort denoising — its failure is not ours.
|
|
36
|
+
articleHtml = undefined;
|
|
37
|
+
}
|
|
38
|
+
if (articleHtml === null || articleHtml === undefined || articleHtml.trim().length === 0) {
|
|
39
|
+
articleHtml = readableBody(document);
|
|
40
|
+
}
|
|
41
|
+
const markdown = turndown.turndown(articleHtml).trim();
|
|
42
|
+
return markdown.length === 0 ? undefined : markdown;
|
|
43
|
+
}
|
|
44
|
+
// Best-effort source for the fallback turndown: the <body> when it carries
|
|
45
|
+
// content, else the whole document element (unwrapped fragments like a bare
|
|
46
|
+
// <form> or <div> become documentElement with an empty auto-inserted <body>).
|
|
47
|
+
function readableBody(document) {
|
|
48
|
+
const bodyHtml = document.body?.innerHTML ?? "";
|
|
49
|
+
if (bodyHtml.trim().length > 0)
|
|
50
|
+
return bodyHtml;
|
|
51
|
+
return document.documentElement?.innerHTML ?? "";
|
|
52
|
+
}
|
|
53
|
+
//# sourceMappingURL=htmlToMarkdown.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"htmlToMarkdown.js","sourceRoot":"","sources":["../src/htmlToMarkdown.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,eAAe,MAAM,UAAU,CAAC;AAEvC,0EAA0E;AAC1E,wEAAwE;AACxE,6EAA6E;AAC7E,6EAA6E;AAC7E,mDAAmD;AACnD,EAAE;AACF,+EAA+E;AAC/E,8EAA8E;AAC9E,4EAA4E;AAC5E,8EAA8E;AAC9E,6EAA6E;AAC7E,0EAA0E;AAE1E,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC;IACjC,YAAY,EAAE,KAAK;IACnB,gBAAgB,EAAE,GAAG;IACrB,cAAc,EAAE,QAAQ;CAC3B,CAAC,CAAC;AAEH,MAAM,UAAU,cAAc,CAAC,IAAY;IACvC,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,SAAS,CAAC;IAE/C,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IACrC,yEAAyE;IACzE,IAAI,QAAQ,CAAC,eAAe,KAAK,IAAI;QAAE,OAAO,SAAS,CAAC;IAExD,IAAI,WAAsC,CAAC;IAC3C,IAAI,CAAC;QACD,oEAAoE;QACpE,qDAAqD;QACrD,WAAW,GAAG,IAAI,WAAW,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAa,CAAC,CAAC,KAAK,EAAE,EAAE,OAAO,CAAC;IACzF,CAAC;IAAC,MAAM,CAAC;QACL,kEAAkE;QAClE,WAAW,GAAG,SAAS,CAAC;IAC5B,CAAC;IAED,IAAI,WAAW,KAAK,IAAI,IAAI,WAAW,KAAK,SAAS,IAAI,WAAW,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvF,WAAW,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACzC,CAAC;IAED,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,EAAE,CAAC;IACvD,OAAO,QAAQ,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC;AACxD,CAAC;AAED,2EAA2E;AAC3E,4EAA4E;AAC5E,8EAA8E;AAC9E,SAAS,YAAY,CAAC,QAAkB;IACpC,MAAM,QAAQ,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,IAAI,EAAE,CAAC;IAChD,IAAI,QAAQ,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,QAAQ,CAAC;IAChD,OAAO,QAAQ,CAAC,eAAe,EAAE,SAAS,IAAI,EAAE,CAAC;AACrD,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@plurnk/plurnk-mimetypes-text-html",
|
|
3
|
-
"version": "0.6.2",
|
|
4
|
-
"description": "text/html and application/xhtml+xml mimetype handler for plurnk-service.",
|
|
3
|
+
"version": "0.6.3",
|
|
4
|
+
"description": "text/html and application/xhtml+xml mimetype handler for plurnk-service. Structural extraction via parse5; readable markdown (content channel) via Readability + turndown.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"publishConfig": {
|
|
@@ -50,16 +50,20 @@
|
|
|
50
50
|
"prepare": "npm run build"
|
|
51
51
|
},
|
|
52
52
|
"dependencies": {
|
|
53
|
+
"@mozilla/readability": "^0.6.0",
|
|
53
54
|
"@xmldom/xmldom": "^0.9.10",
|
|
55
|
+
"linkedom": "^0.18.12",
|
|
54
56
|
"parse5": "^8.0.1",
|
|
57
|
+
"turndown": "^7.2.4",
|
|
55
58
|
"xpath": "^0.0.34"
|
|
56
59
|
},
|
|
57
60
|
"devDependencies": {
|
|
58
61
|
"@types/node": "^25.8.0",
|
|
62
|
+
"@types/turndown": "^5.0.6",
|
|
59
63
|
"typescript": "^6.0.3",
|
|
60
|
-
"@plurnk/plurnk-mimetypes": "^0.15.
|
|
64
|
+
"@plurnk/plurnk-mimetypes": "^0.15.10"
|
|
61
65
|
},
|
|
62
66
|
"peerDependencies": {
|
|
63
|
-
"@plurnk/plurnk-mimetypes": "^0.15.
|
|
67
|
+
"@plurnk/plurnk-mimetypes": "^0.15.10"
|
|
64
68
|
}
|
|
65
69
|
}
|