@rtif-sdk/formats 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/README.md +111 -0
  2. package/dist/html/codec.d.ts +22 -0
  3. package/dist/html/codec.d.ts.map +1 -0
  4. package/dist/html/codec.js +25 -0
  5. package/dist/html/codec.js.map +1 -0
  6. package/dist/html/entities.d.ts +14 -0
  7. package/dist/html/entities.d.ts.map +1 -0
  8. package/dist/html/entities.js +80 -0
  9. package/dist/html/entities.js.map +1 -0
  10. package/dist/html/index.d.ts +5 -0
  11. package/dist/html/index.d.ts.map +1 -0
  12. package/dist/html/index.js +3 -0
  13. package/dist/html/index.js.map +1 -0
  14. package/dist/html/parse-tree.d.ts +33 -0
  15. package/dist/html/parse-tree.d.ts.map +1 -0
  16. package/dist/html/parse-tree.js +191 -0
  17. package/dist/html/parse-tree.js.map +1 -0
  18. package/dist/html/parse.d.ts +28 -0
  19. package/dist/html/parse.d.ts.map +1 -0
  20. package/dist/html/parse.js +282 -0
  21. package/dist/html/parse.js.map +1 -0
  22. package/dist/html/rules.d.ts +51 -0
  23. package/dist/html/rules.d.ts.map +1 -0
  24. package/dist/html/rules.js +74 -0
  25. package/dist/html/rules.js.map +1 -0
  26. package/dist/html/serialize.d.ts +15 -0
  27. package/dist/html/serialize.d.ts.map +1 -0
  28. package/dist/html/serialize.js +68 -0
  29. package/dist/html/serialize.js.map +1 -0
  30. package/dist/markdown/codec.d.ts +15 -0
  31. package/dist/markdown/codec.d.ts.map +1 -0
  32. package/dist/markdown/codec.js +56 -0
  33. package/dist/markdown/codec.js.map +1 -0
  34. package/dist/markdown/index.d.ts +3 -0
  35. package/dist/markdown/index.d.ts.map +1 -0
  36. package/dist/markdown/index.js +3 -0
  37. package/dist/markdown/index.js.map +1 -0
  38. package/dist/markdown/parse-blocks.d.ts +25 -0
  39. package/dist/markdown/parse-blocks.d.ts.map +1 -0
  40. package/dist/markdown/parse-blocks.js +122 -0
  41. package/dist/markdown/parse-blocks.js.map +1 -0
  42. package/dist/markdown/parse-inline.d.ts +15 -0
  43. package/dist/markdown/parse-inline.d.ts.map +1 -0
  44. package/dist/markdown/parse-inline.js +164 -0
  45. package/dist/markdown/parse-inline.js.map +1 -0
  46. package/dist/markdown/serialize.d.ts +17 -0
  47. package/dist/markdown/serialize.d.ts.map +1 -0
  48. package/dist/markdown/serialize.js +120 -0
  49. package/dist/markdown/serialize.js.map +1 -0
  50. package/dist/plaintext/codec.d.ts +15 -0
  51. package/dist/plaintext/codec.d.ts.map +1 -0
  52. package/dist/plaintext/codec.js +30 -0
  53. package/dist/plaintext/codec.js.map +1 -0
  54. package/dist/plaintext/index.d.ts +3 -0
  55. package/dist/plaintext/index.d.ts.map +1 -0
  56. package/dist/plaintext/index.js +3 -0
  57. package/dist/plaintext/index.js.map +1 -0
  58. package/dist/shared/block-text.d.ts +4 -0
  59. package/dist/shared/block-text.d.ts.map +1 -0
  60. package/dist/shared/block-text.js +5 -0
  61. package/dist/shared/block-text.js.map +1 -0
  62. package/dist/shared/ids.d.ts +9 -0
  63. package/dist/shared/ids.d.ts.map +1 -0
  64. package/dist/shared/ids.js +12 -0
  65. package/dist/shared/ids.js.map +1 -0
  66. package/dist/shared/url.d.ts +20 -0
  67. package/dist/shared/url.d.ts.map +1 -0
  68. package/dist/shared/url.js +33 -0
  69. package/dist/shared/url.js.map +1 -0
  70. package/package.json +28 -0
package/README.md ADDED
@@ -0,0 +1,111 @@
1
+ # @rtif-sdk/formats
2
+
3
+ Format codecs for RTIF documents. Three subpath exports, each returning a
4
+ `FormatCodec` (from `@rtif-sdk/core`): `serialize(doc) → string` and
5
+ `parse(input) → Document`, where `parse` **never throws** and always returns a
6
+ valid normalized document.
7
+
8
+ ```bash
9
+ npm install @rtif-sdk/formats @rtif-sdk/core
10
+ ```
11
+
12
+ ```ts
13
+ import { createHtmlCodec } from '@rtif-sdk/formats/html';
14
+ import { createMarkdownCodec } from '@rtif-sdk/formats/markdown';
15
+ import { createPlaintextCodec } from '@rtif-sdk/formats/plaintext';
16
+ ```
17
+
18
+ There is no root export — import the codec you need.
19
+
20
+ ## HTML
21
+
22
+ ```ts
23
+ const html = createHtmlCodec();
24
+ html.serialize(doc); // '<p>Hello <strong>world</strong></p>'
25
+ html.parse(clipboard); // Document
26
+ ```
27
+
28
+ **DOM-free and SSR-safe.** Parsing uses a hand-rolled tokenizer with zero DOM
29
+ dependency — the codec behaves identically in Node, workers, and browsers, so
30
+ you can parse untrusted HTML server-side.
31
+
32
+ **Sanitization guarantees** (applied during parse):
33
+
34
+ - Protocol allowlist — `http:`, `https:`, `mailto:`, and relative URLs — on
35
+ *every* URL-bearing attribute: `href` **and** `src`. Anything else
36
+ (`javascript:`, `data:`, `vbscript:`, …) is dropped.
37
+ - `<script>`, `<style>`, `<iframe>`, `<object>`, `<embed>` subtrees stripped.
38
+ - All `on*` event-handler attributes removed.
39
+
40
+ Default rules cover the canonical types: `p`/`div`, `h1`–`h6`, `blockquote`,
41
+ `pre > code` (with `language-*` class), `li` (one `list` block per item),
42
+ `hr`, and the marks `strong`/`b`, `em`/`i`, `u`, `s`/`strike`/`del`, `code`,
43
+ `a[href]`.
44
+
45
+ Extend or override the rules by name with `HtmlRules` — this is how custom
46
+ marks and blocks take part in HTML copy and paste:
47
+
48
+ ```ts
49
+ const html = createHtmlCodec({
50
+ rules: {
51
+ marks: {
52
+ highlight: { tag: 'mark', parse: { tags: ['mark'] } },
53
+ },
54
+ blocks: {
55
+ callout: {
56
+ serialize: (block, inner) => `<aside class="callout">${inner}</aside>`,
57
+ parse: { tags: ['aside'] },
58
+ },
59
+ },
60
+ },
61
+ });
62
+ ```
63
+
64
+ `HtmlMarkRule`: `{ tag, attrs?(value), parse?: { tags, getValue?(attrs) } }`.
65
+ `HtmlBlockRule`: `{ serialize(block, inner), parse?: { tags, getAttrs?(attrs, tag) } }`.
66
+ User rules merge over the defaults per name. Structural tags (`ul`/`ol`/`li`,
67
+ `blockquote`, `pre`, `br`, `hr`) are handled by the parser itself and cannot be
68
+ remapped. Pass the codec to the editor via `createEditor({ codecs: { html } })`.
69
+
70
+ ## Markdown
71
+
72
+ ```ts
73
+ const md = createMarkdownCodec();
74
+ md.serialize(doc); // '# Title\n\nHello **world**'
75
+ md.parse(source); // Document
76
+ ```
77
+
78
+ A pragmatic CommonMark subset matching the canonical types:
79
+
80
+ | RTIF | Markdown |
81
+ |---|---|
82
+ | `heading` (level 1–6) | `#`–`######` |
83
+ | `blockquote` | `> ` (consecutive `>` lines merge into one block) |
84
+ | `code_block` | fenced ``` (fence sized past inner backticks; `attrs.language` on the fence) |
85
+ | `list` bulleted / ordered | `- ` / `1.` `2.` … (numbered per consecutive group) |
86
+ | `horizontal_rule` | `---` |
87
+ | `bold` / `italic` / `strikethrough` / `code` | `**` / `*` / `~~` / backticks |
88
+ | `link` | `[text](href)` — destinations pass the same protocol allowlist |
89
+
90
+ Known limits: **underline has no markdown form** and serializes as plain text;
91
+ block ids are not preserved across a round trip; single-level only (no nested
92
+ lists or quotes); no reference links, titles, or autolinks. Unparseable
93
+ constructs degrade to literal text.
94
+
95
+ ## Plaintext
96
+
97
+ ```ts
98
+ const txt = createPlaintextCodec();
99
+ txt.serialize(doc); // blocks joined by '\n'; marks and structure discarded
100
+ txt.parse('a\nb'); // one paragraph per line; '' → one empty paragraph
101
+ ```
102
+
103
+ Handles LF, CRLF, and lone CR on parse. That's the whole format.
104
+
105
+ ## Notes
106
+
107
+ - Parsers generate fresh block ids (`b1`, `b2`, … per call). Use
108
+ `assertDocEqual` from `@rtif-sdk/test-kit` (ids ignored by default) when testing
109
+ round trips.
110
+ - A custom codec is just an object satisfying `FormatCodec` — nothing here is
111
+ special-cased by the editor.
@@ -0,0 +1,22 @@
1
+ /**
2
+ * The HTML codec factory. Serialization is pure string building; parsing
3
+ * uses a hand-rolled tokenizer with zero DOM dependency, so the codec is
4
+ * SSR-safe and behaves identically in Node, workers, and browsers.
5
+ */
6
+ import type { FormatCodec } from '@rtif-sdk/core';
7
+ import type { HtmlRules } from './rules.js';
8
+ /** Options for {@link createHtmlCodec}. */
9
+ export interface HtmlCodecOptions {
10
+ /** Extra mark/block rules, merged over the canonical defaults per name. */
11
+ readonly rules?: HtmlRules;
12
+ }
13
+ /**
14
+ * Create an HTML codec.
15
+ *
16
+ * Parsing sanitizes aggressively: protocol allowlist (`http:`, `https:`,
17
+ * `mailto:`, relative) on every `href`/`src`; `<script>`/`<style>`/`<iframe>`/
18
+ * `<object>`/`<embed>` subtrees stripped; all `on*` attributes dropped.
19
+ * It never throws and always returns a valid normalized document.
20
+ */
21
+ export declare function createHtmlCodec(options?: HtmlCodecOptions): FormatCodec;
22
+ //# sourceMappingURL=codec.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"codec.d.ts","sourceRoot":"","sources":["../../src/html/codec.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAElD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAI5C,2CAA2C;AAC3C,MAAM,WAAW,gBAAgB;IAC/B,2EAA2E;IAC3E,QAAQ,CAAC,KAAK,CAAC,EAAE,SAAS,CAAC;CAC5B;AAED;;;;;;;GAOG;AACH,wBAAgB,eAAe,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,WAAW,CAOvE"}
@@ -0,0 +1,25 @@
1
+ /**
2
+ * The HTML codec factory. Serialization is pure string building; parsing
3
+ * uses a hand-rolled tokenizer with zero DOM dependency, so the codec is
4
+ * SSR-safe and behaves identically in Node, workers, and browsers.
5
+ */
6
+ import { parseHtml } from './parse.js';
7
+ import { resolveRules } from './rules.js';
8
+ import { serializeHtml } from './serialize.js';
/**
 * Build the codec object for the `html` format.
 *
 * `options.rules` (if any) is resolved once, up front, through `resolveRules`.
 * The returned `serialize` and `parse` closures share that resolved rule set
 * and delegate to `serializeHtml` and `parseHtml` respectively.
 *
 * @param options Optional settings; only `rules` is read here.
 * @returns An object with `format: 'html'`, `serialize(doc)` and `parse(input)`.
 */
export function createHtmlCodec(options) {
    const resolved = resolveRules(options?.rules);
    const serialize = (doc) => serializeHtml(doc, resolved);
    const parse = (input) => parseHtml(input, resolved);
    return { format: 'html', serialize, parse };
}
25
+ //# sourceMappingURL=codec.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"codec.js","sourceRoot":"","sources":["../../src/html/codec.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAEvC,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAQ/C;;;;;;;GAOG;AACH,MAAM,UAAU,eAAe,CAAC,OAA0B;IACxD,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;IAC3C,OAAO;QACL,MAAM,EAAE,MAAM;QACd,SAAS,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,EAAE,KAAK,CAAC;QAC7C,KAAK,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,SAAS,CAAC,KAAK,EAAE,KAAK,CAAC;KAC1C,CAAC;AACJ,CAAC"}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * HTML entity decoding and text/attribute escaping. Pure string functions —
3
+ * no DOM, so the codec is SSR-safe.
4
+ */
5
+ /**
6
+ * Decode named and numeric (decimal and hex) character references.
7
+ * Unknown references pass through literally — never throws.
8
+ */
9
+ export declare function decodeEntities(text: string): string;
10
+ /** Escape text content for HTML output (`&`, `<`, `>`). */
11
+ export declare function escapeText(text: string): string;
12
+ /** Escape an attribute value for quoted HTML output (`&`, `<`, `>`, `"`, `'`). */
13
+ export declare function escapeAttr(value: string): string;
14
+ //# sourceMappingURL=entities.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"entities.d.ts","sourceRoot":"","sources":["../../src/html/entities.ts"],"names":[],"mappings":"AAAA;;;GAGG;AA+CH;;;GAGG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAMnD;AAcD,2DAA2D;AAC3D,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAE/C;AAED,kFAAkF;AAClF,wBAAgB,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAEhD"}
@@ -0,0 +1,80 @@
1
+ /**
2
+ * HTML entity decoding and text/attribute escaping. Pure string functions —
3
+ * no DOM, so the codec is SSR-safe.
4
+ */
/** Practical subset of named entities seen in real-world pasted HTML. */
const NAMED_ENTITIES = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
    nbsp: '\u00a0',
    copy: '©',
    reg: '®',
    trade: '™',
    deg: '°',
    plusmn: '±',
    middot: '·',
    bull: '•',
    hellip: '…',
    ndash: '–',
    mdash: '—',
    lsquo: '‘',
    rsquo: '’',
    ldquo: '“',
    rdquo: '”',
    laquo: '«',
    raquo: '»',
    sect: '§',
    para: '¶',
    times: '×',
    divide: '÷',
    cent: '¢',
    pound: '£',
    yen: '¥',
    euro: '€',
};
/** Matches `&name;`, `&#123;` and `&#x1F600;`; group 1 is the body between `&` and `;`. */
const ENTITY_PATTERN = /&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);/g;
/** Decode one numeric entity body (`#65` or `#x1F600`); invalid code points become U+FFFD. */
const decodeNumeric = (body) => {
    const hex = body[1] === 'x' || body[1] === 'X';
    const codePoint = Number.parseInt(body.slice(hex ? 2 : 1), hex ? 16 : 10);
    if (Number.isNaN(codePoint) || codePoint <= 0 || codePoint > 0x10ffff)
        return '\ufffd';
    if (codePoint >= 0xd800 && codePoint <= 0xdfff)
        return '\ufffd'; // lone surrogate
    return String.fromCodePoint(codePoint); // handles astral-plane code points
};
/**
 * Look up a named entity using OWN properties of the table only.
 *
 * A plain `NAMED_ENTITIES[name]` would also resolve members inherited from
 * `Object.prototype` (`constructor`, `toString`, `valueOf`, ...), which are
 * functions; returning one from a `replace` callback would splice its source
 * text into the decoded output.
 */
const lookupNamedEntity = (name) => Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, name) ? NAMED_ENTITIES[name] : undefined;
/**
 * Decode named and numeric (decimal and hex) character references.
 *
 * Named references are matched exactly first, then by their lowercased name
 * (so `&NBSP;` decodes). Unknown references — including names that merely
 * collide with built-in object members such as `&constructor;` — pass through
 * literally. Never throws.
 *
 * @param text Raw text that may contain character references.
 * @returns The text with every recognized reference replaced.
 */
export function decodeEntities(text) {
    // Fast path: without an ampersand there is nothing to decode.
    if (!text.includes('&'))
        return text;
    return text.replace(ENTITY_PATTERN, (full, body) => {
        if (body.startsWith('#'))
            return decodeNumeric(body);
        return lookupNamedEntity(body) ?? lookupNamedEntity(body.toLowerCase()) ?? full;
    });
}
/** Replacement table for the characters escaped in HTML text content. */
const TEXT_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;' };
/** The text replacements plus both quote characters, for quoted attribute values. */
const ATTR_ESCAPES = Object.assign({}, TEXT_ESCAPES, { '"': '&quot;', "'": '&#39;' });
/**
 * Escape text content for HTML output.
 *
 * @param text Raw text.
 * @returns The text with `&`, `<` and `>` replaced by character references.
 */
export function escapeText(text) {
    const toReference = (ch) => TEXT_ESCAPES[ch];
    return text.replace(/[&<>]/g, toReference);
}
/**
 * Escape an attribute value for quoted HTML output.
 *
 * @param value Raw attribute value.
 * @returns The value with `&`, `<`, `>`, `"` and `'` replaced by character references.
 */
export function escapeAttr(value) {
    const toReference = (ch) => ATTR_ESCAPES[ch];
    return value.replace(/[&<>"']/g, toReference);
}
80
+ //# sourceMappingURL=entities.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"entities.js","sourceRoot":"","sources":["../../src/html/entities.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,yEAAyE;AACzE,MAAM,cAAc,GAAqC;IACvD,GAAG,EAAE,GAAG;IACR,EAAE,EAAE,GAAG;IACP,EAAE,EAAE,GAAG;IACP,IAAI,EAAE,GAAG;IACT,IAAI,EAAE,GAAG;IACT,IAAI,EAAE,QAAQ;IACd,IAAI,EAAE,GAAG;IACT,GAAG,EAAE,GAAG;IACR,KAAK,EAAE,GAAG;IACV,GAAG,EAAE,GAAG;IACR,MAAM,EAAE,GAAG;IACX,MAAM,EAAE,GAAG;IACX,IAAI,EAAE,GAAG;IACT,MAAM,EAAE,GAAG;IACX,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GAAG;IACV,IAAI,EAAE,GAAG;IACT,IAAI,EAAE,GAAG;IACT,KAAK,EAAE,GAAG;IACV,MAAM,EAAE,GAAG;IACX,IAAI,EAAE,GAAG;IACT,KAAK,EAAE,GAAG;IACV,GAAG,EAAE,GAAG;IACR,IAAI,EAAE,GAAG;CACV,CAAC;AAEF,MAAM,cAAc,GAAG,qDAAqD,CAAC;AAE7E,8FAA8F;AAC9F,MAAM,aAAa,GAAG,CAAC,IAAY,EAAU,EAAE;IAC7C,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,KAAK,GAAG,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC;IAC/C,MAAM,SAAS,GAAG,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAC1E,IAAI,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,IAAI,SAAS,IAAI,CAAC,IAAI,SAAS,GAAG,QAAQ;QAAE,OAAO,QAAQ,CAAC;IACvF,IAAI,SAAS,IAAI,MAAM,IAAI,SAAS,IAAI,MAAM;QAAE,OAAO,QAAQ,CAAC,CAAC,iBAAiB;IAClF,OAAO,MAAM,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC,CAAC,mCAAmC;AAC7E,CAAC,CAAC;AAEF;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC;QAAE,OAAO,IAAI,CAAC;IACrC,OAAO,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,CAAC,IAAI,EAAE,IAAY,EAAE,EAAE;QACzD,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,OAAO,aAAa,CAAC,IAAI,CAAC,CAAC;QACrD,OAAO,cAAc,CAAC,IAAI,CAAC,IAAI,cAAc,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,IAAI,IAAI,CAAC;IAC5E,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,YAAY,GAAqC;IACrD,GAAG,EAAE,OAAO;IACZ,GAAG,EAAE,MAAM;IACX,GAAG,EAAE,MAAM;CACZ,CAAC;AAEF,MAAM,YAAY,GAAqC;IACrD,GAAG,YAAY;IACf,GAAG,EAAE,QAAQ;IACb,GAAG,EAAE,OAAO;CACb,CAAC;AAEF,2DAA2D;AAC3D,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,OAAO,IAAI,CAAC,OAAO,CAAC,Q
AAQ,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,YAAY,CAAC,EAAE,CAAW,CAAC,CAAC;AACpE,CAAC;AAED,kFAAkF;AAClF,MAAM,UAAU,UAAU,CAAC,KAAa;IACtC,OAAO,KAAK,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,YAAY,CAAC,EAAE,CAAW,CAAC,CAAC;AACvE,CAAC"}
@@ -0,0 +1,5 @@
1
+ /** @rtif-sdk/formats/html — DOM-free, SSR-safe HTML codec with sanitization. */
2
+ export { createHtmlCodec } from './codec.js';
3
+ export type { HtmlCodecOptions } from './codec.js';
4
+ export type { HtmlBlockRule, HtmlMarkRule, HtmlRules } from './rules.js';
5
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/html/index.ts"],"names":[],"mappings":"AAAA,gFAAgF;AAEhF,OAAO,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAC7C,YAAY,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AACnD,YAAY,EAAE,aAAa,EAAE,YAAY,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC"}
@@ -0,0 +1,3 @@
1
+ /** @rtif-sdk/formats/html — DOM-free, SSR-safe HTML codec with sanitization. */
2
+ export { createHtmlCodec } from './codec.js';
3
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/html/index.ts"],"names":[],"mappings":"AAAA,gFAAgF;AAEhF,OAAO,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC"}
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Hand-rolled HTML tokenizer and tree builder — zero DOM dependency, so the
3
+ * codec runs identically in browsers, Node, workers, and SSR. It covers the
4
+ * practical subset rich-text paste needs: tags with every attribute-quoting
5
+ * variant, entity decoding, void elements, comments/doctype, unclosed tags,
6
+ * and mismatched close tags (matched by name against the open-element stack).
7
+ *
8
+ * Sanitization starts here:
9
+ * - `<script>`/`<style>`/`<iframe>` and other raw-text containers are
10
+ * skipped together with their entire content.
11
+ * - `on*` event-handler attributes are dropped.
12
+ * - `href`/`src` values failing the protocol allowlist are removed.
13
+ *
14
+ * Known limitation: attribute values containing a literal `>` require quotes
15
+ * (the tag scanner honors quotes, so quoted values are handled correctly).
16
+ */
17
+ /** An element in the parsed tree; attribute names are lowercased, values entity-decoded. */
18
+ export interface HtmlElementNode {
19
+ readonly kind: 'element';
20
+ readonly tag: string;
21
+ readonly attrs: Readonly<Record<string, string>>;
22
+ readonly children: readonly HtmlNode[];
23
+ }
24
+ /** A text node; entities are already decoded. */
25
+ export interface HtmlTextNode {
26
+ readonly kind: 'text';
27
+ readonly text: string;
28
+ }
29
+ /** A node in the parsed HTML tree. */
30
+ export type HtmlNode = HtmlElementNode | HtmlTextNode;
31
+ /** Parse an HTML string into a tree of nodes. Total: never throws. */
32
+ export declare function parseHtmlTree(html: string): readonly HtmlNode[];
33
+ //# sourceMappingURL=parse-tree.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"parse-tree.d.ts","sourceRoot":"","sources":["../../src/html/parse-tree.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAKH,4FAA4F;AAC5F,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,IAAI,EAAE,SAAS,CAAC;IACzB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;IACjD,QAAQ,CAAC,QAAQ,EAAE,SAAS,QAAQ,EAAE,CAAC;CACxC;AAED,iDAAiD;AACjD,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAED,sCAAsC;AACtC,MAAM,MAAM,QAAQ,GAAG,eAAe,GAAG,YAAY,CAAC;AAuCtD,sEAAsE;AACtE,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,QAAQ,EAAE,CAyB/D"}
@@ -0,0 +1,191 @@
1
+ /**
2
+ * Hand-rolled HTML tokenizer and tree builder — zero DOM dependency, so the
3
+ * codec runs identically in browsers, Node, workers, and SSR. It covers the
4
+ * practical subset rich-text paste needs: tags with every attribute-quoting
5
+ * variant, entity decoding, void elements, comments/doctype, unclosed tags,
6
+ * and mismatched close tags (matched by name against the open-element stack).
7
+ *
8
+ * Sanitization starts here:
9
+ * - `<script>`/`<style>`/`<iframe>` and other raw-text containers are
10
+ * skipped together with their entire content.
11
+ * - `on*` event-handler attributes are dropped.
12
+ * - `href`/`src` values failing the protocol allowlist are removed.
13
+ *
14
+ * Known limitation: attribute values containing a literal `>` require quotes
15
+ * (the tag scanner honors quotes, so quoted values are handled correctly).
16
+ */
17
+ import { sanitizeUrl } from '../shared/url.js';
18
+ import { decodeEntities } from './entities.js';
/** Elements that never have children; `openTag` does not descend into them. */
const VOID_TAGS = new Set('area base br col embed hr img input link meta param source track wbr'.split(' '));
/** Tags whose content is raw text we never want — skipped to the matching close tag. */
const RAW_SKIP_TAGS = new Set('script style iframe noscript textarea title'.split(' '));
/** Attributes whose values are run through `sanitizeUrl` before being kept. */
const URL_ATTRS = new Set().add('href').add('src');
/**
 * One attribute per match: group 1 is the name, then at most one of
 * group 2 (="double"), group 3 (='single') or group 4 (=bare) is the value.
 * The regex is global, so callers must reset `lastIndex` before scanning.
 */
const ATTR_PATTERN = /([^\s=/>"']+)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]*)))?/g;
+ /** Parse an HTML string into a tree of nodes. Total: never throws. */
48
+ export function parseHtmlTree(html) {
49
+ const root = [];
50
+ const stack = [{ tag: '', children: root }];
51
+ let i = 0;
52
+ while (i < html.length) {
53
+ if (html[i] !== '<') {
54
+ const next = html.indexOf('<', i);
55
+ const end = next === -1 ? html.length : next;
56
+ pushText(stack, html.slice(i, end));
57
+ i = end;
58
+ continue;
59
+ }
60
+ const after = html[i + 1];
61
+ if (after === '!' || after === '?') {
62
+ i = skipDeclaration(html, i);
63
+ }
64
+ else if (after === '/') {
65
+ i = closeTag(html, i, stack);
66
+ }
67
+ else if (after === undefined || !/[a-zA-Z]/.test(after)) {
68
+ pushText(stack, '<'); // a lone '<' is text, e.g. "a < b"
69
+ i++;
70
+ }
71
+ else {
72
+ i = openTag(html, i, stack);
73
+ }
74
+ }
75
+ return root;
76
+ }
77
+ /** Append decoded text, merging with a preceding text node. */
78
+ const pushText = (stack, raw) => {
79
+ if (raw === '')
80
+ return;
81
+ const text = decodeEntities(raw);
82
+ const children = stack[stack.length - 1].children;
83
+ const last = children[children.length - 1];
84
+ if (last !== undefined && last.kind === 'text') {
85
+ children[children.length - 1] = { kind: 'text', text: last.text + text };
86
+ }
87
+ else {
88
+ children.push({ kind: 'text', text });
89
+ }
90
+ };
91
+ /** Skip `<!-- comments -->`, `<!doctype>`, and `<? ... >`. */
92
+ const skipDeclaration = (html, i) => {
93
+ if (html.startsWith('<!--', i)) {
94
+ const end = html.indexOf('-->', i + 4);
95
+ return end === -1 ? html.length : end + 3;
96
+ }
97
+ const end = html.indexOf('>', i);
98
+ return end === -1 ? html.length : end + 1;
99
+ };
100
+ /** Handle `</tag>`: pop the stack to the named element; ignore when unmatched. */
101
+ const closeTag = (html, i, stack) => {
102
+ const end = html.indexOf('>', i);
103
+ if (end === -1)
104
+ return html.length;
105
+ const name = /[a-zA-Z][a-zA-Z0-9-]*/.exec(html.slice(i + 2, end))?.[0]?.toLowerCase();
106
+ if (name !== undefined) {
107
+ for (let frame = stack.length - 1; frame >= 1; frame--) {
108
+ if (stack[frame].tag === name) {
109
+ stack.length = frame; // pop the element and everything left open inside it
110
+ break;
111
+ }
112
+ }
113
+ }
114
+ return end + 1;
115
+ };
116
+ /** Find the index of the `>` ending the tag that starts at `from`, honoring quoted values. */
117
+ const scanTagEnd = (html, from) => {
118
+ let quote = null;
119
+ for (let j = from; j < html.length; j++) {
120
+ const ch = html[j];
121
+ if (quote !== null) {
122
+ if (ch === quote)
123
+ quote = null;
124
+ }
125
+ else if (ch === '"' || ch === "'") {
126
+ quote = ch;
127
+ }
128
+ else if (ch === '>') {
129
+ return j;
130
+ }
131
+ }
132
+ return -1;
133
+ };
134
+ /** Parse an opening tag, push the element, and descend unless void/self-closing. */
135
+ const openTag = (html, i, stack) => {
136
+ const tagEnd = scanTagEnd(html, i + 1);
137
+ if (tagEnd === -1) {
138
+ pushText(stack, html.slice(i)); // malformed trailing tag — keep as text
139
+ return html.length;
140
+ }
141
+ const content = html.slice(i + 1, tagEnd);
142
+ const selfClosing = content.endsWith('/');
143
+ const body = selfClosing ? content.slice(0, -1) : content;
144
+ const nameMatch = /^[a-zA-Z][a-zA-Z0-9-]*/.exec(body);
145
+ if (nameMatch === null)
146
+ return tagEnd + 1;
147
+ const tag = nameMatch[0].toLowerCase();
148
+ if (RAW_SKIP_TAGS.has(tag))
149
+ return skipRawContent(html, tagEnd + 1, tag);
150
+ const children = [];
151
+ const element = {
152
+ kind: 'element',
153
+ tag,
154
+ attrs: parseAttrs(body.slice(nameMatch[0].length)),
155
+ children,
156
+ };
157
+ stack[stack.length - 1].children.push(element);
158
+ if (!selfClosing && !VOID_TAGS.has(tag))
159
+ stack.push({ tag, children });
160
+ return tagEnd + 1;
161
+ };
162
+ /** Skip everything up to and including `</tag>` (case-insensitive); to EOF when unclosed. */
163
+ const skipRawContent = (html, from, tag) => {
164
+ const close = html.toLowerCase().indexOf(`</${tag}`, from);
165
+ if (close === -1)
166
+ return html.length;
167
+ const end = html.indexOf('>', close);
168
+ return end === -1 ? html.length : end + 1;
169
+ };
170
+ /** Parse attributes, dropping `on*` handlers and disallowed `href`/`src` URLs. */
171
+ const parseAttrs = (raw) => {
172
+ const attrs = {};
173
+ ATTR_PATTERN.lastIndex = 0;
174
+ let match;
175
+ while ((match = ATTR_PATTERN.exec(raw)) !== null) {
176
+ const name = match[1].toLowerCase();
177
+ const value = decodeEntities(match[2] ?? match[3] ?? match[4] ?? '');
178
+ if (name.startsWith('on'))
179
+ continue; // event handlers never survive
180
+ if (URL_ATTRS.has(name)) {
181
+ const safe = sanitizeUrl(value);
182
+ if (safe !== null && !(name in attrs))
183
+ attrs[name] = safe;
184
+ continue;
185
+ }
186
+ if (!(name in attrs))
187
+ attrs[name] = value;
188
+ }
189
+ return attrs;
190
+ };
191
+ //# sourceMappingURL=parse-tree.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"parse-tree.js","sourceRoot":"","sources":["../../src/html/parse-tree.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAmB/C,MAAM,SAAS,GAAwB,IAAI,GAAG,CAAC;IAC7C,MAAM;IACN,MAAM;IACN,IAAI;IACJ,KAAK;IACL,OAAO;IACP,IAAI;IACJ,KAAK;IACL,OAAO;IACP,MAAM;IACN,MAAM;IACN,OAAO;IACP,QAAQ;IACR,OAAO;IACP,KAAK;CACN,CAAC,CAAC;AAEH,wFAAwF;AACxF,MAAM,aAAa,GAAwB,IAAI,GAAG,CAAC;IACjD,QAAQ;IACR,OAAO;IACP,QAAQ;IACR,UAAU;IACV,UAAU;IACV,OAAO;CACR,CAAC,CAAC;AAEH,MAAM,SAAS,GAAwB,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC;AAEhE,wEAAwE;AACxE,MAAM,YAAY,GAAG,6DAA6D,CAAC;AAOnF,sEAAsE;AACtE,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,IAAI,GAAe,EAAE,CAAC;IAC5B,MAAM,KAAK,GAAiB,CAAC,EAAE,GAAG,EAAE,EAAE,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1D,IAAI,CAAC,GAAG,CAAC,CAAC;IACV,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACvB,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,GAAG,EAAE,CAAC;YACpB,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;YAClC,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC;YAC7C,QAAQ,CAAC,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;YACpC,CAAC,GAAG,GAAG,CAAC;YACR,SAAS;QACX,CAAC;QACD,MAAM,KAAK,GAAG,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAC1B,IAAI,KAAK,KAAK,GAAG,IAAI,KAAK,KAAK,GAAG,EAAE,CAAC;YACnC,CAAC,GAAG,eAAe,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QAC/B,CAAC;aAAM,IAAI,KAAK,KAAK,GAAG,EAAE,CAAC;YACzB,CAAC,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC,EAAE,KAAK,CAAC,CAAC;QAC/B,CAAC;aAAM,IAAI,KAAK,KAAK,SAAS,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1D,QAAQ,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC,mCAAmC;YACzD,CAAC,EAAE,CAAC;QACN,CAAC;aAAM,CAAC;YACN,CAAC,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC,EAAE,KAAK,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,+DAA+D;AAC/D,MAAM,QAAQ,GAAG,CAAC,KAA4B,EAAE,GAAW,EAAQ,EAAE;IACnE,IAAI,GAAG,KAAK,EAAE;QAAE,OAAO;IACvB,MAAM,IAAI,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC;IACjC,MAAM,QAAQ,GAAI,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAgB,C
AAC,QAAQ,CAAC;IAClE,MAAM,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC3C,IAAI,IAAI,KAAK,SAAS,IAAI,IAAI,CAAC,IAAI,KAAK,MAAM,EAAE,CAAC;QAC/C,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,GAAG,IAAI,EAAE,CAAC;IAC3E,CAAC;SAAM,CAAC;QACN,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC;IACxC,CAAC;AACH,CAAC,CAAC;AAEF,8DAA8D;AAC9D,MAAM,eAAe,GAAG,CAAC,IAAY,EAAE,CAAS,EAAU,EAAE;IAC1D,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,EAAE,CAAC;QAC/B,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QACvC,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;IAC5C,CAAC;IACD,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;IACjC,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;AAC5C,CAAC,CAAC;AAEF,kFAAkF;AAClF,MAAM,QAAQ,GAAG,CAAC,IAAY,EAAE,CAAS,EAAE,KAAmB,EAAU,EAAE;IACxE,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;IACjC,IAAI,GAAG,KAAK,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC,MAAM,CAAC;IACnC,MAAM,IAAI,GAAG,uBAAuB,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC;IACtF,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;QACvB,KAAK,IAAI,KAAK,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,KAAK,IAAI,CAAC,EAAE,KAAK,EAAE,EAAE,CAAC;YACvD,IAAK,KAAK,CAAC,KAAK,CAAgB,CAAC,GAAG,KAAK,IAAI,EAAE,CAAC;gBAC9C,KAAK,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,qDAAqD;gBAC3E,MAAM;YACR,CAAC;QACH,CAAC;IACH,CAAC;IACD,OAAO,GAAG,GAAG,CAAC,CAAC;AACjB,CAAC,CAAC;AAEF,8FAA8F;AAC9F,MAAM,UAAU,GAAG,CAAC,IAAY,EAAE,IAAY,EAAU,EAAE;IACxD,IAAI,KAAK,GAAkB,IAAI,CAAC;IAChC,KAAK,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAW,CAAC;QAC7B,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;YACnB,IAAI,EAAE,KAAK,KAAK;gBAAE,KAAK,GAAG,IAAI,CAAC;QACjC,CAAC;aAAM,IAAI,EAAE,KAAK,GAAG,IAAI,EAAE,KAAK,GAAG,EAAE,CAAC;YACpC,KAAK,GAAG,EAAE,CAAC;QACb,CAAC;aAAM,IAAI,EAAE,KAAK,GAAG,EAAE,CAAC;YACtB,OAAO
,CAAC,CAAC;QACX,CAAC;IACH,CAAC;IACD,OAAO,CAAC,CAAC,CAAC;AACZ,CAAC,CAAC;AAEF,oFAAoF;AACpF,MAAM,OAAO,GAAG,CAAC,IAAY,EAAE,CAAS,EAAE,KAAmB,EAAU,EAAE;IACvE,MAAM,MAAM,GAAG,UAAU,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACvC,IAAI,MAAM,KAAK,CAAC,CAAC,EAAE,CAAC;QAClB,QAAQ,CAAC,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,wCAAwC;QACxE,OAAO,IAAI,CAAC,MAAM,CAAC;IACrB,CAAC;IACD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,EAAE,MAAM,CAAC,CAAC;IAC1C,MAAM,WAAW,GAAG,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,IAAI,GAAG,WAAW,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC;IAC1D,MAAM,SAAS,GAAG,wBAAwB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACtD,IAAI,SAAS,KAAK,IAAI;QAAE,OAAO,MAAM,GAAG,CAAC,CAAC;IAC1C,MAAM,GAAG,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;IACvC,IAAI,aAAa,CAAC,GAAG,CAAC,GAAG,CAAC;QAAE,OAAO,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC;IACzE,MAAM,QAAQ,GAAe,EAAE,CAAC;IAChC,MAAM,OAAO,GAAoB;QAC/B,IAAI,EAAE,SAAS;QACf,GAAG;QACH,KAAK,EAAE,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QAClD,QAAQ;KACT,CAAC;IACD,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAgB,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC/D,IAAI,CAAC,WAAW,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC;QAAE,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,QAAQ,EAAE,CAAC,CAAC;IACvE,OAAO,MAAM,GAAG,CAAC,CAAC;AACpB,CAAC,CAAC;AAEF,6FAA6F;AAC7F,MAAM,cAAc,GAAG,CAAC,IAAY,EAAE,IAAY,EAAE,GAAW,EAAU,EAAE;IACzE,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,KAAK,GAAG,EAAE,EAAE,IAAI,CAAC,CAAC;IAC3D,IAAI,KAAK,KAAK,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC,MAAM,CAAC;IACrC,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IACrC,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;AAC5C,CAAC,CAAC;AAEF,kFAAkF;AAClF,MAAM,UAAU,GAAG,CAAC,GAAW,EAA0B,EAAE;IACzD,MAAM,KAAK,GAA2B,EAAE,CAAC;IACzC,YAAY,CAAC,SAAS,GAAG,CAAC,CAAC;IAC3B,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACjD,MAAM,IAAI,GAAI,KAAK,CAAC,CA
AC,CAAY,CAAC,WAAW,EAAE,CAAC;QAChD,MAAM,KAAK,GAAG,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QACrE,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC;YAAE,SAAS,CAAC,+BAA+B;QACpE,IAAI,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;YACxB,MAAM,IAAI,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC;YAChC,IAAI,IAAI,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI,IAAI,KAAK,CAAC;gBAAE,KAAK,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;YAC1D,SAAS;QACX,CAAC;QACD,IAAI,CAAC,CAAC,IAAI,IAAI,KAAK,CAAC;YAAE,KAAK,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC;IAC5C,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC,CAAC"}
@@ -0,0 +1,28 @@
1
+ /**
2
+ * HTML → Document. Walks the sanitized tree from parse-tree.ts and
3
+ * accumulates normalized blocks. Best-effort: never throws, always returns
4
+ * a valid document with ≥ 1 block. No DOM dependency — SSR-safe.
5
+ *
6
+ * Documented behavior:
7
+ * - `<br>` ends the current block and starts a new one of the *same* type
8
+ * (so `<p>a<br>b</p>` yields two paragraphs; `<br><br>` keeps the empty
9
+ * block between them).
10
+ * - Nested `<ul>`/`<ol>` flatten to consecutive `list` blocks; nesting depth
11
+ * is recorded as `attrs.indent` (absent at the top level). `indent` is
12
+ * parse-only metadata — serialization ignores it.
13
+ * - Unknown elements are transparent (children are parsed in place); loose
14
+ * text at block level becomes a paragraph.
15
+ * - Inside `<blockquote>` and `<li>`, paragraph-ish children adopt the
16
+ * container's block type: `<blockquote><p>a</p><p>b</p></blockquote>`
17
+ * yields two blockquote blocks.
18
+ * - Whitespace collapses per HTML semantics except inside `<pre>`;
19
+ * block-edge whitespace is trimmed; whitespace-only text between blocks
20
+ * is dropped. `&nbsp;` (U+00A0) never collapses.
21
+ * - Styles are ignored: Google-Docs-style `<b style="font-weight:normal">`
22
+ * parses as bold.
23
+ */
24
+ import type { Document } from '@rtif-sdk/core';
25
+ import type { ResolvedHtmlRules } from './rules.js';
26
+ /** Parse an HTML string into a valid normalized document. Never throws. */
27
+ export declare function parseHtml(input: string, rules: ResolvedHtmlRules): Document;
28
+ //# sourceMappingURL=parse.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"parse.d.ts","sourceRoot":"","sources":["../../src/html/parse.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EAAgB,QAAQ,EAAe,MAAM,gBAAgB,CAAC;AAK1E,OAAO,KAAK,EAA+B,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAyRjF,2EAA2E;AAC3E,wBAAgB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,iBAAiB,GAAG,QAAQ,CAuB3E"}