xml-to-html-converter 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -10,12 +10,40 @@ A zero-dependency Node.js package for converting XML to HTML. Currently in pre-1
10
10
 
11
11
  ---
12
12
 
13
+ ## v0.1.x: XML Node Extraction & Scaffolding
14
+
15
+ Version `0.1.x` is focused entirely on parsing raw XML into a structured tree of nodes. The `scaffold` function walks an XML string and produces an array of `XmlNode` objects, each carrying its role, its raw source text, and its position in the document, both globally across the full document and locally within its parent.
16
+
17
+ ```ts
18
+ interface XmlNode {
19
+ role: XmlNodeRole;
20
+ raw: string;
21
+ globalIndex: number;
22
+ localIndex: number;
23
+ children?: XmlNode[];
24
+ malformed?: true;
25
+ }
26
+
27
+ type XmlNodeRole =
28
+ | "closeTag"
29
+ | "comment"
30
+ | "doctype"
31
+ | "openTag"
32
+ | "processingInstruction"
33
+ | "selfTag"
34
+ | "textLeaf";
35
+ ```
36
+
37
+ This scaffold is the foundation everything else will be built on. No transformation, no HTML output, no opinions about content, just an accurate, traversable representation of what the XML says.
38
+
39
+ ---
40
+
13
41
  > **Where I am right now**
14
42
  >
15
- > `v0.x` is building the scaffold - a structural tree of every node in your XML document, each carrying its raw source string and its exact position in the document. This scaffold is what the HTML converter will walk when it's built.
43
+ > `v0.x` is building the scaffold: a structural tree of every node in your XML document, each carrying its raw source string and its exact position in the document. This scaffold is what the HTML converter will walk when it's built.
16
44
  >
17
- > - **`scaffold(xml)`** reads any XML string and returns a nested token tree
18
- > - Every token knows its `role`, its `raw` source string, its `globalIndex` in the document, and its `localIndex` within its parent
45
+ > - **`scaffold(xml)`** reads any XML string and returns a nested node tree
46
+ > - Every node knows its `role`, its `raw` source string, its `globalIndex` in the document, and its `localIndex` within its parent
19
47
  > - Broken XML never causes `scaffold` to throw: malformed nodes are flagged with `malformed: true` in place and the tree is built regardless
20
48
  >
21
49
  > `v1.0.0` is when this package becomes what it says it is: a full XML-to-HTML converter. Everything before that is the work to get there.
@@ -45,7 +73,7 @@ const tree = scaffold(`
45
73
  `);
46
74
  ```
47
75
 
48
- `scaffold` returns a flat array of root-level tokens. Each `openTag` token carries its children nested inside it:
76
+ `scaffold` returns a flat array of root-level nodes. Each `openTag` node carries its children nested inside it:
49
77
 
50
78
  ```json
51
79
  [
@@ -58,25 +86,25 @@ const tree = scaffold(`
58
86
  {
59
87
  "role": "openTag",
60
88
  "raw": "<bookstore>",
61
- "globalIndex": 2,
62
- "localIndex": 2,
89
+ "globalIndex": 1,
90
+ "localIndex": 1,
63
91
  "children": [
64
92
  {
65
93
  "role": "openTag",
66
94
  "raw": "<book category=\"cooking\">",
67
- "globalIndex": 4,
68
- "localIndex": 1,
95
+ "globalIndex": 2,
96
+ "localIndex": 0,
69
97
  "children": [
70
98
  {
71
99
  "role": "openTag",
72
100
  "raw": "<title lang=\"en\">",
73
- "globalIndex": 6,
74
- "localIndex": 1,
101
+ "globalIndex": 3,
102
+ "localIndex": 0,
75
103
  "children": [
76
104
  {
77
105
  "role": "textLeaf",
78
106
  "raw": "Everyday Italian",
79
- "globalIndex": 7,
107
+ "globalIndex": 4,
80
108
  "localIndex": 0
81
109
  }
82
110
  ]
@@ -90,31 +118,32 @@ const tree = scaffold(`
90
118
 
91
119
  ---
92
120
 
93
- ## Token Shape
121
+ ## Node Shape
94
122
 
95
- Every token in the tree has the following fields:
123
+ Every node in the tree has the following fields:
96
124
 
97
- | Field | Type | Description |
98
- | ------------- | ----------- | ---------------------------------------------------- |
99
- | `role` | `TokenRole` | What kind of token this is |
100
- | `raw` | `string` | The exact source string, untouched |
101
- | `globalIndex` | `number` | Position in the entire document (never resets) |
102
- | `localIndex` | `number` | Position within the parent's children array |
103
- | `children` | `Token[]` | Present only on `openTag` - the nested tokens inside |
104
- | `malformed` | `true` | Present only when the structure is broken |
125
+ | Field | Type | Description |
126
+ | ------------- | ------------- | --------------------------------------------------- |
127
+ | `role` | `XmlNodeRole` | What kind of node this is |
128
+ | `raw` | `string` | The exact source string, untouched |
129
+ | `globalIndex` | `number` | Position in the entire document (never resets) |
130
+ | `localIndex` | `number` | Position within the parent's children array |
131
+ | `children` | `XmlNode[]` | Present only on `openTag` - the nested nodes inside |
132
+ | `malformed` | `true` | Present only when the structure is broken |
105
133
 
106
134
  ---
107
135
 
108
- ## Token Roles
136
+ ## Node Roles
109
137
 
110
- | Role | Has children | Description |
111
- | ----------------------- | ------------ | -------------------------------------------- |
112
- | `openTag` | yes | An opening tag, e.g. `<book category="web">` |
113
- | `selfTag` | no | A self-closing tag, e.g. `<br/>` |
114
- | `closeTag` | no | Only appears when stray (no matching open) |
115
- | `processingInstruction` | no | e.g. `<?xml version="1.0"?>` |
116
- | `comment` | no | e.g. `<!-- a comment -->` |
117
- | `textLeaf` | no | Text content between tags |
138
+ | Role | Has children | Description |
139
+ | ----------------------- | ------------ | --------------------------------------------------- |
140
+ | `openTag` | yes | An opening tag, e.g. `<book category="web">` |
141
+ | `selfTag` | no | A self-closing tag, e.g. `<br/>` |
142
+ | `closeTag` | no | Appears only when stray (no matching open tag) |
143
+ | `processingInstruction` | no | e.g. `<?xml version="1.0"?>` |
144
+ | `comment` | no | e.g. `<!-- a comment -->` |
145
+ | `textLeaf` | no | Text content between tags, including CDATA sections |
146
+ | `doctype` | no | e.g. `<!DOCTYPE html>` or `<!DOCTYPE root [...]>` |
118
147
 
119
148
  ---
120
149
 
@@ -171,6 +200,27 @@ const tree = scaffold("<root><unclosed><valid>text</valid></root>");
171
200
 
172
201
  ---
173
202
 
203
+ ## Exports
204
+
205
+ ```ts
206
+ import { scaffold, isMalformed } from "xml-to-html-converter";
207
+ import type {
208
+ XmlNode,
209
+ XmlNodeRole,
210
+ MalformedXmlNode,
211
+ } from "xml-to-html-converter";
212
+ ```
213
+
214
+ | Export | Kind | Description |
215
+ | ------------------ | -------- | --------------------------------------------------- |
216
+ | `scaffold` | function | Parses an XML string and returns a node tree |
217
+ | `isMalformed` | function | Type guard, narrows `XmlNode` to `MalformedXmlNode` |
218
+ | `XmlNode` | type | The shape of every node in the tree |
219
+ | `XmlNodeRole` | type | Union of all valid role strings |
220
+ | `MalformedXmlNode` | type | `XmlNode` narrowed to `{ malformed: true }` |
221
+
222
+ ---
223
+
174
224
  ## Requirements
175
225
 
176
226
  Node.js `>=20.0.0`
package/dist/index.d.ts CHANGED
@@ -1,88 +1,13 @@
1
- interface DocumentNode {
2
- type: 'document';
3
- children: Node[];
4
- }
5
- interface ElementNode {
6
- type: 'element';
7
- tag: string;
8
- attributes: Record<string, string>;
9
- children: Node[];
10
- malformed?: true;
11
- }
12
- interface TextNode {
13
- type: 'text';
14
- value: string;
15
- }
16
- interface CommentNode {
17
- type: 'comment';
18
- value: string;
19
- }
20
- interface CDataNode {
21
- type: 'cdata';
22
- value: string;
23
- }
24
- interface ProcessingInstructionNode {
25
- type: 'processing-instruction';
26
- target: string;
27
- attributes: Record<string, string>;
28
- }
29
- interface MalformedNode {
30
- type: 'malformed';
31
- raw: string;
32
- malformed: true;
33
- }
34
- type Node = ElementNode | TextNode | CommentNode | CDataNode | ProcessingInstructionNode | MalformedNode;
35
-
36
- declare function parse(xml: string): DocumentNode;
37
-
38
- declare const TokenType: {
39
- readonly PROCESSING_INSTRUCTION: "processing-instruction";
40
- readonly ELEMENT_OPEN: "element-open";
41
- readonly ELEMENT_CLOSE: "element-close";
42
- readonly SELF_CLOSING: "self-closing";
43
- readonly TEXT: "text";
44
- readonly COMMENT: "comment";
45
- readonly CDATA: "cdata";
46
- readonly MALFORMED: "malformed";
47
- };
48
- interface ProcessingInstructionToken {
49
- type: typeof TokenType.PROCESSING_INSTRUCTION;
50
- target: string;
51
- attributes: Record<string, string>;
52
- }
53
- interface ElementOpenToken {
54
- type: typeof TokenType.ELEMENT_OPEN;
55
- tag: string;
56
- attributes: Record<string, string>;
57
- }
58
- interface ElementCloseToken {
59
- type: typeof TokenType.ELEMENT_CLOSE;
60
- tag: string;
61
- }
62
- interface SelfClosingToken {
63
- type: typeof TokenType.SELF_CLOSING;
64
- tag: string;
65
- attributes: Record<string, string>;
66
- }
67
- interface TextToken {
68
- type: typeof TokenType.TEXT;
69
- value: string;
70
- }
71
- interface CommentToken {
72
- type: typeof TokenType.COMMENT;
73
- value: string;
74
- }
75
- interface CDataToken {
76
- type: typeof TokenType.CDATA;
77
- value: string;
78
- }
79
- interface MalformedToken {
80
- type: typeof TokenType.MALFORMED;
1
+ type XmlNodeRole = "openTag" | "closeTag" | "selfTag" | "processingInstruction" | "comment" | "textLeaf";
2
+ interface XmlNode {
3
+ role: XmlNodeRole;
81
4
  raw: string;
5
+ globalIndex: number;
6
+ localIndex: number;
7
+ children?: XmlNode[];
8
+ malformed?: true;
82
9
  }
83
- type Token = ProcessingInstructionToken | ElementOpenToken | ElementCloseToken | SelfClosingToken | TextToken | CommentToken | CDataToken | MalformedToken;
84
- type ContentToken = Exclude<Token, ElementOpenToken | ElementCloseToken>;
85
10
 
86
- declare function tokenize(xml: string): Token[];
11
+ declare function scaffold(xml: string): XmlNode[];
87
12
 
88
- export { type CDataNode, type CommentNode, type ContentToken, type DocumentNode, type ElementNode, type MalformedNode, type Node, type ProcessingInstructionNode, type TextNode, type Token, TokenType, parse, tokenize };
13
+ export { type XmlNode, type XmlNodeRole, scaffold };
package/dist/index.js CHANGED
@@ -1,179 +1,116 @@
1
- // src/modules/tokenizer/types.ts
2
- var TokenType = {
3
- PROCESSING_INSTRUCTION: "processing-instruction",
4
- ELEMENT_OPEN: "element-open",
5
- ELEMENT_CLOSE: "element-close",
6
- SELF_CLOSING: "self-closing",
7
- TEXT: "text",
8
- COMMENT: "comment",
9
- CDATA: "cdata",
10
- MALFORMED: "malformed"
11
- };
12
-
13
- // src/modules/tokenizer/tokenizer.ts
14
- var WHITESPACE = /\s/;
15
- function parseAttributes(raw) {
16
- const attributes = {};
17
- const pattern = /(\S+?)\s*=\s*(["'])([^"']*)\2/g;
18
- let match;
19
- while ((match = pattern.exec(raw)) !== null) {
20
- attributes[match[1]] = match[3];
21
- }
22
- return attributes;
1
+ // src/modules/scaffold/scaffold.ts
2
+ function scaffold(xml) {
3
+ const counter = { value: 0 };
4
+ const { xmlNodes } = collectXmlNodes(xml, 0, null, counter);
5
+ return xmlNodes;
23
6
  }
24
- function nextToken(xml, position) {
25
- if (xml[position] !== "<") {
26
- const end2 = xml.indexOf("<", position);
27
- const value = xml.slice(position, end2 === -1 ? xml.length : end2);
28
- return {
29
- token: value.trim() ? { type: TokenType.TEXT, value } : null,
30
- end: end2 === -1 ? xml.length : end2
31
- };
32
- }
33
- const next = xml[position + 1];
34
- if (next === "?") {
35
- const closeIndex = xml.indexOf("?>", position);
36
- if (closeIndex === -1)
37
- return {
38
- token: { type: TokenType.MALFORMED, raw: xml.slice(position) },
39
- end: xml.length
40
- };
41
- const end2 = closeIndex + 2;
42
- const inner2 = xml.slice(position + 2, end2 - 2).trim();
43
- const space2 = inner2.search(WHITESPACE);
44
- return {
45
- token: {
46
- type: TokenType.PROCESSING_INSTRUCTION,
47
- target: space2 === -1 ? inner2 : inner2.slice(0, space2),
48
- attributes: parseAttributes(inner2)
49
- },
50
- end: end2
51
- };
52
- }
53
- if (next === "!" && xml[position + 2] === "-") {
54
- const closeIndex = xml.indexOf("-->", position);
55
- if (closeIndex === -1)
56
- return {
57
- token: { type: TokenType.MALFORMED, raw: xml.slice(position) },
58
- end: xml.length
59
- };
60
- const end2 = closeIndex + 3;
61
- return {
62
- token: {
63
- type: TokenType.COMMENT,
64
- value: xml.slice(position + 4, end2 - 3)
65
- },
66
- end: end2
67
- };
68
- }
69
- if (next === "!" && xml[position + 2] === "[") {
70
- const closeIndex = xml.indexOf("]]>", position);
71
- if (closeIndex === -1)
72
- return {
73
- token: { type: TokenType.MALFORMED, raw: xml.slice(position) },
74
- end: xml.length
75
- };
76
- const end2 = closeIndex + 3;
77
- return {
78
- token: { type: TokenType.CDATA, value: xml.slice(position + 9, end2 - 3) },
79
- end: end2
80
- };
81
- }
82
- const end = xml.indexOf(">", position) + 1;
83
- if (!end)
84
- return {
85
- token: { type: TokenType.MALFORMED, raw: xml.slice(position) },
86
- end: xml.length
87
- };
88
- const raw = xml.slice(position + 1, end - 1).trim();
89
- if (raw[0] === "/")
90
- return {
91
- token: { type: TokenType.ELEMENT_CLOSE, tag: raw.slice(1).trim() },
92
- end
93
- };
94
- const selfClosing = raw[raw.length - 1] === "/";
95
- const inner = selfClosing ? raw.slice(0, -1).trim() : raw;
96
- const space = inner.search(WHITESPACE);
97
- const tag = space === -1 ? inner : inner.slice(0, space);
98
- const type = selfClosing ? TokenType.SELF_CLOSING : TokenType.ELEMENT_OPEN;
99
- return { token: { type, tag, attributes: parseAttributes(inner) }, end };
100
- }
101
- function tokenize(xml) {
102
- const tokens = [];
103
- let position = 0;
7
+ function collectXmlNodes(xml, position, parentTag, counter) {
8
+ const xmlNodes = [];
104
9
  while (position < xml.length) {
105
- const { token, end } = nextToken(xml, position);
106
- if (token) tokens.push(token);
107
- position = end;
108
- }
109
- return tokens;
110
- }
111
-
112
- // src/modules/parser/parser.ts
113
- function parse(xml) {
114
- const tokens = tokenize(xml);
115
- const cursor = { position: 0 };
116
- const { children } = collectChildren(tokens, cursor, null);
117
- return { type: "document", children };
118
- }
119
- function collectChildren(tokens, cursor, parentTag) {
120
- const children = [];
121
- while (cursor.position < tokens.length) {
122
- const token = tokens[cursor.position];
123
- cursor.position++;
124
- if (token.type === TokenType.ELEMENT_CLOSE) {
125
- if (token.tag === parentTag) return { children, closed: true };
126
- children.push({
127
- type: "malformed",
128
- raw: `</${token.tag}>`,
10
+ const xmlNodeData = extractXmlNodes(xml, position);
11
+ if (xmlNodeData.role === "closeTag") {
12
+ if (xmlNodeData.tag === parentTag)
13
+ return { xmlNodes, position: xmlNodeData.end, closed: true };
14
+ xmlNodes.push({
15
+ role: "closeTag",
16
+ raw: xmlNodeData.raw,
17
+ globalIndex: counter.value++,
18
+ localIndex: xmlNodes.length,
129
19
  malformed: true
130
20
  });
21
+ position = xmlNodeData.end;
131
22
  continue;
132
23
  }
133
- if (token.type === TokenType.ELEMENT_OPEN) {
134
- const { children: elementChildren, closed } = collectChildren(
135
- tokens,
136
- cursor,
137
- token.tag
24
+ if (xmlNodeData.role === "openTag" && !xmlNodeData.malformed) {
25
+ const globalIndex = counter.value++;
26
+ const localIndex = xmlNodes.length;
27
+ const nested = collectXmlNodes(
28
+ xml,
29
+ xmlNodeData.end,
30
+ xmlNodeData.tag,
31
+ counter
138
32
  );
139
- const node = {
140
- type: "element",
141
- tag: token.tag,
142
- attributes: token.attributes,
143
- children: elementChildren
33
+ const xmlNode2 = {
34
+ role: "openTag",
35
+ raw: xmlNodeData.raw,
36
+ globalIndex,
37
+ localIndex,
38
+ children: nested.xmlNodes
144
39
  };
145
- if (!closed) node.malformed = true;
146
- children.push(node);
40
+ if (!nested.closed) xmlNode2.malformed = true;
41
+ xmlNodes.push(xmlNode2);
42
+ position = nested.position;
147
43
  continue;
148
44
  }
149
- children.push(tokenToNode(token));
45
+ const xmlNode = {
46
+ role: xmlNodeData.role,
47
+ raw: xmlNodeData.raw,
48
+ globalIndex: counter.value++,
49
+ localIndex: xmlNodes.length
50
+ };
51
+ if (xmlNodeData.malformed) xmlNode.malformed = true;
52
+ if (xmlNodeData.role === "openTag") xmlNode.children = [];
53
+ xmlNodes.push(xmlNode);
54
+ position = xmlNodeData.end;
150
55
  }
151
- return { children, closed: parentTag === null };
56
+ return { xmlNodes, position, closed: parentTag === null };
152
57
  }
153
- function tokenToNode(token) {
154
- if (token.type === TokenType.TEXT)
155
- return { type: "text", value: token.value };
156
- if (token.type === TokenType.COMMENT)
157
- return { type: "comment", value: token.value };
158
- if (token.type === TokenType.CDATA)
159
- return { type: "cdata", value: token.value };
160
- if (token.type === TokenType.SELF_CLOSING)
58
+ function extractXmlNodes(xml, position) {
59
+ if (xml[position] !== "<") {
60
+ const end2 = xml.indexOf("<", position);
161
61
  return {
162
- type: "element",
163
- tag: token.tag,
164
- attributes: token.attributes,
165
- children: []
62
+ raw: xml.slice(position, end2 === -1 ? xml.length : end2),
63
+ role: "textLeaf",
64
+ tag: "",
65
+ end: end2 === -1 ? xml.length : end2
66
+ };
67
+ }
68
+ if (xml[position + 1] === "?") {
69
+ const end2 = xml.indexOf("?>", position + 2);
70
+ return end2 === -1 ? {
71
+ raw: xml.slice(position),
72
+ role: "processingInstruction",
73
+ tag: "",
74
+ end: xml.length
75
+ } : {
76
+ raw: xml.slice(position, end2 + 2),
77
+ role: "processingInstruction",
78
+ tag: "",
79
+ end: end2 + 2
166
80
  };
167
- if (token.type === TokenType.PROCESSING_INSTRUCTION)
81
+ }
82
+ if (xml[position + 1] === "!" && xml[position + 2] === "-") {
83
+ const end2 = xml.indexOf("-->", position + 4);
84
+ return end2 === -1 ? { raw: xml.slice(position), role: "comment", tag: "", end: xml.length } : {
85
+ raw: xml.slice(position, end2 + 3),
86
+ role: "comment",
87
+ tag: "",
88
+ end: end2 + 3
89
+ };
90
+ }
91
+ const closeAt = xml.indexOf(">", position);
92
+ if (closeAt === -1)
168
93
  return {
169
- type: "processing-instruction",
170
- target: token.target,
171
- attributes: token.attributes
94
+ raw: xml.slice(position),
95
+ role: "openTag",
96
+ tag: "",
97
+ end: xml.length,
98
+ malformed: true
172
99
  };
173
- return { type: "malformed", raw: token.raw, malformed: true };
100
+ const raw = xml.slice(position, closeAt + 1);
101
+ const end = closeAt + 1;
102
+ const inner = xml.slice(position + 1, closeAt).trim();
103
+ if (inner.startsWith("/")) {
104
+ const tag2 = inner.slice(1).trim().split(/\s/)[0] ?? "";
105
+ return { raw, role: "closeTag", tag: tag2, end };
106
+ }
107
+ if (inner.endsWith("/")) {
108
+ const tag2 = inner.slice(0, -1).trim().split(/\s/)[0] ?? "";
109
+ return { raw, role: "selfTag", tag: tag2, end };
110
+ }
111
+ const tag = inner.split(/\s/)[0] ?? "";
112
+ return { raw, role: "openTag", tag, end };
174
113
  }
175
114
  export {
176
- TokenType,
177
- parse,
178
- tokenize
115
+ scaffold
179
116
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "xml-to-html-converter",
3
- "version": "0.1.5",
3
+ "version": "0.1.6",
4
4
  "description": "Zero dependency XML to HTML converter for Node environments",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",