xml-to-html-converter 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -6,19 +6,17 @@
6
6
  ![XML](https://img.shields.io/badge/input-XML-orange)
7
7
  ![HTML](https://img.shields.io/badge/output-HTML-red)
8
8
 
9
- A zero-dependency Node.js package for converting XML to HTML. Currently in pre-1.0.0 development, building the foundation one functional part at a time. Full XML-to-HTML conversion is the goal of `v1.0.0`, but for now, I will be building 0 dependency tools that will eventually be used to convert an xml to html.
9
+ A zero-dependency Node.js package for converting XML to HTML. Currently in pre-1.0.0 development, building the foundation one functional part at a time. Full XML-to-HTML conversion is the goal of `v1.0.0`.
10
10
 
11
11
  ---
12
12
 
13
13
  > **Where I am right now**
14
14
  >
15
- > The goal of this package is to take any XML document and convert it to HTML. Currently, I am building the foundation which is taking an xml doc of any kind, and turning it into a json object otherwise known as a document node.
15
+ > `v0.x` is building the scaffold - a structural tree of every node in your XML document, each carrying its raw source string and its exact position in the document. This scaffold is what the HTML converter will walk when it's built.
16
16
  >
17
- > What `v0.x` builds is the foundation that makes `v1.0.0` possible:
18
- >
19
- > - **A parser** takes any XML string and returns a plain JS document tree
20
- > - **A fault-tolerant verifier** flags every broken node with `malformed: true` in-place, giving you a complete map of exactly where and what broke
21
- > - **An intermediate representation** is a plain JS object tree that the HTML converter will walk when it's built
17
+ > - **`scaffold(xml)`** reads any XML string and returns a nested token tree
18
+ > - Every token knows its `role`, its `raw` source string, its `globalIndex` in the document, and its `localIndex` within its parent
19
+ > - Broken XML never causes an error to be thrown - malformed nodes are flagged with `malformed: true` in place and the tree is built regardless
22
20
  >
23
21
  > `v1.0.0` is when this package becomes what it says it is: a full XML-to-HTML converter. Everything before that is the work to get there.
24
22
 
@@ -35,126 +33,147 @@ npm install xml-to-html-converter
35
33
  ## Usage
36
34
 
37
35
  ```js
38
- import { parse } from "xml-to-html-converter";
36
+ import { scaffold } from "xml-to-html-converter";
39
37
 
40
- const tree = parse(`
38
+ const tree = scaffold(`
41
39
  <?xml version="1.0" encoding="UTF-8"?>
42
40
  <bookstore>
43
41
  <book category="cooking">
44
42
  <title lang="en">Everyday Italian</title>
45
- <author>Giada De Laurentiis</author>
46
43
  </book>
47
44
  </bookstore>
48
45
  `);
49
46
  ```
50
47
 
51
- `parse` returns a document node wrapping the full tree:
48
+ `scaffold` returns a flat array of root-level tokens. Each `openTag` token carries its children nested inside it:
52
49
 
53
50
  ```json
54
- {
55
- "type": "document",
56
- "children": [
57
- {
58
- "type": "processing-instruction",
59
- "target": "xml",
60
- "attributes": { "version": "1.0", "encoding": "UTF-8" }
61
- },
62
- {
63
- "type": "element",
64
- "tag": "bookstore",
65
- "attributes": {},
66
- "children": [
67
- {
68
- "type": "element",
69
- "tag": "book",
70
- "attributes": { "category": "cooking" },
71
- "children": [
72
- {
73
- "type": "element",
74
- "tag": "title",
75
- "attributes": { "lang": "en" },
76
- "children": [{ "type": "text", "value": "Everyday Italian" }]
77
- },
78
- {
79
- "type": "element",
80
- "tag": "author",
81
- "attributes": {},
82
- "children": [{ "type": "text", "value": "Giada De Laurentiis" }]
83
- }
84
- ]
85
- }
86
- ]
87
- }
88
- ]
89
- }
51
+ [
52
+ {
53
+ "role": "processingInstruction",
54
+ "raw": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>",
55
+ "globalIndex": 0,
56
+ "localIndex": 0
57
+ },
58
+ {
59
+ "role": "openTag",
60
+ "raw": "<bookstore>",
61
+ "globalIndex": 2,
62
+ "localIndex": 2,
63
+ "children": [
64
+ {
65
+ "role": "openTag",
66
+ "raw": "<book category=\"cooking\">",
67
+ "globalIndex": 4,
68
+ "localIndex": 1,
69
+ "children": [
70
+ {
71
+ "role": "openTag",
72
+ "raw": "<title lang=\"en\">",
73
+ "globalIndex": 6,
74
+ "localIndex": 1,
75
+ "children": [
76
+ {
77
+ "role": "textLeaf",
78
+ "raw": "Everyday Italian",
79
+ "globalIndex": 7,
80
+ "localIndex": 0
81
+ }
82
+ ]
83
+ }
84
+ ]
85
+ }
86
+ ]
87
+ }
88
+ ]
90
89
  ```
91
90
 
92
91
  ---
93
92
 
94
- ## Node Types
93
+ ## Token Shape
94
+
95
+ Every token in the tree has the following fields:
96
+
97
+ | Field | Type | Description |
98
+ | ------------- | ----------- | ---------------------------------------------------- |
99
+ | `role` | `TokenRole` | What kind of token this is |
100
+ | `raw` | `string` | The exact source string, untouched |
101
+ | `globalIndex` | `number` | Position in the entire document (never resets) |
102
+ | `localIndex` | `number` | Position within the parent's children array |
103
+ | `children` | `Token[]` | Present only on `openTag` - the nested tokens inside |
104
+ | `malformed` | `true` | Present only when the structure is broken |
105
+
106
+ ---
95
107
 
96
- Every node in the tree has a `type` field.
108
+ ## Token Roles
97
109
 
98
- | Type | Properties |
99
- | ------------------------ | ------------------------------- |
100
- | `document` | `children` |
101
- | `element` | `tag`, `attributes`, `children` |
102
- | `text` | `value` |
103
- | `comment` | `value` |
104
- | `cdata` | `value` |
105
- | `processing-instruction` | `target`, `attributes` |
106
- | `malformed` | `raw`, `malformed: true` |
110
+ | Role | Has children | Description |
111
+ | ----------------------- | ------------ | -------------------------------------------- |
112
+ | `openTag` | yes | An opening tag, e.g. `<book category="web">` |
113
+ | `selfTag` | no | A self-closing tag, e.g. `<br/>` |
114
+ | `closeTag` | no | Only appears when stray (no matching open) |
115
+ | `processingInstruction` | no | e.g. `<?xml version="1.0"?>` |
116
+ | `comment` | no | e.g. `<!-- a comment -->` |
117
+ | `textLeaf` | no | Text content between tags |
107
118
 
108
119
  ---
109
120
 
110
121
  ## Malformed XML
111
122
 
112
- The parser never throws. No matter what the input looks like, it always returns a complete document tree. I built this because when working with QTI and xsd validation, it was a pain to get things to work and I could never get past the current industries package standards. Additionally, falling down a Java rabbit hole was not something I enjoyed AT ALL. Therefore, malformed structures are flagged with `malformed: true` in-place and the walk continues. The tree is built no matter what your xml looks like.
123
+ `scaffold` never throws. No matter what the input looks like, it always returns a complete tree. Malformed structures are flagged with `malformed: true` in place and the walk continues.
113
124
 
114
- Three types of malformed input are caught:
125
+ Three cases are handled:
115
126
 
116
- - **Unclosed tags** - a tag that opens but never closes gets `malformed: true`, its children are still collected normally
117
- - **Stray closing tags** - a `</tag>` with no matching open becomes a `{ type: 'malformed', raw: '...</tag>', malformed: true }` node at that position
118
- - **Unclosed brackets** - a `<` with no matching `>` before end of string captures the remainder as a malformed node
127
+ - **Unclosed tags** - a tag that opens but never closes gets `malformed: true`; its children are still collected
128
+ - **Stray closing tags** - a `</tag>` with no matching open surfaces as a `closeTag` token with `malformed: true`
129
+ - **Unclosed brackets** - a `<` with no matching `>` captures the remainder as a malformed token
119
130
 
120
131
  ```js
121
- const tree = parse("<root><unclosed><valid>text</valid></root>");
132
+ const tree = scaffold("<root><unclosed><valid>text</valid></root>");
122
133
  ```
123
134
 
124
135
  ```json
125
- {
126
- "type": "document",
127
- "children": [
128
- {
129
- "type": "element",
130
- "tag": "root",
131
- "attributes": {},
132
- "children": [
133
- {
134
- "type": "element",
135
- "tag": "unclosed",
136
- "attributes": {},
137
- "malformed": true,
138
- "children": [
139
- {
140
- "type": "element",
141
- "tag": "valid",
142
- "attributes": {},
143
- "children": [{ "type": "text", "value": "text" }]
144
- }
145
- ]
146
- }
147
- ]
148
- }
149
- ]
150
- }
136
+ [
137
+ {
138
+ "role": "openTag",
139
+ "raw": "<root>",
140
+ "globalIndex": 0,
141
+ "localIndex": 0,
142
+ "malformed": true,
143
+ "children": [
144
+ {
145
+ "role": "openTag",
146
+ "raw": "<unclosed>",
147
+ "globalIndex": 1,
148
+ "localIndex": 0,
149
+ "malformed": true,
150
+ "children": [
151
+ {
152
+ "role": "openTag",
153
+ "raw": "<valid>",
154
+ "globalIndex": 2,
155
+ "localIndex": 0,
156
+ "children": [
157
+ {
158
+ "role": "textLeaf",
159
+ "raw": "text",
160
+ "globalIndex": 3,
161
+ "localIndex": 0
162
+ }
163
+ ]
164
+ }
165
+ ]
166
+ }
167
+ ]
168
+ }
169
+ ]
151
170
  ```
152
171
 
153
172
  ---
154
173
 
155
174
  ## Requirements
156
175
 
157
- Node.js `>=18.0.0`
176
+ Node.js `>=20.0.0`
158
177
 
159
178
  ---
160
179
 
package/dist/index.d.ts CHANGED
@@ -32,6 +32,7 @@ interface MalformedNode {
32
32
  malformed: true;
33
33
  }
34
34
  type Node = ElementNode | TextNode | CommentNode | CDataNode | ProcessingInstructionNode | MalformedNode;
35
+
35
36
  declare function parse(xml: string): DocumentNode;
36
37
 
37
38
  declare const TokenType: {
@@ -44,7 +45,6 @@ declare const TokenType: {
44
45
  readonly CDATA: "cdata";
45
46
  readonly MALFORMED: "malformed";
46
47
  };
47
- type TokenTypeValue = typeof TokenType[keyof typeof TokenType];
48
48
  interface ProcessingInstructionToken {
49
49
  type: typeof TokenType.PROCESSING_INSTRUCTION;
50
50
  target: string;
@@ -81,5 +81,8 @@ interface MalformedToken {
81
81
  raw: string;
82
82
  }
83
83
  type Token = ProcessingInstructionToken | ElementOpenToken | ElementCloseToken | SelfClosingToken | TextToken | CommentToken | CDataToken | MalformedToken;
84
+ type ContentToken = Exclude<Token, ElementOpenToken | ElementCloseToken>;
85
+
86
+ declare function tokenize(xml: string): Token[];
84
87
 
85
- export { type CDataNode, type CommentNode, type DocumentNode, type ElementNode, type MalformedNode, type Node, type ProcessingInstructionNode, type TextNode, type Token, type TokenTypeValue, parse };
88
+ export { type CDataNode, type CommentNode, type ContentToken, type DocumentNode, type ElementNode, type MalformedNode, type Node, type ProcessingInstructionNode, type TextNode, type Token, TokenType, parse, tokenize };
package/dist/index.js CHANGED
@@ -1,4 +1,4 @@
1
- // src/tokenizer.ts
1
+ // src/modules/tokenizer/types.ts
2
2
  var TokenType = {
3
3
  PROCESSING_INSTRUCTION: "processing-instruction",
4
4
  ELEMENT_OPEN: "element-open",
@@ -9,13 +9,15 @@ var TokenType = {
9
9
  CDATA: "cdata",
10
10
  MALFORMED: "malformed"
11
11
  };
12
+
13
+ // src/modules/tokenizer/tokenizer.ts
12
14
  var WHITESPACE = /\s/;
13
15
  function parseAttributes(raw) {
14
16
  const attributes = {};
15
- const pattern = /(\S+?)\s*=\s*["']([^"']*)["']/g;
17
+ const pattern = /(\S+?)\s*=\s*(["'])([^"']*)\2/g;
16
18
  let match;
17
19
  while ((match = pattern.exec(raw)) !== null) {
18
- attributes[match[1]] = match[2];
20
+ attributes[match[1]] = match[3];
19
21
  }
20
22
  return attributes;
21
23
  }
@@ -23,33 +25,72 @@ function nextToken(xml, position) {
23
25
  if (xml[position] !== "<") {
24
26
  const end2 = xml.indexOf("<", position);
25
27
  const value = xml.slice(position, end2 === -1 ? xml.length : end2);
26
- return { token: value.trim() ? { type: TokenType.TEXT, value } : null, end: end2 === -1 ? xml.length : end2 };
28
+ return {
29
+ token: value.trim() ? { type: TokenType.TEXT, value } : null,
30
+ end: end2 === -1 ? xml.length : end2
31
+ };
27
32
  }
28
33
  const next = xml[position + 1];
29
34
  if (next === "?") {
30
35
  const closeIndex = xml.indexOf("?>", position);
31
- if (closeIndex === -1) return { token: { type: TokenType.MALFORMED, raw: xml.slice(position) }, end: xml.length };
36
+ if (closeIndex === -1)
37
+ return {
38
+ token: { type: TokenType.MALFORMED, raw: xml.slice(position) },
39
+ end: xml.length
40
+ };
32
41
  const end2 = closeIndex + 2;
33
42
  const inner2 = xml.slice(position + 2, end2 - 2).trim();
34
43
  const space2 = inner2.search(WHITESPACE);
35
- return { token: { type: TokenType.PROCESSING_INSTRUCTION, target: space2 === -1 ? inner2 : inner2.slice(0, space2), attributes: parseAttributes(inner2) }, end: end2 };
44
+ return {
45
+ token: {
46
+ type: TokenType.PROCESSING_INSTRUCTION,
47
+ target: space2 === -1 ? inner2 : inner2.slice(0, space2),
48
+ attributes: parseAttributes(inner2)
49
+ },
50
+ end: end2
51
+ };
36
52
  }
37
53
  if (next === "!" && xml[position + 2] === "-") {
38
54
  const closeIndex = xml.indexOf("-->", position);
39
- if (closeIndex === -1) return { token: { type: TokenType.MALFORMED, raw: xml.slice(position) }, end: xml.length };
55
+ if (closeIndex === -1)
56
+ return {
57
+ token: { type: TokenType.MALFORMED, raw: xml.slice(position) },
58
+ end: xml.length
59
+ };
40
60
  const end2 = closeIndex + 3;
41
- return { token: { type: TokenType.COMMENT, value: xml.slice(position + 4, end2 - 3) }, end: end2 };
61
+ return {
62
+ token: {
63
+ type: TokenType.COMMENT,
64
+ value: xml.slice(position + 4, end2 - 3)
65
+ },
66
+ end: end2
67
+ };
42
68
  }
43
69
  if (next === "!" && xml[position + 2] === "[") {
44
70
  const closeIndex = xml.indexOf("]]>", position);
45
- if (closeIndex === -1) return { token: { type: TokenType.MALFORMED, raw: xml.slice(position) }, end: xml.length };
71
+ if (closeIndex === -1)
72
+ return {
73
+ token: { type: TokenType.MALFORMED, raw: xml.slice(position) },
74
+ end: xml.length
75
+ };
46
76
  const end2 = closeIndex + 3;
47
- return { token: { type: TokenType.CDATA, value: xml.slice(position + 9, end2 - 3) }, end: end2 };
77
+ return {
78
+ token: { type: TokenType.CDATA, value: xml.slice(position + 9, end2 - 3) },
79
+ end: end2
80
+ };
48
81
  }
49
82
  const end = xml.indexOf(">", position) + 1;
50
- if (!end) return { token: { type: TokenType.MALFORMED, raw: xml.slice(position) }, end: xml.length };
83
+ if (!end)
84
+ return {
85
+ token: { type: TokenType.MALFORMED, raw: xml.slice(position) },
86
+ end: xml.length
87
+ };
51
88
  const raw = xml.slice(position + 1, end - 1).trim();
52
- if (raw[0] === "/") return { token: { type: TokenType.ELEMENT_CLOSE, tag: raw.slice(1).trim() }, end };
89
+ if (raw[0] === "/")
90
+ return {
91
+ token: { type: TokenType.ELEMENT_CLOSE, tag: raw.slice(1).trim() },
92
+ end
93
+ };
53
94
  const selfClosing = raw[raw.length - 1] === "/";
54
95
  const inner = selfClosing ? raw.slice(0, -1).trim() : raw;
55
96
  const space = inner.search(WHITESPACE);
@@ -68,11 +109,11 @@ function tokenize(xml) {
68
109
  return tokens;
69
110
  }
70
111
 
71
- // src/parser.ts
112
+ // src/modules/parser/parser.ts
72
113
  function parse(xml) {
73
114
  const tokens = tokenize(xml);
74
115
  const cursor = { position: 0 };
75
- const children = collectChildren(tokens, cursor, null);
116
+ const { children } = collectChildren(tokens, cursor, null);
76
117
  return { type: "document", children };
77
118
  }
78
119
  function collectChildren(tokens, cursor, parentTag) {
@@ -81,33 +122,58 @@ function collectChildren(tokens, cursor, parentTag) {
81
122
  const token = tokens[cursor.position];
82
123
  cursor.position++;
83
124
  if (token.type === TokenType.ELEMENT_CLOSE) {
84
- if (token.tag === parentTag) return children;
85
- children.push({ type: "malformed", raw: `</${token.tag}>`, malformed: true });
125
+ if (token.tag === parentTag) return { children, closed: true };
126
+ children.push({
127
+ type: "malformed",
128
+ raw: `</${token.tag}>`,
129
+ malformed: true
130
+ });
86
131
  continue;
87
132
  }
88
133
  if (token.type === TokenType.ELEMENT_OPEN) {
89
- const node = { type: "element", tag: token.tag, attributes: token.attributes, children: collectChildren(tokens, cursor, token.tag) };
134
+ const { children: elementChildren, closed } = collectChildren(
135
+ tokens,
136
+ cursor,
137
+ token.tag
138
+ );
139
+ const node = {
140
+ type: "element",
141
+ tag: token.tag,
142
+ attributes: token.attributes,
143
+ children: elementChildren
144
+ };
145
+ if (!closed) node.malformed = true;
90
146
  children.push(node);
91
147
  continue;
92
148
  }
93
149
  children.push(tokenToNode(token));
94
150
  }
95
- if (parentTag !== null) {
96
- children.forEach((child) => {
97
- child.malformed = true;
98
- });
99
- }
100
- return children;
151
+ return { children, closed: parentTag === null };
101
152
  }
102
153
  function tokenToNode(token) {
103
- if (token.type === TokenType.TEXT) return { type: "text", value: token.value };
104
- if (token.type === TokenType.COMMENT) return { type: "comment", value: token.value };
105
- if (token.type === TokenType.CDATA) return { type: "cdata", value: token.value };
106
- if (token.type === TokenType.SELF_CLOSING) return { type: "element", tag: token.tag, attributes: token.attributes, children: [] };
107
- if (token.type === TokenType.PROCESSING_INSTRUCTION) return { type: "processing-instruction", target: token.target, attributes: token.attributes };
108
- if (token.type === TokenType.MALFORMED) return { type: "malformed", raw: token.raw, malformed: true };
109
- return { type: "malformed", raw: `<${token.tag}>`, malformed: true };
154
+ if (token.type === TokenType.TEXT)
155
+ return { type: "text", value: token.value };
156
+ if (token.type === TokenType.COMMENT)
157
+ return { type: "comment", value: token.value };
158
+ if (token.type === TokenType.CDATA)
159
+ return { type: "cdata", value: token.value };
160
+ if (token.type === TokenType.SELF_CLOSING)
161
+ return {
162
+ type: "element",
163
+ tag: token.tag,
164
+ attributes: token.attributes,
165
+ children: []
166
+ };
167
+ if (token.type === TokenType.PROCESSING_INSTRUCTION)
168
+ return {
169
+ type: "processing-instruction",
170
+ target: token.target,
171
+ attributes: token.attributes
172
+ };
173
+ return { type: "malformed", raw: token.raw, malformed: true };
110
174
  }
111
175
  export {
112
- parse
176
+ TokenType,
177
+ parse,
178
+ tokenize
113
179
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "xml-to-html-converter",
3
- "version": "0.1.3",
3
+ "version": "0.1.4",
4
4
  "description": "Zero dependency XML to HTML converter for Node environments",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",