xml-to-html-converter 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +110 -91
- package/dist/index.d.ts +5 -2
- package/dist/index.js +97 -31
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -6,19 +6,17 @@
|
|
|
6
6
|

|
|
7
7
|

|
|
8
8
|
|
|
9
|
-
A zero-dependency Node.js package for converting XML to HTML. Currently in pre-1.0.0 development, building the foundation one functional part at a time. Full XML-to-HTML conversion is the goal of `v1.0.0
|
|
9
|
+
A zero-dependency Node.js package for converting XML to HTML. Currently in pre-1.0.0 development, building the foundation one functional part at a time. Full XML-to-HTML conversion is the goal of `v1.0.0`.
|
|
10
10
|
|
|
11
11
|
---
|
|
12
12
|
|
|
13
13
|
> **Where I am right now**
|
|
14
14
|
>
|
|
15
|
-
>
|
|
15
|
+
> `v0.x` is building the scaffold - a structural tree of every node in your XML document, each carrying its raw source string and its exact position in the document. This scaffold is what the HTML converter will walk when it's built.
|
|
16
16
|
>
|
|
17
|
-
>
|
|
18
|
-
>
|
|
19
|
-
> -
|
|
20
|
-
> - **A fault-tolerant verifier** flags every broken node with `malformed: true` in-place, giving you a complete map of exactly where and what broke
|
|
21
|
-
> - **An intermediate representation** is a plain JS object tree that the HTML converter will walk when it's built
|
|
17
|
+
> - **`scaffold(xml)`** reads any XML string and returns a nested token tree
|
|
18
|
+
> - Every token knows its `role`, its `raw` source string, its `globalIndex` in the document, and its `localIndex` within its parent
|
|
19
|
+
> - Broken XML never causes an exception - malformed nodes are flagged with `malformed: true` in place and the tree is built regardless
|
|
22
20
|
>
|
|
23
21
|
> `v1.0.0` is when this package becomes what it says it is: a full XML-to-HTML converter. Everything before that is the work to get there.
|
|
24
22
|
|
|
@@ -35,126 +33,147 @@ npm install xml-to-html-converter
|
|
|
35
33
|
## Usage
|
|
36
34
|
|
|
37
35
|
```js
|
|
38
|
-
import {
|
|
36
|
+
import { scaffold } from "xml-to-html-converter";
|
|
39
37
|
|
|
40
|
-
const tree =
|
|
38
|
+
const tree = scaffold(`
|
|
41
39
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
42
40
|
<bookstore>
|
|
43
41
|
<book category="cooking">
|
|
44
42
|
<title lang="en">Everyday Italian</title>
|
|
45
|
-
<author>Giada De Laurentiis</author>
|
|
46
43
|
</book>
|
|
47
44
|
</bookstore>
|
|
48
45
|
`);
|
|
49
46
|
```
|
|
50
47
|
|
|
51
|
-
`
|
|
48
|
+
`scaffold` returns a flat array of root-level tokens. Each `openTag` token carries its children nested inside it:
|
|
52
49
|
|
|
53
50
|
```json
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
51
|
+
[
|
|
52
|
+
{
|
|
53
|
+
"role": "processingInstruction",
|
|
54
|
+
"raw": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>",
|
|
55
|
+
"globalIndex": 0,
|
|
56
|
+
"localIndex": 0
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
"role": "openTag",
|
|
60
|
+
"raw": "<bookstore>",
|
|
61
|
+
"globalIndex": 2,
|
|
62
|
+
"localIndex": 2,
|
|
63
|
+
"children": [
|
|
64
|
+
{
|
|
65
|
+
"role": "openTag",
|
|
66
|
+
"raw": "<book category=\"cooking\">",
|
|
67
|
+
"globalIndex": 4,
|
|
68
|
+
"localIndex": 1,
|
|
69
|
+
"children": [
|
|
70
|
+
{
|
|
71
|
+
"role": "openTag",
|
|
72
|
+
"raw": "<title lang=\"en\">",
|
|
73
|
+
"globalIndex": 6,
|
|
74
|
+
"localIndex": 1,
|
|
75
|
+
"children": [
|
|
76
|
+
{
|
|
77
|
+
"role": "textLeaf",
|
|
78
|
+
"raw": "Everyday Italian",
|
|
79
|
+
"globalIndex": 7,
|
|
80
|
+
"localIndex": 0
|
|
81
|
+
}
|
|
82
|
+
]
|
|
83
|
+
}
|
|
84
|
+
]
|
|
85
|
+
}
|
|
86
|
+
]
|
|
87
|
+
}
|
|
88
|
+
]
|
|
90
89
|
```
|
|
91
90
|
|
|
92
91
|
---
|
|
93
92
|
|
|
94
|
-
##
|
|
93
|
+
## Token Shape
|
|
94
|
+
|
|
95
|
+
Every token in the tree has the following fields:
|
|
96
|
+
|
|
97
|
+
| Field | Type | Description |
|
|
98
|
+
| ------------- | ----------- | ---------------------------------------------------- |
|
|
99
|
+
| `role` | `TokenRole` | What kind of token this is |
|
|
100
|
+
| `raw` | `string` | The exact source string, untouched |
|
|
101
|
+
| `globalIndex` | `number` | Position in the entire document (never resets) |
|
|
102
|
+
| `localIndex` | `number` | Position within the parent's children array |
|
|
103
|
+
| `children` | `Token[]` | Present only on `openTag` - the nested tokens inside |
|
|
104
|
+
| `malformed` | `true` | Present only when the structure is broken |
|
|
105
|
+
|
|
106
|
+
---
|
|
95
107
|
|
|
96
|
-
|
|
108
|
+
## Token Roles
|
|
97
109
|
|
|
98
|
-
|
|
|
99
|
-
|
|
|
100
|
-
| `
|
|
101
|
-
| `
|
|
102
|
-
| `
|
|
103
|
-
| `
|
|
104
|
-
| `
|
|
105
|
-
| `
|
|
106
|
-
| `malformed` | `raw`, `malformed: true` |
|
|
110
|
+
| Role | Has children | Description |
|
|
111
|
+
| ----------------------- | ------------ | -------------------------------------------- |
|
|
112
|
+
| `openTag` | yes | An opening tag, e.g. `<book category="web">` |
|
|
113
|
+
| `selfTag` | no | A self-closing tag, e.g. `<br/>` |
|
|
114
|
+
| `closeTag` | no | Only appears when stray (no matching open) |
|
|
115
|
+
| `processingInstruction` | no | e.g. `<?xml version="1.0"?>` |
|
|
116
|
+
| `comment` | no | e.g. `<!-- a comment -->` |
|
|
117
|
+
| `textLeaf` | no | Text content between tags |
|
|
107
118
|
|
|
108
119
|
---
|
|
109
120
|
|
|
110
121
|
## Malformed XML
|
|
111
122
|
|
|
112
|
-
|
|
123
|
+
`scaffold` never throws. No matter what the input looks like, it always returns a complete tree. Malformed structures are flagged with `malformed: true` in place and the walk continues.
|
|
113
124
|
|
|
114
|
-
Three
|
|
125
|
+
Three cases are handled:
|
|
115
126
|
|
|
116
|
-
- **Unclosed tags** -
|
|
117
|
-
- **Stray closing tags** - a `</tag>` with no matching open
|
|
118
|
-
- **Unclosed brackets** - a `<` with no matching `>`
|
|
127
|
+
- **Unclosed tags** - a tag that opens but never closes gets `malformed: true`; its children are still collected
|
|
128
|
+
- **Stray closing tags** - a `</tag>` with no matching open surfaces as a `closeTag` token with `malformed: true`
|
|
129
|
+
- **Unclosed brackets** - when a `<` has no matching `>`, the remainder of the input is captured as a malformed token
|
|
119
130
|
|
|
120
131
|
```js
|
|
121
|
-
const tree =
|
|
132
|
+
const tree = scaffold("<root><unclosed><valid>text</valid></root>");
|
|
122
133
|
```
|
|
123
134
|
|
|
124
135
|
```json
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
136
|
+
[
|
|
137
|
+
{
|
|
138
|
+
"role": "openTag",
|
|
139
|
+
"raw": "<root>",
|
|
140
|
+
"globalIndex": 0,
|
|
141
|
+
"localIndex": 0,
|
|
142
|
+
"malformed": true,
|
|
143
|
+
"children": [
|
|
144
|
+
{
|
|
145
|
+
"role": "openTag",
|
|
146
|
+
"raw": "<unclosed>",
|
|
147
|
+
"globalIndex": 1,
|
|
148
|
+
"localIndex": 0,
|
|
149
|
+
"malformed": true,
|
|
150
|
+
"children": [
|
|
151
|
+
{
|
|
152
|
+
"role": "openTag",
|
|
153
|
+
"raw": "<valid>",
|
|
154
|
+
"globalIndex": 2,
|
|
155
|
+
"localIndex": 0,
|
|
156
|
+
"children": [
|
|
157
|
+
{
|
|
158
|
+
"role": "textLeaf",
|
|
159
|
+
"raw": "text",
|
|
160
|
+
"globalIndex": 3,
|
|
161
|
+
"localIndex": 0
|
|
162
|
+
}
|
|
163
|
+
]
|
|
164
|
+
}
|
|
165
|
+
]
|
|
166
|
+
}
|
|
167
|
+
]
|
|
168
|
+
}
|
|
169
|
+
]
|
|
151
170
|
```
|
|
152
171
|
|
|
153
172
|
---
|
|
154
173
|
|
|
155
174
|
## Requirements
|
|
156
175
|
|
|
157
|
-
Node.js `>=
|
|
176
|
+
Node.js `>=20.0.0`
|
|
158
177
|
|
|
159
178
|
---
|
|
160
179
|
|
package/dist/index.d.ts
CHANGED
|
@@ -32,6 +32,7 @@ interface MalformedNode {
|
|
|
32
32
|
malformed: true;
|
|
33
33
|
}
|
|
34
34
|
type Node = ElementNode | TextNode | CommentNode | CDataNode | ProcessingInstructionNode | MalformedNode;
|
|
35
|
+
|
|
35
36
|
declare function parse(xml: string): DocumentNode;
|
|
36
37
|
|
|
37
38
|
declare const TokenType: {
|
|
@@ -44,7 +45,6 @@ declare const TokenType: {
|
|
|
44
45
|
readonly CDATA: "cdata";
|
|
45
46
|
readonly MALFORMED: "malformed";
|
|
46
47
|
};
|
|
47
|
-
type TokenTypeValue = typeof TokenType[keyof typeof TokenType];
|
|
48
48
|
interface ProcessingInstructionToken {
|
|
49
49
|
type: typeof TokenType.PROCESSING_INSTRUCTION;
|
|
50
50
|
target: string;
|
|
@@ -81,5 +81,8 @@ interface MalformedToken {
|
|
|
81
81
|
raw: string;
|
|
82
82
|
}
|
|
83
83
|
type Token = ProcessingInstructionToken | ElementOpenToken | ElementCloseToken | SelfClosingToken | TextToken | CommentToken | CDataToken | MalformedToken;
|
|
84
|
+
type ContentToken = Exclude<Token, ElementOpenToken | ElementCloseToken>;
|
|
85
|
+
|
|
86
|
+
declare function tokenize(xml: string): Token[];
|
|
84
87
|
|
|
85
|
-
export { type CDataNode, type CommentNode, type DocumentNode, type ElementNode, type MalformedNode, type Node, type ProcessingInstructionNode, type TextNode, type Token,
|
|
88
|
+
export { type CDataNode, type CommentNode, type ContentToken, type DocumentNode, type ElementNode, type MalformedNode, type Node, type ProcessingInstructionNode, type TextNode, type Token, TokenType, parse, tokenize };
|
package/dist/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
// src/tokenizer.ts
|
|
1
|
+
// src/modules/tokenizer/types.ts
|
|
2
2
|
var TokenType = {
|
|
3
3
|
PROCESSING_INSTRUCTION: "processing-instruction",
|
|
4
4
|
ELEMENT_OPEN: "element-open",
|
|
@@ -9,13 +9,15 @@ var TokenType = {
|
|
|
9
9
|
CDATA: "cdata",
|
|
10
10
|
MALFORMED: "malformed"
|
|
11
11
|
};
|
|
12
|
+
|
|
13
|
+
// src/modules/tokenizer/tokenizer.ts
|
|
12
14
|
var WHITESPACE = /\s/;
|
|
13
15
|
function parseAttributes(raw) {
|
|
14
16
|
const attributes = {};
|
|
15
|
-
const pattern = /(\S+?)\s*=\s*["']([^"']*)
|
|
17
|
+
const pattern = /(\S+?)\s*=\s*(["'])([^"']*)\2/g;
|
|
16
18
|
let match;
|
|
17
19
|
while ((match = pattern.exec(raw)) !== null) {
|
|
18
|
-
attributes[match[1]] = match[
|
|
20
|
+
attributes[match[1]] = match[3];
|
|
19
21
|
}
|
|
20
22
|
return attributes;
|
|
21
23
|
}
|
|
@@ -23,33 +25,72 @@ function nextToken(xml, position) {
|
|
|
23
25
|
if (xml[position] !== "<") {
|
|
24
26
|
const end2 = xml.indexOf("<", position);
|
|
25
27
|
const value = xml.slice(position, end2 === -1 ? xml.length : end2);
|
|
26
|
-
return {
|
|
28
|
+
return {
|
|
29
|
+
token: value.trim() ? { type: TokenType.TEXT, value } : null,
|
|
30
|
+
end: end2 === -1 ? xml.length : end2
|
|
31
|
+
};
|
|
27
32
|
}
|
|
28
33
|
const next = xml[position + 1];
|
|
29
34
|
if (next === "?") {
|
|
30
35
|
const closeIndex = xml.indexOf("?>", position);
|
|
31
|
-
if (closeIndex === -1)
|
|
36
|
+
if (closeIndex === -1)
|
|
37
|
+
return {
|
|
38
|
+
token: { type: TokenType.MALFORMED, raw: xml.slice(position) },
|
|
39
|
+
end: xml.length
|
|
40
|
+
};
|
|
32
41
|
const end2 = closeIndex + 2;
|
|
33
42
|
const inner2 = xml.slice(position + 2, end2 - 2).trim();
|
|
34
43
|
const space2 = inner2.search(WHITESPACE);
|
|
35
|
-
return {
|
|
44
|
+
return {
|
|
45
|
+
token: {
|
|
46
|
+
type: TokenType.PROCESSING_INSTRUCTION,
|
|
47
|
+
target: space2 === -1 ? inner2 : inner2.slice(0, space2),
|
|
48
|
+
attributes: parseAttributes(inner2)
|
|
49
|
+
},
|
|
50
|
+
end: end2
|
|
51
|
+
};
|
|
36
52
|
}
|
|
37
53
|
if (next === "!" && xml[position + 2] === "-") {
|
|
38
54
|
const closeIndex = xml.indexOf("-->", position);
|
|
39
|
-
if (closeIndex === -1)
|
|
55
|
+
if (closeIndex === -1)
|
|
56
|
+
return {
|
|
57
|
+
token: { type: TokenType.MALFORMED, raw: xml.slice(position) },
|
|
58
|
+
end: xml.length
|
|
59
|
+
};
|
|
40
60
|
const end2 = closeIndex + 3;
|
|
41
|
-
return {
|
|
61
|
+
return {
|
|
62
|
+
token: {
|
|
63
|
+
type: TokenType.COMMENT,
|
|
64
|
+
value: xml.slice(position + 4, end2 - 3)
|
|
65
|
+
},
|
|
66
|
+
end: end2
|
|
67
|
+
};
|
|
42
68
|
}
|
|
43
69
|
if (next === "!" && xml[position + 2] === "[") {
|
|
44
70
|
const closeIndex = xml.indexOf("]]>", position);
|
|
45
|
-
if (closeIndex === -1)
|
|
71
|
+
if (closeIndex === -1)
|
|
72
|
+
return {
|
|
73
|
+
token: { type: TokenType.MALFORMED, raw: xml.slice(position) },
|
|
74
|
+
end: xml.length
|
|
75
|
+
};
|
|
46
76
|
const end2 = closeIndex + 3;
|
|
47
|
-
return {
|
|
77
|
+
return {
|
|
78
|
+
token: { type: TokenType.CDATA, value: xml.slice(position + 9, end2 - 3) },
|
|
79
|
+
end: end2
|
|
80
|
+
};
|
|
48
81
|
}
|
|
49
82
|
const end = xml.indexOf(">", position) + 1;
|
|
50
|
-
if (!end)
|
|
83
|
+
if (!end)
|
|
84
|
+
return {
|
|
85
|
+
token: { type: TokenType.MALFORMED, raw: xml.slice(position) },
|
|
86
|
+
end: xml.length
|
|
87
|
+
};
|
|
51
88
|
const raw = xml.slice(position + 1, end - 1).trim();
|
|
52
|
-
if (raw[0] === "/")
|
|
89
|
+
if (raw[0] === "/")
|
|
90
|
+
return {
|
|
91
|
+
token: { type: TokenType.ELEMENT_CLOSE, tag: raw.slice(1).trim() },
|
|
92
|
+
end
|
|
93
|
+
};
|
|
53
94
|
const selfClosing = raw[raw.length - 1] === "/";
|
|
54
95
|
const inner = selfClosing ? raw.slice(0, -1).trim() : raw;
|
|
55
96
|
const space = inner.search(WHITESPACE);
|
|
@@ -68,11 +109,11 @@ function tokenize(xml) {
|
|
|
68
109
|
return tokens;
|
|
69
110
|
}
|
|
70
111
|
|
|
71
|
-
// src/parser.ts
|
|
112
|
+
// src/modules/parser/parser.ts
|
|
72
113
|
function parse(xml) {
|
|
73
114
|
const tokens = tokenize(xml);
|
|
74
115
|
const cursor = { position: 0 };
|
|
75
|
-
const children = collectChildren(tokens, cursor, null);
|
|
116
|
+
const { children } = collectChildren(tokens, cursor, null);
|
|
76
117
|
return { type: "document", children };
|
|
77
118
|
}
|
|
78
119
|
function collectChildren(tokens, cursor, parentTag) {
|
|
@@ -81,33 +122,58 @@ function collectChildren(tokens, cursor, parentTag) {
|
|
|
81
122
|
const token = tokens[cursor.position];
|
|
82
123
|
cursor.position++;
|
|
83
124
|
if (token.type === TokenType.ELEMENT_CLOSE) {
|
|
84
|
-
if (token.tag === parentTag) return children;
|
|
85
|
-
children.push({
|
|
125
|
+
if (token.tag === parentTag) return { children, closed: true };
|
|
126
|
+
children.push({
|
|
127
|
+
type: "malformed",
|
|
128
|
+
raw: `</${token.tag}>`,
|
|
129
|
+
malformed: true
|
|
130
|
+
});
|
|
86
131
|
continue;
|
|
87
132
|
}
|
|
88
133
|
if (token.type === TokenType.ELEMENT_OPEN) {
|
|
89
|
-
const
|
|
134
|
+
const { children: elementChildren, closed } = collectChildren(
|
|
135
|
+
tokens,
|
|
136
|
+
cursor,
|
|
137
|
+
token.tag
|
|
138
|
+
);
|
|
139
|
+
const node = {
|
|
140
|
+
type: "element",
|
|
141
|
+
tag: token.tag,
|
|
142
|
+
attributes: token.attributes,
|
|
143
|
+
children: elementChildren
|
|
144
|
+
};
|
|
145
|
+
if (!closed) node.malformed = true;
|
|
90
146
|
children.push(node);
|
|
91
147
|
continue;
|
|
92
148
|
}
|
|
93
149
|
children.push(tokenToNode(token));
|
|
94
150
|
}
|
|
95
|
-
|
|
96
|
-
children.forEach((child) => {
|
|
97
|
-
child.malformed = true;
|
|
98
|
-
});
|
|
99
|
-
}
|
|
100
|
-
return children;
|
|
151
|
+
return { children, closed: parentTag === null };
|
|
101
152
|
}
|
|
102
153
|
function tokenToNode(token) {
|
|
103
|
-
if (token.type === TokenType.TEXT)
|
|
104
|
-
|
|
105
|
-
if (token.type === TokenType.
|
|
106
|
-
|
|
107
|
-
if (token.type === TokenType.
|
|
108
|
-
|
|
109
|
-
|
|
154
|
+
if (token.type === TokenType.TEXT)
|
|
155
|
+
return { type: "text", value: token.value };
|
|
156
|
+
if (token.type === TokenType.COMMENT)
|
|
157
|
+
return { type: "comment", value: token.value };
|
|
158
|
+
if (token.type === TokenType.CDATA)
|
|
159
|
+
return { type: "cdata", value: token.value };
|
|
160
|
+
if (token.type === TokenType.SELF_CLOSING)
|
|
161
|
+
return {
|
|
162
|
+
type: "element",
|
|
163
|
+
tag: token.tag,
|
|
164
|
+
attributes: token.attributes,
|
|
165
|
+
children: []
|
|
166
|
+
};
|
|
167
|
+
if (token.type === TokenType.PROCESSING_INSTRUCTION)
|
|
168
|
+
return {
|
|
169
|
+
type: "processing-instruction",
|
|
170
|
+
target: token.target,
|
|
171
|
+
attributes: token.attributes
|
|
172
|
+
};
|
|
173
|
+
return { type: "malformed", raw: token.raw, malformed: true };
|
|
110
174
|
}
|
|
111
175
|
export {
|
|
112
|
-
|
|
176
|
+
TokenType,
|
|
177
|
+
parse,
|
|
178
|
+
tokenize
|
|
113
179
|
};
|