xml-to-html-converter 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +161 -0
- package/package.json +24 -9
- package/src/index.js +3 -0
- package/src/parser.js +48 -0
- package/src/tokenizer.js +79 -0
package/README.md
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# xml-to-html-converter
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+

|
|
6
|
+

|
|
7
|
+

|
|
8
|
+
|
|
9
|
+
A zero-dependency Node.js package for converting XML to HTML. Currently in pre-1.0.0 development, building the foundation one functional part at a time. Full XML-to-HTML conversion is the goal of `v1.0.0`; for now, I am building zero-dependency tools that will eventually be used to convert an XML document to HTML.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
> **Where I am right now**
|
|
14
|
+
>
|
|
15
|
+
> The goal of this package is to take any XML document and convert it to HTML. Currently, I am building the foundation: taking an XML document of any kind and turning it into a JSON object, otherwise known as a document node.
|
|
16
|
+
>
|
|
17
|
+
> What `v0.x` builds is the foundation that makes `v1.0.0` possible:
|
|
18
|
+
>
|
|
19
|
+
> - **A parser** takes any XML string and returns a plain JS document tree
|
|
20
|
+
> - **A fault-tolerant verifier** flags every broken node with `malformed: true` in-place, giving you a complete map of exactly where and what broke
|
|
21
|
+
> - **An intermediate representation** is a plain JS object tree that the HTML converter will walk when it's built
|
|
22
|
+
>
|
|
23
|
+
> `v1.0.0` is when this package becomes what it says it is: a full XML-to-HTML converter. Everything before that is the work to get there.
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Install
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
npm install xml-to-html-converter
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Usage
|
|
36
|
+
|
|
37
|
+
```js
|
|
38
|
+
import { parse } from "xml-to-html-converter";
|
|
39
|
+
|
|
40
|
+
const tree = parse(`
|
|
41
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
42
|
+
<bookstore>
|
|
43
|
+
<book category="cooking">
|
|
44
|
+
<title lang="en">Everyday Italian</title>
|
|
45
|
+
<author>Giada De Laurentiis</author>
|
|
46
|
+
</book>
|
|
47
|
+
</bookstore>
|
|
48
|
+
`);
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
`parse` returns a document node wrapping the full tree:
|
|
52
|
+
|
|
53
|
+
```json
|
|
54
|
+
{
|
|
55
|
+
"type": "document",
|
|
56
|
+
"children": [
|
|
57
|
+
{
|
|
58
|
+
"type": "processing-instruction",
|
|
59
|
+
"target": "xml",
|
|
60
|
+
"attributes": { "version": "1.0", "encoding": "UTF-8" }
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"type": "element",
|
|
64
|
+
"tag": "bookstore",
|
|
65
|
+
"attributes": {},
|
|
66
|
+
"children": [
|
|
67
|
+
{
|
|
68
|
+
"type": "element",
|
|
69
|
+
"tag": "book",
|
|
70
|
+
"attributes": { "category": "cooking" },
|
|
71
|
+
"children": [
|
|
72
|
+
{
|
|
73
|
+
"type": "element",
|
|
74
|
+
"tag": "title",
|
|
75
|
+
"attributes": { "lang": "en" },
|
|
76
|
+
"children": [{ "type": "text", "value": "Everyday Italian" }]
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
"type": "element",
|
|
80
|
+
"tag": "author",
|
|
81
|
+
"attributes": {},
|
|
82
|
+
"children": [{ "type": "text", "value": "Giada De Laurentiis" }]
|
|
83
|
+
}
|
|
84
|
+
]
|
|
85
|
+
}
|
|
86
|
+
]
|
|
87
|
+
}
|
|
88
|
+
]
|
|
89
|
+
}
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Node Types
|
|
95
|
+
|
|
96
|
+
Every node in the tree has a `type` field.
|
|
97
|
+
|
|
98
|
+
| Type | Properties |
|
|
99
|
+
| ------------------------ | ------------------------------- |
|
|
100
|
+
| `document` | `children` |
|
|
101
|
+
| `element` | `tag`, `attributes`, `children` |
|
|
102
|
+
| `text` | `value` |
|
|
103
|
+
| `comment` | `value` |
|
|
104
|
+
| `cdata` | `value` |
|
|
105
|
+
| `processing-instruction` | `target`, `attributes` |
|
|
106
|
+
| `malformed` | `raw`, `malformed: true` |
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Malformed XML
|
|
111
|
+
|
|
112
|
+
The parser never throws. No matter what the input looks like, it always returns a complete document tree. I built this because, when working with QTI and XSD validation, it was a pain to get things working and I could never get past the industry-standard packages — and falling down a Java rabbit hole was not something I enjoyed AT ALL. Therefore, malformed structures are flagged with `malformed: true` in place and the walk continues. The tree is built no matter what your XML looks like.
|
|
113
|
+
|
|
114
|
+
Three types of malformed input are caught:
|
|
115
|
+
|
|
116
|
+
- **Unclosed tags** - a tag that opens but never closes gets `malformed: true`, its children are still collected normally
|
|
117
|
+
- **Stray closing tags** - a `</tag>` with no matching open becomes a `{ type: 'malformed', raw: '...</tag>', malformed: true }` node at that position
|
|
118
|
+
- **Unclosed brackets** - a `<` with no matching `>` before end of string captures the remainder as a malformed node
|
|
119
|
+
|
|
120
|
+
```js
|
|
121
|
+
const tree = parse("<root><unclosed><valid>text</valid></root>");
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
```json
|
|
125
|
+
{
|
|
126
|
+
"type": "document",
|
|
127
|
+
"children": [
|
|
128
|
+
{
|
|
129
|
+
"type": "element",
|
|
130
|
+
"tag": "root",
|
|
131
|
+
"attributes": {},
|
|
132
|
+
"children": [
|
|
133
|
+
{
|
|
134
|
+
"type": "element",
|
|
135
|
+
"tag": "unclosed",
|
|
136
|
+
"attributes": {},
|
|
137
|
+
"malformed": true,
|
|
138
|
+
"children": [
|
|
139
|
+
{
|
|
140
|
+
"type": "element",
|
|
141
|
+
"tag": "valid",
|
|
142
|
+
"attributes": {},
|
|
143
|
+
"children": [{ "type": "text", "value": "text" }]
|
|
144
|
+
}
|
|
145
|
+
]
|
|
146
|
+
}
|
|
147
|
+
]
|
|
148
|
+
}
|
|
149
|
+
]
|
|
150
|
+
}
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Requirements
|
|
156
|
+
|
|
157
|
+
Node.js `>=18.0.0`
|
|
158
|
+
|
|
159
|
+
## License
|
|
160
|
+
|
|
161
|
+
ISC
|
package/package.json
CHANGED
|
@@ -1,21 +1,36 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "xml-to-html-converter",
|
|
3
|
-
"version": "0.0
|
|
4
|
-
"description": "",
|
|
5
|
-
"
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Zero dependency XML to HTML converter for Node environments",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "src/index.js",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": "./src/index.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"src/"
|
|
12
|
+
],
|
|
13
|
+
"engines": {
|
|
14
|
+
"node": ">=18.0.0"
|
|
15
|
+
},
|
|
6
16
|
"scripts": {
|
|
7
|
-
"test": "
|
|
17
|
+
"test": "node --test tests/tokenizer.test.js tests/parser.test.js"
|
|
8
18
|
},
|
|
19
|
+
"keywords": [
|
|
20
|
+
"xml",
|
|
21
|
+
"html",
|
|
22
|
+
"converter",
|
|
23
|
+
"parser",
|
|
24
|
+
"tree"
|
|
25
|
+
],
|
|
26
|
+
"author": "",
|
|
27
|
+
"license": "ISC",
|
|
9
28
|
"repository": {
|
|
10
29
|
"type": "git",
|
|
11
30
|
"url": "git+https://github.com/jpatterson933/xml-2-html.git"
|
|
12
31
|
},
|
|
13
|
-
"keywords": [],
|
|
14
|
-
"author": "",
|
|
15
|
-
"license": "ISC",
|
|
16
|
-
"type": "commonjs",
|
|
17
32
|
"bugs": {
|
|
18
33
|
"url": "https://github.com/jpatterson933/xml-2-html/issues"
|
|
19
34
|
},
|
|
20
35
|
"homepage": "https://github.com/jpatterson933/xml-2-html#readme"
|
|
21
|
-
}
|
|
36
|
+
}
|
package/src/index.js
ADDED
package/src/parser.js
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { tokenize, TokenType } from './tokenizer.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Parse an XML string into a plain-object document tree.
 * Never throws: malformed structures are flagged in-place with `malformed: true`.
 * @param {string} xml - Raw XML source text.
 * @returns {{type: 'document', children: object[]}} Root document node wrapping the tree.
 */
function parse(xml) {
  const cursor = { position: 0 };
  return {
    type: 'document',
    children: collectChildren(tokenize(xml), cursor, null),
  };
}
|
|
9
|
+
|
|
10
|
+
/**
 * Consume tokens from `cursor` and build child nodes until the matching close
 * tag for `parentTag` is found (or the tokens run out).
 *
 * Fault tolerance (never throws):
 * - A close tag matching an *ancestor* open tag implicitly closes every
 *   element opened after it; those implicitly-closed elements are flagged
 *   `malformed: true` while their already-collected children stay intact.
 * - A close tag matching nothing on the open stack becomes an in-place
 *   `{ type: 'malformed', raw: '</tag>', malformed: true }` node.
 * - Elements still open when the tokens end are flagged `malformed: true`;
 *   their children are kept as-is.
 *
 * @param {object[]} tokens - Token stream produced by tokenize().
 * @param {{position: number}} cursor - Shared read position, advanced in place.
 * @param {?string} parentTag - Tag whose close ends collection; null at document level.
 * @returns {object[]} Child nodes collected for `parentTag`.
 */
function collectChildren(tokens, cursor, parentTag) {
  // Explicit open-element stack; frame 0 stands in for `parentTag` itself
  // and carries no node (the caller owns that element, if any).
  const base = { tag: parentTag, children: [] };
  const stack = [base];

  while (cursor.position < tokens.length) {
    const token = tokens[cursor.position];
    cursor.position++;
    const top = stack[stack.length - 1];

    if (token.type === TokenType.ELEMENT_OPEN) {
      const node = { type: 'element', tag: token.tag, attributes: token.attributes, children: [] };
      top.children.push(node);
      stack.push({ tag: token.tag, children: node.children, node });
      continue;
    }

    if (token.type === TokenType.ELEMENT_CLOSE) {
      // Find the nearest open element this close tag belongs to.
      const matched = stack.findLastIndex((frame) => frame.tag === token.tag);

      if (matched === -1) {
        // Stray close with no matching open: record it where it appeared.
        top.children.push({ type: 'malformed', raw: `</${token.tag}>`, malformed: true });
        continue;
      }

      // Everything opened after the matched frame was never closed.
      while (stack.length - 1 > matched) {
        stack.pop().node.malformed = true;
      }

      if (matched === 0) return base.children; // closes `parentTag` itself

      stack.pop(); // the matched element closes cleanly
      continue;
    }

    top.children.push(tokenToNode(token));
  }

  // Tokens ran out: every element still on the stack is unclosed.
  for (const frame of stack) {
    if (frame.node) frame.node.malformed = true;
  }
  return base.children;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Convert a leaf token (anything other than an element open/close pair)
 * into its corresponding tree node. Unknown token types fall through to a
 * malformed node carrying the raw text.
 * @param {object} token - A single token from tokenize().
 * @returns {object} Tree node for the token.
 */
function tokenToNode(token) {
  switch (token.type) {
    case TokenType.TEXT:
      return { type: 'text', value: token.value };
    case TokenType.COMMENT:
      return { type: 'comment', value: token.value };
    case TokenType.CDATA:
      return { type: 'cdata', value: token.value };
    case TokenType.SELF_CLOSING:
      return { type: 'element', tag: token.tag, attributes: token.attributes, children: [] };
    case TokenType.PROCESSING_INSTRUCTION:
      return { type: 'processing-instruction', target: token.target, attributes: token.attributes };
    default:
      return { type: 'malformed', raw: token.raw, malformed: true };
  }
}
|
|
47
|
+
|
|
48
|
+
export { parse };
|
package/src/tokenizer.js
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
 * Token type tags emitted by the tokenizer. These string values also appear
 * as the `type` field on the nodes the parser builds, so they are part of
 * the public tree shape. Frozen so no consumer can mutate the shared constant.
 */
const TokenType = Object.freeze({
  PROCESSING_INSTRUCTION: 'processing-instruction',
  ELEMENT_OPEN: 'element-open',
  ELEMENT_CLOSE: 'element-close',
  SELF_CLOSING: 'self-closing',
  TEXT: 'text',
  COMMENT: 'comment',
  CDATA: 'cdata',
  MALFORMED: 'malformed',
});
|
|
11
|
+
|
|
12
|
+
// Matches a single whitespace character; used to split a tag name from its attributes.
const WHITESPACE = /\s/;
// Matches name="value" or name='value' pairs inside a tag. Note: this regex is
// stateful (/g flag advances lastIndex); callers must reset lastIndex before use.
const ATTRIBUTE_PATTERN = /(\S+?)\s*=\s*["']([^"']*)["']/g;
|
|
14
|
+
|
|
15
|
+
/**
 * Extract name="value" / name='value' pairs from the inside of a tag.
 * Later duplicates of the same name overwrite earlier ones. Uses matchAll
 * with a fresh regex literal per call, so there is no shared `lastIndex`
 * state to reset between invocations.
 * @param {string} raw - Tag innards, e.g. `book category="cooking"`.
 * @returns {Object<string, string>} Attribute name → value map (empty when none).
 */
function parseAttributes(raw) {
  const attributes = {};
  for (const match of raw.matchAll(/(\S+?)\s*=\s*["']([^"']*)["']/g)) {
    attributes[match[1]] = match[2];
  }
  return attributes;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Read one token starting at `position` in `xml`.
 *
 * Returns `{ token, end }` where `end` is the index to resume scanning at
 * (always > position, so tokenize() makes forward progress). `token` is null
 * for whitespace-only text runs. Never throws: any construct whose terminator
 * ('?>', '-->', ']]>' or '>') is missing before the end of the string becomes
 * a MALFORMED token capturing the remainder.
 *
 * BUG FIXED: the previous version computed `indexOf(...) + N` for PI, comment
 * and CDATA terminators; on a miss (indexOf → -1) that produced a small truthy
 * `end` (1 or 2), defeating the `end || xml.length` fallback, slicing garbage,
 * and rewinding the scan position into an infinite loop.
 *
 * @param {string} xml - Full source string.
 * @param {number} position - Index to start reading from.
 * @returns {{token: ?object, end: number}} The token (or null) and resume index.
 */
function nextToken(xml, position) {
  // Text run: everything up to the next '<' (or end of string).
  if (xml[position] !== '<') {
    const nextBracket = xml.indexOf('<', position);
    const stop = nextBracket === -1 ? xml.length : nextBracket;
    const value = xml.slice(position, stop);
    // Whitespace-only runs produce no token at all.
    return { token: value.trim() ? { type: TokenType.TEXT, value } : null, end: stop };
  }

  const marker = xml[position + 1];

  // Processing instruction: <?target attrs?>
  if (marker === '?') {
    const close = xml.indexOf('?>', position);
    if (close === -1) {
      // Unterminated: capture the remainder as malformed.
      return { token: { type: TokenType.MALFORMED, raw: xml.slice(position) }, end: xml.length };
    }
    const inner = xml.slice(position + 2, close).trim();
    const space = inner.search(WHITESPACE);
    const target = space === -1 ? inner : inner.slice(0, space);
    return { token: { type: TokenType.PROCESSING_INSTRUCTION, target, attributes: parseAttributes(inner) }, end: close + 2 };
  }

  // Comment: <!-- ... -->  ('<!--' is 4 characters)
  if (marker === '!' && xml[position + 2] === '-') {
    const close = xml.indexOf('-->', position);
    if (close === -1) {
      return { token: { type: TokenType.MALFORMED, raw: xml.slice(position) }, end: xml.length };
    }
    return { token: { type: TokenType.COMMENT, value: xml.slice(position + 4, close) }, end: close + 3 };
  }

  // CDATA: <![CDATA[ ... ]]>  ('<![CDATA[' is 9 characters)
  if (marker === '!' && xml[position + 2] === '[') {
    const close = xml.indexOf(']]>', position);
    if (close === -1) {
      return { token: { type: TokenType.MALFORMED, raw: xml.slice(position) }, end: xml.length };
    }
    return { token: { type: TokenType.CDATA, value: xml.slice(position + 9, close) }, end: close + 3 };
  }

  // Element tag: open, close, or self-closing.
  const close = xml.indexOf('>', position);
  if (close === -1) {
    // '<' with no matching '>': the remainder is malformed.
    return { token: { type: TokenType.MALFORMED, raw: xml.slice(position) }, end: xml.length };
  }

  const raw = xml.slice(position + 1, close).trim();
  if (raw[0] === '/') {
    return { token: { type: TokenType.ELEMENT_CLOSE, tag: raw.slice(1).trim() }, end: close + 1 };
  }

  const selfClosing = raw[raw.length - 1] === '/';
  const inner = selfClosing ? raw.slice(0, -1).trim() : raw;
  const space = inner.search(WHITESPACE);
  const tag = space === -1 ? inner : inner.slice(0, space);
  const type = selfClosing ? TokenType.SELF_CLOSING : TokenType.ELEMENT_OPEN;
  return { token: { type, tag, attributes: parseAttributes(inner) }, end: close + 1 };
}
|
|
65
|
+
|
|
66
|
+
/**
 * Split an XML string into a flat token list. Whitespace-only text runs are
 * dropped; everything else (including malformed fragments) becomes a token,
 * in document order.
 * @param {string} xml - Raw XML source text.
 * @returns {object[]} Tokens in the order they appear.
 */
function tokenize(xml) {
  const tokens = [];

  for (let position = 0; position < xml.length; ) {
    const { token, end } = nextToken(xml, position);
    if (token !== null) {
      tokens.push(token);
    }
    position = end;
  }

  return tokens;
}
|
|
78
|
+
|
|
79
|
+
export { tokenize, TokenType };
|