xml-to-html-converter 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,161 @@
1
+ # xml-to-html-converter
2
+
3
+ ![npm version](https://img.shields.io/npm/v/xml-to-html-converter)
4
+ ![npm downloads](https://img.shields.io/npm/dw/xml-to-html-converter)
5
+ ![license](https://img.shields.io/npm/l/xml-to-html-converter)
6
+ ![node version](https://img.shields.io/node/v/xml-to-html-converter)
7
+ ![zero dependencies](https://img.shields.io/badge/dependencies-0-brightgreen)
8
+
9
+ A zero-dependency Node.js package for converting XML to HTML. Currently in pre-1.0.0 development, building the foundation one functional part at a time. Full XML-to-HTML conversion is the goal of `v1.0.0`; until then, I am building the zero-dependency tools that will eventually be used to convert XML to HTML.
10
+
11
+ ---
12
+
13
+ > **Where I am right now**
14
+ >
15
+ > The goal of this package is to take any XML document and convert it to HTML. Currently, I am building the foundation: taking an XML document of any kind and turning it into a plain JavaScript object tree rooted at a document node.
16
+ >
17
+ > What `v0.x` builds is the foundation that makes `v1.0.0` possible:
18
+ >
19
+ > - **A parser** takes any XML string and returns a plain JS document tree
20
+ > - **A fault-tolerant verifier** flags every broken node with `malformed: true` in-place, giving you a complete map of exactly where and what broke
21
+ > - **An intermediate representation** is a plain JS object tree that the HTML converter will walk when it's built
22
+ >
23
+ > `v1.0.0` is when this package becomes what it says it is: a full XML-to-HTML converter. Everything before that is the work to get there.
24
+
25
+ ---
26
+
27
+ ## Install
28
+
29
+ ```bash
30
+ npm install xml-to-html-converter
31
+ ```
32
+
33
+ ---
34
+
35
+ ## Usage
36
+
37
+ ```js
38
+ import { parse } from "xml-to-html-converter";
39
+
40
+ const tree = parse(`
41
+ <?xml version="1.0" encoding="UTF-8"?>
42
+ <bookstore>
43
+ <book category="cooking">
44
+ <title lang="en">Everyday Italian</title>
45
+ <author>Giada De Laurentiis</author>
46
+ </book>
47
+ </bookstore>
48
+ `);
49
+ ```
50
+
51
+ `parse` returns a document node wrapping the full tree:
52
+
53
+ ```json
54
+ {
55
+ "type": "document",
56
+ "children": [
57
+ {
58
+ "type": "processing-instruction",
59
+ "target": "xml",
60
+ "attributes": { "version": "1.0", "encoding": "UTF-8" }
61
+ },
62
+ {
63
+ "type": "element",
64
+ "tag": "bookstore",
65
+ "attributes": {},
66
+ "children": [
67
+ {
68
+ "type": "element",
69
+ "tag": "book",
70
+ "attributes": { "category": "cooking" },
71
+ "children": [
72
+ {
73
+ "type": "element",
74
+ "tag": "title",
75
+ "attributes": { "lang": "en" },
76
+ "children": [{ "type": "text", "value": "Everyday Italian" }]
77
+ },
78
+ {
79
+ "type": "element",
80
+ "tag": "author",
81
+ "attributes": {},
82
+ "children": [{ "type": "text", "value": "Giada De Laurentiis" }]
83
+ }
84
+ ]
85
+ }
86
+ ]
87
+ }
88
+ ]
89
+ }
90
+ ```
91
+
92
+ ---
93
+
94
+ ## Node Types
95
+
96
+ Every node in the tree has a `type` field.
97
+
98
+ | Type | Properties |
99
+ | ------------------------ | ------------------------------- |
100
+ | `document` | `children` |
101
+ | `element` | `tag`, `attributes`, `children` |
102
+ | `text` | `value` |
103
+ | `comment` | `value` |
104
+ | `cdata` | `value` |
105
+ | `processing-instruction` | `target`, `attributes` |
106
+ | `malformed` | `raw`, `malformed: true` |
107
+
108
+ ---
109
+
110
+ ## Malformed XML
111
+
112
+ The parser never throws. No matter what the input looks like, it always returns a complete document tree. I built this because, when working with QTI and XSD validation, it was a pain to get things working and I could never get past the limitations of the current industry-standard packages. Additionally, falling down a Java rabbit hole was not something I enjoyed AT ALL. Therefore, malformed structures are flagged with `malformed: true` in-place and the walk continues. The tree is built no matter what your XML looks like.
113
+
114
+ Three types of malformed input are caught:
115
+
116
+ - **Unclosed tags** - a tag that opens but never closes gets `malformed: true`, its children are still collected normally
117
+ - **Stray closing tags** - a `</tag>` with no matching open becomes a `{ type: 'malformed', raw: '...</tag>', malformed: true }` node at that position
118
+ - **Unclosed brackets** - a `<` with no matching `>` before end of string captures the remainder as a malformed node
119
+
120
+ ```js
121
+ const tree = parse("<root><unclosed><valid>text</valid></root>");
122
+ ```
123
+
124
+ ```json
125
+ {
126
+ "type": "document",
127
+ "children": [
128
+ {
129
+ "type": "element",
130
+ "tag": "root",
131
+ "attributes": {},
132
+ "children": [
133
+ {
134
+ "type": "element",
135
+ "tag": "unclosed",
136
+ "attributes": {},
137
+ "malformed": true,
138
+ "children": [
139
+ {
140
+ "type": "element",
141
+ "tag": "valid",
142
+ "attributes": {},
143
+ "children": [{ "type": "text", "value": "text" }]
144
+ }
145
+ ]
146
+ }
147
+ ]
148
+ }
149
+ ]
150
+ }
151
+ ```
152
+
153
+ ---
154
+
155
+ ## Requirements
156
+
157
+ Node.js `>=18.0.0`
158
+
159
+ ## License
160
+
161
+ ISC
package/package.json CHANGED
@@ -1,21 +1,36 @@
1
1
  {
2
2
  "name": "xml-to-html-converter",
3
- "version": "0.0.1",
4
- "description": "",
5
- "main": "index.js",
3
+ "version": "0.1.0",
4
+ "description": "Zero dependency XML to HTML converter for Node environments",
5
+ "type": "module",
6
+ "main": "src/index.js",
7
+ "exports": {
8
+ ".": "./src/index.js"
9
+ },
10
+ "files": [
11
+ "src/"
12
+ ],
13
+ "engines": {
14
+ "node": ">=18.0.0"
15
+ },
6
16
  "scripts": {
7
- "test": "echo \"Error: no test specified\" && exit 1"
17
+ "test": "node --test tests/tokenizer.test.js tests/parser.test.js"
8
18
  },
19
+ "keywords": [
20
+ "xml",
21
+ "html",
22
+ "converter",
23
+ "parser",
24
+ "tree"
25
+ ],
26
+ "author": "",
27
+ "license": "ISC",
9
28
  "repository": {
10
29
  "type": "git",
11
30
  "url": "git+https://github.com/jpatterson933/xml-2-html.git"
12
31
  },
13
- "keywords": [],
14
- "author": "",
15
- "license": "ISC",
16
- "type": "commonjs",
17
32
  "bugs": {
18
33
  "url": "https://github.com/jpatterson933/xml-2-html/issues"
19
34
  },
20
35
  "homepage": "https://github.com/jpatterson933/xml-2-html#readme"
21
- }
36
+ }
package/src/index.js ADDED
@@ -0,0 +1,3 @@
1
+ import { parse } from './parser.js';
2
+
3
+ export { parse };
package/src/parser.js ADDED
@@ -0,0 +1,48 @@
1
+ import { tokenize, TokenType } from './tokenizer.js';
2
+
3
/**
 * Parses an XML string into a document tree.
 *
 * The input is tokenized, then the token stream is folded into nested nodes
 * starting from the top level (no enclosing element, hence the `null` parent).
 *
 * @param {string} xml - Raw XML source text.
 * @returns {{ type: 'document', children: object[] }} Document node wrapping the top-level nodes.
 */
function parse(xml) {
  // A single shared cursor is advanced by collectChildren as tokens are consumed.
  const cursor = { position: 0 };

  return {
    type: 'document',
    children: collectChildren(tokenize(xml), cursor, null),
  };
}
9
+
10
/**
 * Folds tokens into nested nodes, starting at `cursor.position`, and returns
 * the children of the current scope.
 *
 * Collection stops when a closing tag matching `parentTag` is consumed, or at
 * the end of the token list. The walk never throws on malformed structure:
 *
 * - An element that is still open when one of its ancestors closes, or when
 *   the input ends, is flagged with `malformed: true`; its children are kept
 *   as collected and are not flagged themselves.
 * - A closing tag that matches no open element becomes a
 *   `{ type: 'malformed', raw, malformed: true }` node at that position.
 *
 * The implementation is iterative (explicit stack of open elements) so that
 * deeply nested input cannot exhaust the call stack.
 *
 * @param {object[]} tokens - Token list produced by the tokenizer.
 * @param {{ position: number }} cursor - Shared read position; advanced in place.
 * @param {string|null} parentTag - Tag whose closing token ends collection, or `null` at document level.
 * @returns {object[]} The collected child nodes.
 */
function collectChildren(tokens, cursor, parentTag) {
  // Sentinel frame for the scope being collected. At document level its tag is
  // null, which no closing token can match, so only end-of-input ends the walk.
  const scope = { tag: parentTag, children: [] };
  const openElements = [scope];

  while (cursor.position < tokens.length) {
    const token = tokens[cursor.position];
    cursor.position++;
    const current = openElements[openElements.length - 1];

    if (token.type === TokenType.ELEMENT_CLOSE) {
      // Find the innermost open element this closing tag belongs to.
      let depth = openElements.length - 1;
      while (depth >= 0 && openElements[depth].tag !== token.tag) depth--;

      if (depth === -1) {
        // No matching open element anywhere: stray closing tag.
        current.children.push({ type: 'malformed', raw: `</${token.tag}>`, malformed: true });
        continue;
      }

      // Everything opened inside the matched element was never closed.
      for (let i = openElements.length - 1; i > depth; i--) {
        openElements[i].malformed = true;
      }
      openElements.length = depth;

      if (depth === 0) return scope.children;
      continue;
    }

    if (token.type === TokenType.ELEMENT_OPEN) {
      const node = { type: 'element', tag: token.tag, attributes: token.attributes, children: [] };
      current.children.push(node);
      openElements.push(node);
      continue;
    }

    current.children.push(tokenToNode(token));
  }

  // End of input: every element still open (sentinel excluded) is unclosed.
  for (let i = 1; i < openElements.length; i++) {
    openElements[i].malformed = true;
  }

  return scope.children;
}
38
+
39
/**
 * Converts a single leaf token into its tree node.
 *
 * Text, comment and CDATA tokens carry a `value`; self-closing tokens become
 * childless elements; processing instructions keep `target` and `attributes`.
 * Any other token type is emitted as a malformed node holding the token's `raw`.
 *
 * @param {object} token - A token other than element-open / element-close.
 * @returns {object} The corresponding node.
 */
function tokenToNode(token) {
  switch (token.type) {
    case TokenType.TEXT:
      return { type: 'text', value: token.value };

    case TokenType.COMMENT:
      return { type: 'comment', value: token.value };

    case TokenType.CDATA:
      return { type: 'cdata', value: token.value };

    case TokenType.SELF_CLOSING:
      return { type: 'element', tag: token.tag, attributes: token.attributes, children: [] };

    case TokenType.PROCESSING_INSTRUCTION:
      return { type: 'processing-instruction', target: token.target, attributes: token.attributes };

    default:
      return { type: 'malformed', raw: token.raw, malformed: true };
  }
}
47
+
48
+ export { parse };
@@ -0,0 +1,79 @@
1
/**
 * Token kinds produced by the tokenizer.
 *
 * Frozen because the object is exported and shared between modules: an
 * accidental write by a consumer must not change how tokens are classified.
 */
const TokenType = Object.freeze({
  PROCESSING_INSTRUCTION: 'processing-instruction',
  ELEMENT_OPEN: 'element-open',
  ELEMENT_CLOSE: 'element-close',
  SELF_CLOSING: 'self-closing',
  TEXT: 'text',
  COMMENT: 'comment',
  CDATA: 'cdata',
  MALFORMED: 'malformed',
});
11
+
12
const WHITESPACE = /\s/;

// name = "value" or name = 'value'. Each quote style is its own alternative so
// that a value may contain the other quote character (title="it's") and so
// that the closing delimiter must match the opening one.
const ATTRIBUTE_PATTERN = /([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;

/**
 * Extracts `name="value"` / `name='value'` pairs from the inside of a tag.
 *
 * Text that is not part of a quoted assignment (such as the leading tag name)
 * is ignored. When a name occurs more than once, the last occurrence wins.
 *
 * @param {string} raw - Tag contents without the surrounding angle brackets.
 * @returns {Object<string, string>} Attribute values keyed by attribute name.
 */
function parseAttributes(raw) {
  const attributes = {};

  // matchAll iterates over a private copy of the regex, so the shared
  // pattern's lastIndex never leaks between calls.
  for (const [, name, doubleQuoted, singleQuoted] of raw.matchAll(ATTRIBUTE_PATTERN)) {
    attributes[name] = doubleQuoted ?? singleQuoted;
  }

  return attributes;
}
24
+
25
/**
 * Builds the token used when a construct is opened but never terminated:
 * everything from `position` to the end of the input becomes one malformed token.
 */
function malformedRemainder(xml, position) {
  return { token: { type: TokenType.MALFORMED, raw: xml.slice(position) }, end: xml.length };
}

/**
 * Reads one token starting at `position`.
 *
 * Recognises, in order: text up to the next `<`, processing instructions
 * (`<? ... ?>`), comments (`<!-- ... -->`), CDATA sections
 * (`<![CDATA[ ... ]]>`), closing tags, self-closing tags and opening tags.
 * Whitespace-only text yields a `null` token.
 *
 * Any construct whose terminator is missing produces a MALFORMED token that
 * captures the rest of the input. The returned `end` is always greater than
 * `position`, so the caller is guaranteed to make progress.
 *
 * @param {string} xml - Full XML source.
 * @param {number} position - Offset at which to start reading.
 * @returns {{ token: object|null, end: number }} The token (or null) and the offset just past it.
 */
function nextToken(xml, position) {
  if (xml[position] !== '<') {
    const nextBracket = xml.indexOf('<', position);
    const end = nextBracket === -1 ? xml.length : nextBracket;
    const value = xml.slice(position, end);
    return { token: value.trim() ? { type: TokenType.TEXT, value } : null, end };
  }

  if (xml.startsWith('<?', position)) {
    // Search after the opener so "<?" itself can never supply the terminator.
    const close = xml.indexOf('?>', position + 2);
    if (close === -1) return malformedRemainder(xml, position);

    const inner = xml.slice(position + 2, close).trim();
    const space = inner.search(WHITESPACE);
    const target = space === -1 ? inner : inner.slice(0, space);
    return {
      token: { type: TokenType.PROCESSING_INSTRUCTION, target, attributes: parseAttributes(inner) },
      end: close + 2,
    };
  }

  if (xml.startsWith('<!--', position)) {
    const close = xml.indexOf('-->', position + 4);
    if (close === -1) return malformedRemainder(xml, position);

    return { token: { type: TokenType.COMMENT, value: xml.slice(position + 4, close) }, end: close + 3 };
  }

  if (xml.startsWith('<![CDATA[', position)) {
    const close = xml.indexOf(']]>', position + 9);
    if (close === -1) return malformedRemainder(xml, position);

    return { token: { type: TokenType.CDATA, value: xml.slice(position + 9, close) }, end: close + 3 };
  }

  const bracket = xml.indexOf('>', position);
  if (bracket === -1) return malformedRemainder(xml, position);

  const end = bracket + 1;
  const raw = xml.slice(position + 1, bracket).trim();
  if (raw[0] === '/') return { token: { type: TokenType.ELEMENT_CLOSE, tag: raw.slice(1).trim() }, end };

  const selfClosing = raw[raw.length - 1] === '/';
  const inner = selfClosing ? raw.slice(0, -1).trim() : raw;
  const space = inner.search(WHITESPACE);
  const tag = space === -1 ? inner : inner.slice(0, space);
  const type = selfClosing ? TokenType.SELF_CLOSING : TokenType.ELEMENT_OPEN;

  return { token: { type, tag, attributes: parseAttributes(inner) }, end };
}
65
+
66
/**
 * Splits an XML string into a flat list of tokens.
 *
 * Repeatedly asks `nextToken` for the token at the current offset and jumps to
 * the offset it reports. Null tokens (whitespace-only text) are skipped.
 *
 * @param {string} xml - Raw XML source text.
 * @returns {object[]} Tokens in document order.
 */
function tokenize(xml) {
  const collected = [];

  for (let offset = 0; offset < xml.length; ) {
    const step = nextToken(xml, offset);
    if (step.token) collected.push(step.token);
    offset = step.end;
  }

  return collected;
}
78
+
79
+ export { tokenize, TokenType };