@tkeron/html-parser 1.1.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/.github/workflows/npm_deploy.yml +14 -4
  2. package/README.md +6 -6
  3. package/bun.lock +6 -8
  4. package/check-versions.ts +147 -0
  5. package/index.ts +4 -8
  6. package/package.json +5 -6
  7. package/src/dom-simulator/append-child.ts +130 -0
  8. package/src/dom-simulator/append.ts +18 -0
  9. package/src/dom-simulator/attributes.ts +23 -0
  10. package/src/dom-simulator/clone-node.ts +51 -0
  11. package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
  12. package/src/dom-simulator/create-cdata.ts +18 -0
  13. package/src/dom-simulator/create-comment.ts +23 -0
  14. package/src/dom-simulator/create-doctype.ts +24 -0
  15. package/src/dom-simulator/create-document.ts +81 -0
  16. package/src/dom-simulator/create-element.ts +195 -0
  17. package/src/dom-simulator/create-processing-instruction.ts +19 -0
  18. package/src/dom-simulator/create-temp-parent.ts +9 -0
  19. package/src/dom-simulator/create-text-node.ts +23 -0
  20. package/src/dom-simulator/escape-text-content.ts +6 -0
  21. package/src/dom-simulator/find-special-elements.ts +14 -0
  22. package/src/dom-simulator/get-text-content.ts +18 -0
  23. package/src/dom-simulator/index.ts +36 -0
  24. package/src/dom-simulator/inner-outer-html.ts +182 -0
  25. package/src/dom-simulator/insert-after.ts +20 -0
  26. package/src/dom-simulator/insert-before.ts +108 -0
  27. package/src/dom-simulator/matches.ts +26 -0
  28. package/src/dom-simulator/node-types.ts +26 -0
  29. package/src/dom-simulator/prepend.ts +24 -0
  30. package/src/dom-simulator/remove-child.ts +68 -0
  31. package/src/dom-simulator/remove.ts +7 -0
  32. package/src/dom-simulator/replace-child.ts +152 -0
  33. package/src/dom-simulator/set-text-content.ts +33 -0
  34. package/src/dom-simulator/update-element-content.ts +56 -0
  35. package/src/dom-simulator.ts +12 -1126
  36. package/src/encoding/constants.ts +8 -0
  37. package/src/encoding/detect-encoding.ts +21 -0
  38. package/src/encoding/index.ts +1 -0
  39. package/src/encoding/normalize-encoding.ts +6 -0
  40. package/src/html-entities.ts +2127 -0
  41. package/src/index.ts +5 -5
  42. package/src/parser/adoption-agency-helpers.ts +145 -0
  43. package/src/parser/constants.ts +137 -0
  44. package/src/parser/dom-to-ast.ts +79 -0
  45. package/src/parser/index.ts +9 -0
  46. package/src/parser/parse.ts +772 -0
  47. package/src/parser/types.ts +56 -0
  48. package/src/selectors/find-elements-descendant.ts +47 -0
  49. package/src/selectors/index.ts +2 -0
  50. package/src/selectors/matches-selector.ts +12 -0
  51. package/src/selectors/matches-token.ts +27 -0
  52. package/src/selectors/parse-selector.ts +48 -0
  53. package/src/selectors/query-selector-all.ts +43 -0
  54. package/src/selectors/query-selector.ts +6 -0
  55. package/src/selectors/types.ts +10 -0
  56. package/src/serializer/attributes.ts +74 -0
  57. package/src/serializer/escape.ts +13 -0
  58. package/src/serializer/index.ts +1 -0
  59. package/src/serializer/serialize-tokens.ts +511 -0
  60. package/src/tokenizer/calculate-position.ts +10 -0
  61. package/src/tokenizer/constants.ts +11 -0
  62. package/src/tokenizer/decode-entities.ts +64 -0
  63. package/src/tokenizer/index.ts +2 -0
  64. package/src/tokenizer/parse-attributes.ts +74 -0
  65. package/src/tokenizer/tokenize.ts +165 -0
  66. package/src/tokenizer/types.ts +25 -0
  67. package/tests/adoption-agency-helpers.test.ts +304 -0
  68. package/tests/advanced.test.ts +242 -221
  69. package/tests/cloneNode.test.ts +19 -66
  70. package/tests/custom-elements-head.test.ts +54 -55
  71. package/tests/dom-extended.test.ts +77 -64
  72. package/tests/dom-manipulation.test.ts +51 -24
  73. package/tests/dom.test.ts +15 -13
  74. package/tests/encoding/detect-encoding.test.ts +33 -0
  75. package/tests/google-dom.test.ts +2 -2
  76. package/tests/helpers/tokenizer-adapter.test.ts +29 -43
  77. package/tests/helpers/tokenizer-adapter.ts +36 -33
  78. package/tests/helpers/tree-adapter.test.ts +20 -20
  79. package/tests/helpers/tree-adapter.ts +34 -24
  80. package/tests/html-entities-text.test.ts +6 -2
  81. package/tests/innerhtml-void-elements.test.ts +52 -36
  82. package/tests/outerHTML-replacement.test.ts +37 -65
  83. package/tests/parser/dom-to-ast.test.ts +109 -0
  84. package/tests/parser/parse.test.ts +139 -0
  85. package/tests/parser.test.ts +281 -217
  86. package/tests/selectors/query-selector-all.test.ts +39 -0
  87. package/tests/selectors/query-selector.test.ts +42 -0
  88. package/tests/serializer/attributes.test.ts +132 -0
  89. package/tests/serializer/escape.test.ts +51 -0
  90. package/tests/serializer/serialize-tokens.test.ts +80 -0
  91. package/tests/serializer-core.test.ts +6 -6
  92. package/tests/serializer-injectmeta.test.ts +6 -6
  93. package/tests/serializer-optionaltags.test.ts +9 -6
  94. package/tests/serializer-options.test.ts +6 -6
  95. package/tests/serializer-whitespace.test.ts +6 -6
  96. package/tests/tokenizer/calculate-position.test.ts +34 -0
  97. package/tests/tokenizer/decode-entities.test.ts +31 -0
  98. package/tests/tokenizer/parse-attributes.test.ts +44 -0
  99. package/tests/tokenizer/tokenize.test.ts +757 -0
  100. package/tests/tokenizer-namedEntities.test.ts +10 -7
  101. package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
  102. package/tests/tokenizer.test.ts +268 -256
  103. package/tests/tree-construction-adoption01.test.ts +25 -16
  104. package/tests/tree-construction-adoption02.test.ts +30 -19
  105. package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
  106. package/tests/tree-construction-entities02.test.ts +18 -16
  107. package/tests/tree-construction-html5test-com.test.ts +16 -10
  108. package/tests/tree-construction-math.test.ts +11 -9
  109. package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
  110. package/tests/tree-construction-noscript01.test.ts +11 -9
  111. package/tests/tree-construction-ruby.test.ts +6 -4
  112. package/tests/tree-construction-scriptdata01.test.ts +6 -4
  113. package/tests/tree-construction-svg.test.ts +6 -4
  114. package/tests/tree-construction-template.test.ts +6 -4
  115. package/tests/tree-construction-tests10.test.ts +6 -4
  116. package/tests/tree-construction-tests11.test.ts +6 -4
  117. package/tests/tree-construction-tests20.test.ts +7 -4
  118. package/tests/tree-construction-tests21.test.ts +7 -4
  119. package/tests/tree-construction-tests23.test.ts +7 -4
  120. package/tests/tree-construction-tests24.test.ts +7 -4
  121. package/tests/tree-construction-tests5.test.ts +6 -5
  122. package/tests/tree-construction-tests6.test.ts +6 -5
  123. package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
  124. package/tests/void-elements.test.ts +85 -40
  125. package/tsconfig.json +1 -1
  126. package/src/css-selector.ts +0 -185
  127. package/src/encoding.ts +0 -39
  128. package/src/parser.ts +0 -682
  129. package/src/serializer.ts +0 -450
  130. package/src/tokenizer.ts +0 -325
  131. package/tests/selectors.test.ts +0 -128
package/src/tokenizer.ts DELETED
@@ -1,325 +0,0 @@
1
- export enum TokenType {
2
- TAG_OPEN = 'TAG_OPEN',
3
- TAG_CLOSE = 'TAG_CLOSE',
4
- TEXT = 'TEXT',
5
- COMMENT = 'COMMENT',
6
- CDATA = 'CDATA',
7
- DOCTYPE = 'DOCTYPE',
8
- PROCESSING_INSTRUCTION = 'PROCESSING_INSTRUCTION',
9
- EOF = 'EOF'
10
- }
11
-
12
- export interface Position {
13
- line: number;
14
- column: number;
15
- offset: number;
16
- }
17
-
18
- export interface Token {
19
- type: TokenType;
20
- value: string;
21
- position: Position;
22
- attributes?: Record<string, string>;
23
- isSelfClosing?: boolean;
24
- isClosing?: boolean;
25
- }
26
-
27
- import { allNamedEntities } from 'all-named-html-entities';
28
-
29
- const HTML_ENTITIES: Record<string, string> = allNamedEntities;
30
-
31
- function decodeEntities(text: string): string {
32
- let result = '';
33
- let i = 0;
34
- while (i < text.length) {
35
- if (text[i] === '&') {
36
- let match = '';
37
- let j = i + 1;
38
- if (text[j] === '#') {
39
- j++;
40
- if (text[j] === 'x' || text[j] === 'X') {
41
- j++;
42
- while (j < text.length && /[0-9a-fA-F]/.test(text[j])) {
43
- j++;
44
- }
45
- } else {
46
- while (j < text.length && /[0-9]/.test(text[j])) {
47
- j++;
48
- }
49
- }
50
- if (text[j] === ';') {
51
- j++;
52
- }
53
- match = text.substring(i, j);
54
- const entity = match;
55
- if (entity.startsWith('&#x') && entity.endsWith(';')) {
56
- const hex = entity.slice(3, -1);
57
- result += String.fromCharCode(parseInt(hex, 16));
58
- i = j;
59
- continue;
60
- } else if (entity.startsWith('&#') && entity.endsWith(';')) {
61
- const decimal = entity.slice(2, -1);
62
- result += String.fromCharCode(parseInt(decimal, 10));
63
- i = j;
64
- continue;
65
- }
66
- } else {
67
- while (j < text.length && /[a-zA-Z0-9]/.test(text[j])) {
68
- j++;
69
- }
70
- const hasSemi = text[j] === ';';
71
- if (hasSemi) {
72
- j++;
73
- }
74
- match = text.substring(i, j);
75
- const named = match.slice(1, hasSemi ? -1 : undefined);
76
- if (HTML_ENTITIES[named]) {
77
- if (hasSemi || (j < text.length && !/[a-zA-Z0-9]/.test(text[j]))) {
78
- result += HTML_ENTITIES[named];
79
- i = j;
80
- continue;
81
- }
82
- }
83
- }
84
- result += text[i];
85
- i++;
86
- } else {
87
- result += text[i];
88
- i++;
89
- }
90
- }
91
- return result.replace(/\u0000/g, '\uFFFD');
92
- }
93
-
94
- function parseAttributes(attributeString: string): Record<string, string> {
95
- const attributes: Record<string, string> = {};
96
- let i = 0;
97
-
98
- while (i < attributeString.length) {
99
- while (i < attributeString.length && /\s/.test(attributeString[i])) {
100
- i++;
101
- }
102
- if (i >= attributeString.length || attributeString[i] === '/' || attributeString[i] === '>') {
103
- break;
104
- }
105
-
106
- let name = '';
107
- while (i < attributeString.length && !/[\s=\/>]/.test(attributeString[i])) {
108
- name += attributeString[i];
109
- i++;
110
- }
111
-
112
- if (!name) {
113
- i++;
114
- continue;
115
- }
116
-
117
- while (i < attributeString.length && /\s/.test(attributeString[i])) {
118
- i++;
119
- }
120
-
121
- let value = '';
122
- if (i < attributeString.length && attributeString[i] === '=') {
123
- i++;
124
- while (i < attributeString.length && /\s/.test(attributeString[i])) {
125
- i++;
126
- }
127
-
128
- if (i < attributeString.length) {
129
- if (attributeString[i] === '"') {
130
- i++;
131
- while (i < attributeString.length && attributeString[i] !== '"') {
132
- value += attributeString[i];
133
- i++;
134
- }
135
- i++;
136
- } else if (attributeString[i] === "'") {
137
- i++;
138
- while (i < attributeString.length && attributeString[i] !== "'") {
139
- value += attributeString[i];
140
- i++;
141
- }
142
- i++;
143
- } else {
144
- while (i < attributeString.length && !/[\s>]/.test(attributeString[i])) {
145
- value += attributeString[i];
146
- i++;
147
- }
148
- }
149
- }
150
- }
151
-
152
- attributes[name.toLowerCase()] = decodeEntities(value);
153
- }
154
-
155
- return attributes;
156
- }
157
-
158
- function calculatePosition(text: string, offset: number): Position {
159
- const lines = text.slice(0, offset).split('\n');
160
- return {
161
- line: lines.length,
162
- column: lines[lines.length - 1]?.length ?? 0,
163
- offset
164
- };
165
- }
166
-
167
- const RAW_TEXT_ELEMENTS = new Set(['script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript']);
168
- const RCDATA_ELEMENTS = new Set(['textarea', 'title']);
169
-
170
- export function tokenize(html: string): Token[] {
171
- const tokens: Token[] = [];
172
- let currentPos = 0;
173
-
174
- while (currentPos < html.length) {
175
- const char = html[currentPos];
176
-
177
- if (char === '<') {
178
- const remaining = html.slice(currentPos);
179
-
180
- const doctypeMatch = remaining.match(/^<!DOCTYPE\s+[^>]*>/i);
181
- if (doctypeMatch) {
182
- const match = doctypeMatch[0];
183
- const nameMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
184
- tokens.push({
185
- type: TokenType.DOCTYPE,
186
- value: nameMatch && nameMatch[1] ? nameMatch[1].toLowerCase() : match,
187
- position: calculatePosition(html, currentPos)
188
- });
189
- currentPos += match.length;
190
- continue;
191
- }
192
-
193
- const commentMatch = remaining.match(/^<!--([\s\S]*?)(?:-->|$)/);
194
- if (commentMatch) {
195
- const match = commentMatch[0];
196
- tokens.push({
197
- type: TokenType.COMMENT,
198
- value: match.slice(4, match.endsWith('-->') ? -3 : match.length),
199
- position: calculatePosition(html, currentPos)
200
- });
201
- currentPos += match.length;
202
- continue;
203
- }
204
-
205
- const cdataMatch = remaining.match(/^<!\[CDATA\[([\s\S]*?)\]\]>/);
206
- if (cdataMatch) {
207
- const content = cdataMatch[1];
208
- tokens.push({
209
- type: TokenType.COMMENT,
210
- value: '[CDATA[' + content + ']]',
211
- position: calculatePosition(html, currentPos)
212
- });
213
- currentPos += cdataMatch[0].length;
214
- continue;
215
- }
216
-
217
- const piMatch = remaining.match(/^<\?([^>]*)/);
218
- if (piMatch) {
219
- let consumed = piMatch[0].length;
220
- if (remaining[consumed] === '>') {
221
- consumed++;
222
- }
223
- tokens.push({
224
- type: TokenType.COMMENT,
225
- value: '?' + piMatch[1],
226
- position: calculatePosition(html, currentPos)
227
- });
228
- currentPos += consumed;
229
- continue;
230
- }
231
-
232
- const tagMatch = remaining.match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
233
-
234
- if (tagMatch) {
235
- const fullTag = tagMatch[0];
236
- const tagName = tagMatch[1]?.toLowerCase();
237
-
238
- if (!tagName) {
239
- currentPos++;
240
- continue;
241
- }
242
-
243
- const isClosing = fullTag.startsWith('</');
244
- const isSelfClosing = fullTag.endsWith('/>');
245
-
246
- let attributes: Record<string, string> = {};
247
- if (!isClosing) {
248
- const attrMatch = fullTag.match(/^<[a-zA-Z][^\s/>]*\s+([^>]*?)\/?>$/);
249
- if (attrMatch && attrMatch[1]) {
250
- attributes = parseAttributes(attrMatch[1]);
251
- }
252
- }
253
-
254
- tokens.push({
255
- type: isClosing ? TokenType.TAG_CLOSE : TokenType.TAG_OPEN,
256
- value: tagName,
257
- position: calculatePosition(html, currentPos),
258
- ...(isClosing ? { isClosing: true } : {
259
- attributes,
260
- isSelfClosing
261
- })
262
- });
263
-
264
- currentPos += fullTag.length;
265
-
266
- if (!isClosing && !isSelfClosing && (RAW_TEXT_ELEMENTS.has(tagName) || RCDATA_ELEMENTS.has(tagName))) {
267
- const closeTagPattern = new RegExp(`</${tagName}\\s*>`, 'i');
268
- const restOfHtml = html.slice(currentPos);
269
- const closeMatch = restOfHtml.match(closeTagPattern);
270
-
271
- if (closeMatch && closeMatch.index !== undefined) {
272
- const rawContent = restOfHtml.slice(0, closeMatch.index);
273
- if (rawContent) {
274
- tokens.push({
275
- type: TokenType.TEXT,
276
- value: RCDATA_ELEMENTS.has(tagName) ? decodeEntities(rawContent) : rawContent,
277
- position: calculatePosition(html, currentPos)
278
- });
279
- }
280
- currentPos += rawContent.length;
281
- }
282
- }
283
- } else {
284
- const textStart = currentPos;
285
- currentPos++;
286
-
287
- while (currentPos < html.length && html[currentPos] !== '<') {
288
- currentPos++;
289
- }
290
-
291
- const textContent = html.slice(textStart, currentPos);
292
- if (textContent) {
293
- tokens.push({
294
- type: TokenType.TEXT,
295
- value: decodeEntities(textContent),
296
- position: calculatePosition(html, textStart)
297
- });
298
- }
299
- }
300
- } else {
301
- const textStart = currentPos;
302
-
303
- while (currentPos < html.length && html[currentPos] !== '<') {
304
- currentPos++;
305
- }
306
-
307
- const textContent = html.slice(textStart, currentPos);
308
- if (textContent) {
309
- tokens.push({
310
- type: TokenType.TEXT,
311
- value: decodeEntities(textContent),
312
- position: calculatePosition(html, textStart)
313
- });
314
- }
315
- }
316
- }
317
-
318
- tokens.push({
319
- type: TokenType.EOF,
320
- value: '',
321
- position: calculatePosition(html, html.length)
322
- });
323
-
324
- return tokens;
325
- }
package/tests/selectors.test.ts DELETED
@@ -1,128 +0,0 @@
1
- import { describe, it, expect } from 'bun:test';
2
- import { querySelector, querySelectorAll } from '../src/css-selector';
3
- import { parseHTML } from '../index';
4
- import type { Element, Document } from '../src/dom-simulator';
5
-
6
- describe('CSS Selectors', () => {
7
- const htmlContent = `
8
- <html>
9
- <body>
10
- <p id="intro" class="first">
11
- <span class="highlight">Hello</span>
12
- </p>
13
- <p class="second">World</p>
14
- <div>
15
- <p class="note">Note</p>
16
- </div>
17
- </body>
18
- </html>
19
- `;
20
-
21
- const doc: Document = parseHTML(htmlContent);
22
-
23
- describe('querySelectorAll', () => {
24
- it('should be a function', () => {
25
- expect(typeof querySelectorAll).toBe('function');
26
- });
27
-
28
- it('should find all elements by tag name', () => {
29
- const paragraphs = querySelectorAll(doc, 'p');
30
- expect(paragraphs.length).toBe(3);
31
- expect(paragraphs[0]!.attributes.class).toBe('first');
32
- expect(paragraphs[1]!.attributes.class).toBe('second');
33
- expect(paragraphs[2]!.attributes.class).toBe('note');
34
- });
35
-
36
- it('should find all elements by class name', () => {
37
- const second = querySelectorAll(doc, '.second');
38
- expect(second.length).toBe(1);
39
- expect(second[0]!.tagName).toBe('P');
40
- });
41
- });
42
-
43
- describe('querySelector', () => {
44
- it('should be a function', () => {
45
- expect(typeof querySelector).toBe('function');
46
- });
47
-
48
- it('should find the first element by tag name', () => {
49
- const firstParagraph = querySelector(doc, 'p');
50
- expect(firstParagraph).not.toBeNull();
51
- expect(firstParagraph?.attributes.id).toBe('intro');
52
- });
53
-
54
- it('should find an element by ID', () => {
55
- const intro = querySelector(doc, '#intro');
56
- expect(intro).not.toBeNull();
57
- expect(intro?.tagName).toBe('P');
58
- });
59
-
60
- it('should return null if no element is found', () => {
61
- const nonExistent = querySelector(doc, '#nonexistent');
62
- expect(nonExistent).toBeNull();
63
- });
64
- });
65
-
66
- describe('Element.matches', () => {
67
- it('should match by tag name', () => {
68
- const p = querySelector(doc, 'p');
69
- expect(p?.matches('p')).toBe(true);
70
- expect(p?.matches('div')).toBe(false);
71
- });
72
-
73
- it('should match by id', () => {
74
- const intro = querySelector(doc, '#intro');
75
- expect(intro?.matches('#intro')).toBe(true);
76
- expect(intro?.matches('#other')).toBe(false);
77
- });
78
-
79
- it('should match by class', () => {
80
- const first = querySelector(doc, '.first');
81
- expect(first?.matches('.first')).toBe(true);
82
- expect(first?.matches('.second')).toBe(false);
83
- });
84
-
85
- it('should match by multiple classes', () => {
86
- const doc2 = parseHTML('<div class="foo bar baz">Test</div>');
87
- const div = doc2.querySelector('div');
88
- expect(div?.matches('.foo')).toBe(true);
89
- expect(div?.matches('.bar')).toBe(true);
90
- expect(div?.matches('.foo.bar')).toBe(true);
91
- expect(div?.matches('.foo.baz')).toBe(true);
92
- expect(div?.matches('.foo.bar.baz')).toBe(true);
93
- expect(div?.matches('.foo.missing')).toBe(false);
94
- });
95
-
96
- it('should match by attribute', () => {
97
- const intro = querySelector(doc, '#intro');
98
- expect(intro?.matches('[id]')).toBe(true);
99
- expect(intro?.matches('[id="intro"]')).toBe(true);
100
- expect(intro?.matches('[class]')).toBe(true);
101
- expect(intro?.matches('[title]')).toBe(false);
102
- });
103
-
104
- it('should match complex selectors', () => {
105
- const intro = querySelector(doc, '#intro');
106
- expect(intro?.matches('p#intro')).toBe(true);
107
- expect(intro?.matches('p.first')).toBe(true);
108
- expect(intro?.matches('div#intro')).toBe(false);
109
- });
110
-
111
- it('should match descendant selectors', () => {
112
- const span = querySelector(doc, 'span');
113
- expect(span?.matches('p span')).toBe(true);
114
- expect(span?.matches('body span')).toBe(true);
115
- expect(span?.matches('div span')).toBe(false);
116
- });
117
-
118
- it('should return false for invalid selector', () => {
119
- const p = querySelector(doc, 'p');
120
- expect(p?.matches('')).toBe(false);
121
- });
122
-
123
- it('should work with universal selector', () => {
124
- const p = querySelector(doc, 'p');
125
- expect(p?.matches('*')).toBe(true);
126
- });
127
- });
128
- });