@lowlighter/xml 6.0.1 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/parse.ts DELETED
@@ -1,442 +0,0 @@
1
- // Imports
2
- import { initSync, JsReader, source, Token, tokenize } from "./wasm_xml_parser/wasm_xml_parser.js"
3
- import type { record, rw } from "@libs/typing"
4
- import type { Nullable, ReaderSync, xml_document, xml_node, xml_text } from "./_types.ts"
5
- export type { Nullable, ReaderSync, xml_document, xml_node, xml_text }
6
- initSync(source())
7
-
8
/** Settings accepted by the XML parser. */
export type options = {
  /** Parts of the document that are stripped from the result. */
  clean?: {
    /** Strip attributes. */
    attributes?: boolean
    /** Strip comments. */
    comments?: boolean
    /** Strip the XML doctype. */
    doctype?: boolean
    /** Strip XML processing instructions. */
    instructions?: boolean
  }
  /** Simplifications applied to a node depending on what it contains. */
  flatten?: {
    /** A node holding nothing but attributes (keys starting with `@`) becomes a regular object whose keys lose the `@` prefix. */
    attributes?: boolean
    /** A node holding nothing but a `#text` value becomes that value (defaults to `true`). */
    text?: boolean
    /** A node holding neither attributes nor text becomes `null` (defaults to `true`). */
    empty?: boolean
  }
  /** Conversions applied to raw textual values. */
  revive?: {
    /**
     * Trim texts (runs before every other revival, defaults to `true`).
     * The `xml:space="preserve"` attribute is honored.
     */
    trim?: boolean
    /**
     * Unescape XML entities, replacing common entities with the characters they stand for (defaults to `true`).
     */
    entities?: boolean
    /** Convert booleans (values matching `/^(?:[Tt]rue|[Ff]alse)$/`). */
    booleans?: boolean
    /**
     * Convert finite numbers.
     * The version of the XML prolog is always kept as a string to avoid breaking documents.
     */
    numbers?: boolean
    /**
     * User-supplied reviver (runs after every other revival).
     * For an attribute, `key` and `value` are provided.
     * For a node, both `key` and `value` are `null`.
     * Returning `undefined` deletes the attribute or the tag.
     */
    custom?: (args: { name: string; key: Nullable<string>; value: Nullable<string>; node: Readonly<xml_node> }) => unknown
  }
  /**
   * Parsing mode.
   * `html` is more permissive and does not throw on some invalid XML syntax:
   * mainly, unquoted attributes are supported and tags that are not properly closed are accepted.
   */
  mode?: "xml" | "html"
}
64
-
65
/**
 * Parse a XML string into an object.
 *
 * Output (cleaning, flattening, reviving, etc.) can be customized using the {@link options} parameter.
 * The passed `options` object is never modified: defaults are resolved on an internal copy.
 *
 * Unless flattened, output nodes will contain the following non-enumerable properties (which mean they're not "visible" when iterating over, but are still explicitly accessible):
 * - General properties
 *   - `readonly ["~name"]: string`: tag name
 *   - `readonly ["~parent"]: xml_node|null`: parent node
 *   - `["#text"]?: string`: text content
 * - Node properties
 *   - `readonly ["~children"]: Array<xml_node|xml_text>`: node children
 *   - `readonly ["#comments"]?: Array<string>`: node comments
 *   - `readonly ["#text"]?: string`: concatenated children text content, this property becomes enumerable if at least one non-empty text node is present
 * - XML document properties
 *   - `["#doctype"]?: xml_node`: XML doctype
 *   - `["#instructions"]?: { [key:string]: xml_node| Array<xml_node> }`: XML processing instructions
 *
 * Attributes are prefixed with an arobase (`@`).
 *
 * You can also pass an object that implement {@link ReaderSync} instead of a string.
 *
 * @throws {SyntaxError} When the tokenizer reports an error, when several root nodes are found, or when the document is empty.
 * @throws {EvalError} When the WASM tokenizer fails before having recorded any state.
 *
 * @example
 * ```ts
 * import { parse } from "./parse.ts"
 *
 * console.log(parse(
 * `
 *   <root>
 *     <!-- This is a comment -->
 *     <text>hello</text>
 *     <array>world</array>
 *     <array>monde</array>
 *     <array>世界</array>
 *     <array>🌏</array>
 *     <number>42</number>
 *     <boolean>true</boolean>
 *     <complex attribute="value">content</complex>
 *   </root>
 * `))
 * ```
 *
 * @example
 * ```ts
 * import { parse } from "./parse.ts"
 *
 * using file = await Deno.open("bench/assets/small.xml")
 * console.log(parse(file))
 * ```
 *
 * @module
 */
export function parse(content: string | ReaderSync, options?: options): xml_document {
  const xml = xml_node("~xml") as xml_document
  const stack = [xml] as Array<xml_node>
  const tokens = [] as Array<[number, string, string?]>
  const states = [] as Array<[number, number]>
  const flags = { root: false }
  try {
    const reader = new JsReader(new TextEncoder().encode(content as string), typeof content === "object" ? content : undefined)
    tokenize(reader, tokens, states, options?.mode === "html")
  } catch (error) {
    // A failure while parsing an attribute is reported as a regular syntax error below
    if (states.at(-1)?.[0] === Token.StateParseAttribute) {
      tokens.push([Token.Error, `Failed to parse attribute around position ${states.at(-1)![1]}`])
    }
    // NOTE(review): failures raised once states have been recorded (other than attribute parsing) are not rethrown — confirm this is intended
    if (!states.length) {
      throw new EvalError(`WASM XML parser crashed: ${error}`)
    }
  }
  const errors = tokens.find(([token]) => token === Token.Error)
  if (errors) {
    throw new SyntaxError(`Malformed XML document: ${errors[1]}`)
  }
  // Resolve defaults on a copy so that the object supplied by the caller is left untouched
  const settings: options = {
    ...options,
    revive: { ...options?.revive, trim: options?.revive?.trim ?? true, entities: options?.revive?.entities ?? true },
    flatten: { ...options?.flatten, text: options?.flatten?.text ?? true, empty: options?.flatten?.empty ?? true },
  }
  for (const [token, name, value = name] of tokens) {
    switch (token) {
      // XML declaration
      case Token.XMLDeclaration: {
        // https://www.w3.org/TR/REC-xml/#NT-VersionNum
        const version = value.match(/version\s*=\s*(["'])(?<version>1\.\d+)(\1)/)?.groups?.version
        if (version) {
          xml["@version"] = version as typeof xml["@version"]
        }
        // https://www.w3.org/TR/REC-xml/#NT-EncodingDecl
        const encoding = value.match(/encoding\s*=\s*(["'])(?<encoding>[A-Za-z][-\w.]*)(\1)/)?.groups?.encoding
        if (encoding) {
          xml["@encoding"] = encoding as typeof xml["@encoding"]
        }
        // https://www.w3.org/TR/REC-xml/#NT-SDDecl
        const standalone = value.match(/standalone\s*=\s*(["'])(?<standalone>yes|no)(\1)/)?.groups?.standalone
        if (standalone) {
          xml["@standalone"] = standalone as typeof xml["@standalone"]
        }
        break
      }
      // XML Doctype definition
      case Token.XMLDoctype: {
        xml["#doctype"] = Object.assign(xml_node("~doctype", { parent: xml }), xml_doctype(value))
        break
      }
      // XML processing instruction
      case Token.XMLInstruction: {
        const [target, ...raw] = value.split(" ")
        const instruction = Object.assign(xml_node(target, { parent: xml }), xml_attributes(raw.join(" ")))
        xml["#instructions"] ??= {}
        xml_append(xml["#instructions"], target, instruction)
        break
      }
      // XML tag opened
      case Token.TagOpen: {
        if (stack.length === 1) {
          if (flags.root) {
            throw new SyntaxError("Multiple root node detected")
          }
          flags.root = true
        }
        const parent = stack.at(-1)!
        const node = xml_node(name, { parent })
        xml_append(parent, node["~name"], node)
        stack.push(node)
        break
      }
      // XML tag closed
      case Token.TagClose: {
        stack.pop()
        break
      }
      // XML attribute
      case Token.TagAttribute: {
        stack.at(-1)![`@${name}`] = value
        break
      }
      // Text
      case Token.Text: {
        xml_text(value, { type: "~text", parent: stack.at(-1)! })
        break
      }
      // CDATA
      case Token.CData: {
        xml_text(value, { type: "~cdata", parent: stack.at(-1)! })
        break
      }
      // Comment
      case Token.Comment: {
        xml_text(value, { type: "~comment", parent: stack.at(-1)! })
        break
      }
    }
  }
  if (!Object.keys(xml).length) {
    throw new SyntaxError("Malformed XML document: empty document or no root node detected")
  }
  return postprocess(xml, settings) as xml_document
}

/**
 * Register `node` under `key` in `container`: the first occurrence is stored as is, later occurrences turn the entry into an array.
 * Only own properties are considered (so tags named like `Object.prototype` members, e.g. `constructor`, are handled like any other tag),
 * and the entry is defined rather than assigned so that a `__proto__` key cannot alter the prototype of the container.
 */
function xml_append(container: object, key: string, node: xml_node): void {
  const entries = container as Record<string, unknown>
  const existing = Object.hasOwn(entries, key) ? entries[key] : undefined
  if (Array.isArray(existing)) {
    existing.push(node)
    return
  }
  const value = existing !== undefined ? [existing, node] : node
  Object.defineProperty(entries, key, { enumerable: true, configurable: true, writable: true, value })
}
243
-
244
/**
 * Parse xml attributes.
 *
 * Extracts every `name="value"` or `name='value'` pair found in `raw` and returns them keyed by `@name`.
 * Whitespace is accepted around the `=` sign, and values may span several lines.
 * When a name is repeated, the last value wins.
 */
function xml_attributes(raw: string) {
  const attributes = {} as record<string>
  // Groups: 1 = name, 2 = opening quote, 3 = value (anything up to the matching quote, newlines included)
  for (const [, name, , value] of raw.matchAll(/([A-Za-z_][-\w.:]*)\s*=\s*(["'])([\s\S]*?)\2/g)) {
    attributes[`@${name}`] = value
  }
  return attributes
}
252
-
253
/**
 * Parse xml doctype.
 *
 * The part located before the optional `[...]` section is split into quoted strings and bare words, each stored as an empty `@` attribute.
 * Inside the `[...]` section, every `<!ELEMENT name (content)>` declaration is stored as `name: content`.
 */
function xml_doctype(raw: string) {
  const doctype = {} as xml_node
  const groups = raw.match(/^(?<attributes>[^\[]*)(?:\[(?<elements>[\s\S]*)\])?/)?.groups!
  const head = groups.attributes
  const subset = groups.elements ?? ""
  // Quoted identifiers first (they may contain spaces), then whatever bare words are left
  let rest = raw.replace(`[${subset}]`, "")
  for (const quoted of head.matchAll(/(["'])(?<name>(?:(?!\1).)*)(\1)/g)) {
    doctype[`@${quoted[2]}`] = ""
    rest = rest.replace(quoted[0], "")
  }
  for (const word of rest.split(/\s+/)) {
    if (word) {
      doctype[`@${word}`] = ""
    }
  }
  // Element declarations
  for (const declaration of subset.matchAll(/<!ELEMENT\s+(?<name>\w+)\s+\((?<value>[^\)]+)\)/g)) {
    doctype[declaration[1]] = declaration[2]
  }
  return doctype
}
270
-
271
/**
 * Create a new text node.
 *
 * The node exposes its `type` as `~name` and its `parent` as `~parent` (both hidden and read-only), and holds `value` in `#text`.
 * When a parent is given, the node is appended to the children of that parent.
 */
function xml_text(value: string, { type = "~text" as "~text" | "~cdata" | "~comment", parent = null as Nullable<xml_node> } = {}): xml_text {
  const leaf = {} as xml_text
  Object.defineProperty(leaf, "~parent", { value: parent, writable: false, enumerable: false })
  Object.defineProperty(leaf, "~name", { value: type, writable: false, enumerable: false })
  Object.defineProperty(leaf, "#text", { value, writable: true, enumerable: true, configurable: true })
  if (!parent) {
    return leaf
  }
  parent["~children"].push(leaf)
  return leaf
}
283
-
284
/**
 * Create a new node.
 *
 * The node carries the following hidden (non-enumerable) properties:
 * - `~parent`: the given parent (or `null`), read-only
 * - `~name`: the given name, read-only
 * - `~children`: the list of child nodes, initially empty
 * - `#text`: a getter concatenating the text of every non-comment child
 * - `#comments`: a getter listing the text of every comment child
 *
 * When a parent is given, the node is appended to the children of that parent.
 */
function xml_node(name: string, { parent = null as Nullable<xml_node> } = {}): xml_node {
  // The `#text` of a child is not guaranteed to be a string: once post-processed it may hold a revived number or boolean.
  // It is therefore always converted to a string, so that `0` or `false` are kept and string methods can be used safely.
  const text_of = (child: { "#text"?: unknown }): string => {
    const text = child["#text"]
    return ((text === undefined) || (text === null)) ? "" : `${text}`
  }
  const node = Object.defineProperties({}, {
    ["~parent"]: { enumerable: false, writable: false, value: parent },
    ["~name"]: { enumerable: false, writable: false, value: name },
    ["~children"]: { enumerable: false, writable: true, value: [] },
    ["#text"]: {
      enumerable: false,
      configurable: true,
      get(this: xml_node) {
        const children = this["~children"].filter((node) => node["~name"] !== "~comment")
        // If xml:space is not set to "preserve", concatenate texts with a single space while removing empty ones
        if (this["@xml:space"] !== "preserve") {
          return children.map(text_of).filter((text) => text.length).join(" ")
        }
        // If xml:space is set to "preserve", concatenate texts as is
        // In case of mixed content (a text next to an element), add a space between them unless one is already present
        let text = ""
        let previous = null as null | { leaf: boolean; text: string }
        for (const child of children) {
          const current = { leaf: child["~name"].startsWith("~"), text: text_of(child) }
          const spaced = (previous !== null) && (previous.leaf !== current.leaf) && (!previous.text.endsWith(" ")) && (!current.text.startsWith(" "))
          text += `${spaced ? " " : ""}${current.text}`
          previous = current
        }
        return text
      },
    },
    ["#comments"]: {
      enumerable: false,
      configurable: true,
      get(this: xml_node) {
        return this["~children"].filter((node) => node["~name"] === "~comment").map((node) => node["#text"]!)
      },
    },
  }) as xml_node
  if (parent) {
    parent["~children"].push(node)
  }
  return node
}
322
-
323
/**
 * Post-process xml node.
 *
 * Applies, in this order: cleaning (doctype, instructions, comments, attributes), revival of attributes,
 * recursive processing of child nodes, revival of the node text, the custom reviver and finally flattening.
 *
 * Child nodes removed by the custom reviver are dropped from their parent, including when they are stored in an array
 * (an array left without any child is removed altogether).
 * The `xml:space="preserve"` attribute is honored even when attributes are cleaned.
 *
 * @returns The processed node, its flattened replacement (text value, `null` or `""`), or `undefined` when the custom reviver removed it.
 */
function postprocess(node: xml_node, options: options) {
  // Read whitespace handling once: the attribute carrying it may be removed while cleaning attributes
  const preserve = node["@xml:space"] === "preserve"
  // Clean XML document if required
  if (node["~name"] === "~xml") {
    if (options?.clean?.doctype) {
      delete node["#doctype"]
    }
    if (options?.clean?.instructions) {
      ;(node as rw)["~children"] = node["~children"].filter((child) => !(child["~name"] in ((node as xml_document)["#instructions"] ?? {})))
      delete node["#instructions"]
    }
  }
  // Clean node and enable enumerable properties if required
  if (node["~children"]) {
    if (options?.clean?.comments) {
      ;(node as rw)["~children"] = node["~children"].filter((child) => child["~name"] !== "~comment")
    }
    if (options?.revive?.trim) {
      for (const child of node["~children"]) {
        if (/^~(?:text|cdata|comment)$/.test(child["~name"])) {
          ;(child as rw)["#text"] = revive(child, "#text", { revive: { trim: !preserve } })
        }
      }
    }
    // Text becomes visible as soon as one text child is not empty (whitespace-only texts count only when preserved)
    if (node["~children"].some((child) => (/^~(?:text|cdata)$/.test(child["~name"])) && ((preserve ? child["#text"].length : child["#text"].trim().length) > 0))) {
      Object.defineProperty(node, "#text", { enumerable: true, configurable: true })
    }
    if (node["~children"].some((child) => child["~name"] === "~comment")) {
      Object.defineProperty(node, "#comments", { enumerable: true, configurable: true })
    }
  }
  // Process child nodes
  for (const [key, value] of Object.entries(node)) {
    // Skip comments
    if (key === "#comments") {
      continue
    }
    // Clean attributes if required
    if ((options?.clean?.attributes) && (key.startsWith("@"))) {
      // `xml:space` is still needed to compute the node text below, its removal is postponed
      if (key !== "@xml:space") {
        delete node[key]
      }
      continue
    }
    // Revive attribute value if required
    if (key.startsWith("@")) {
      node[key] = revive(node, key, options)
      if (node[key] === undefined) {
        delete node[key]
      }
      continue
    }
    // Handle other nodes
    if (Array.isArray(value)) {
      // Children for which the custom reviver returned `undefined` are removed rather than left as holes
      const children = value.map((child) => postprocess(child, options)).filter((child) => child !== undefined)
      if (!children.length) {
        delete node[key]
        continue
      }
      node[key] = Object.defineProperties(children, {
        ["~parent"]: { enumerable: false, writable: false, value: node },
        ["~name"]: { enumerable: false, writable: false, value: key },
      })
    } else if ((typeof value === "object") && value) {
      node[key] = postprocess(value as xml_node, options)
    }
    if (node[key] === undefined) {
      delete node[key]
    }
  }
  // Revive text if required
  if (Object.keys(node).includes("#text")) {
    const _options = { ...options, revive: { ...options?.revive, trim: (options?.revive?.trim) && (!preserve) } }
    Object.defineProperty(node, "#text", { enumerable: true, configurable: true, value: revive(node, "#text", _options) })
  }
  // Complete attributes cleaning now that the text has been computed
  if (options?.clean?.attributes) {
    delete node["@xml:space"]
  }
  const keys = Object.keys(node)
  // Custom revival if required
  if (options?.revive?.custom) {
    if (options.revive.custom({ name: node["~name"], key: null, value: null, node: node as xml_node }) === undefined) {
      return undefined
    }
  }
  // Flatten object if required
  if ((options?.flatten?.text) && (keys.length === 1) && (keys.includes("#text"))) {
    return node["#text"]
  }
  if ((options?.flatten?.attributes) && (keys.length) && (keys.every((key) => key.startsWith("@")))) {
    for (const key of keys) {
      node[key.slice(1)] = node[key]
      delete node[key]
    }
    return node
  }
  if (!keys.length) {
    return (options?.flatten?.empty) ? null : (options?.flatten?.text) ? "" : Object.defineProperty(node, "#text", { enumerable: true, configurable: true, value: "" })
  }
  return node
}
410
-
411
/** Predefined XML entities, mapped to the character they stand for. */
const entities = {
  "&lt;": "<",
  "&gt;": ">",
  "&apos;": "'",
  "&quot;": '"',
  "&amp;": "&",
} as const

/**
 * Revive value.
 *
 * Reads `node[key]` and applies the revivals enabled in `options.revive`, in this order:
 * - `trim`: surrounding whitespace is removed
 * - `entities`: numeric character references (`&#65;`, `&#x1F600;`) and predefined entities are replaced in a single pass,
 *   so the output of a replacement is never unescaped a second time (`&#38;lt;` gives `&lt;`)
 * - `numbers`: non-blank texts representing a finite number are converted, except for the version of the XML prolog
 * - `booleans`: texts matching `/^(?:[Tt]rue|[Ff]alse)$/` are converted
 * - `custom`: the custom reviver is called with the revived value and its result is returned as is
 *
 * Character references that are not valid Unicode code points are left untouched.
 */
function revive(node: xml_node | xml_text, key: string, options: options) {
  let text = (node as xml_node)[key] as string
  if (options?.revive?.trim) {
    text = text.trim()
  }
  if (options?.revive?.entities) {
    // Groups: 1 = hexadecimal code, 2 = decimal code (none of them for predefined entities)
    text = text.replace(/&(?:#x([0-9A-Fa-f]+)|#(\d+)|(?:lt|gt|apos|quot|amp));/g, (match: string, hex?: string, decimal?: string) => {
      if ((hex === undefined) && (decimal === undefined)) {
        return entities[match as keyof typeof entities]
      }
      const code = hex !== undefined ? Number.parseInt(hex, 16) : Number.parseInt(decimal as string, 10)
      // `String.fromCodePoint` (unlike `String.fromCharCode`) supports characters outside the basic multilingual plane
      return code <= 0x10FFFF ? String.fromCodePoint(code) : match
    })
  }
  let value: unknown = text
  if ((options?.revive?.numbers) && (text.trim().length) && (Number.isFinite(Number(text))) && (!((node["~name"] === "~xml") && (key === "@version")))) {
    // Blank texts are excluded above because `Number("  ")` is `0`
    value = Number(text)
  } else if ((options?.revive?.booleans) && (/^(?:[Tt]rue|[Ff]alse)$/.test(text))) {
    value = /^[Tt]rue$/.test(text)
  }
  if (options?.revive?.custom) {
    // The custom reviver receives the value as revived so far (it may already be a number or a boolean)
    return options.revive.custom({ name: node["~name"], key, value: value as string, node: node as xml_node })
  }
  return value
}