@metaobjectsdev/render 0.9.0 → 0.11.0-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -0
- package/dist/extract/coerce.js +17 -8
- package/dist/extract/coerce.js.map +1 -1
- package/dist/extract/extract.d.ts.map +1 -1
- package/dist/extract/extract.js +35 -9
- package/dist/extract/extract.js.map +1 -1
- package/dist/extract/json-forgiving-reader.d.ts +7 -0
- package/dist/extract/json-forgiving-reader.d.ts.map +1 -1
- package/dist/extract/json-forgiving-reader.js +12 -1
- package/dist/extract/json-forgiving-reader.js.map +1 -1
- package/dist/extract/types.d.ts +19 -0
- package/dist/extract/types.d.ts.map +1 -1
- package/dist/extract/types.js +9 -1
- package/dist/extract/types.js.map +1 -1
- package/dist/extract/xml-forgiving-reader.d.ts +10 -0
- package/dist/extract/xml-forgiving-reader.d.ts.map +1 -1
- package/dist/extract/xml-forgiving-reader.js +96 -11
- package/dist/extract/xml-forgiving-reader.js.map +1 -1
- package/dist/index.d.ts +2 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -2
- package/dist/index.js.map +1 -1
- package/dist/verify.d.ts +24 -0
- package/dist/verify.d.ts.map +1 -1
- package/dist/verify.js +21 -5
- package/dist/verify.js.map +1 -1
- package/package.json +32 -21
- package/src/extract/coerce.ts +17 -8
- package/src/extract/extract.ts +35 -11
- package/src/extract/json-forgiving-reader.ts +12 -2
- package/src/extract/types.ts +24 -1
- package/src/extract/xml-forgiving-reader.ts +99 -12
- package/src/index.ts +4 -0
- package/src/verify.ts +37 -11
package/src/extract/types.ts
CHANGED
|
@@ -85,6 +85,13 @@ export interface FieldSpec {
|
|
|
85
85
|
readonly defaultValue: string | null;
|
|
86
86
|
/** FR-011: resolved enum normalization mode (from `@normalize`; default `"strip"`). */
|
|
87
87
|
readonly normalize: NormalizeMode;
|
|
88
|
+
/**
|
|
89
|
+
* `@xmlText`: this field receives its element's TEXT CONTENT (analogous to JAXB `@XmlValue` /
|
|
90
|
+
* Jackson `@JacksonXmlText` / .NET `[XmlText]`). The extract engine reads it from the
|
|
91
|
+
* `#text` sentinel the lenient XML reader carries when an element has both attributes and a
|
|
92
|
+
* text body, instead of a same-named child. Absent/false for normal fields and for JSON.
|
|
93
|
+
*/
|
|
94
|
+
readonly textContent?: boolean;
|
|
88
95
|
}
|
|
89
96
|
|
|
90
97
|
/**
|
|
@@ -115,6 +122,14 @@ export function scalar(
|
|
|
115
122
|
};
|
|
116
123
|
}
|
|
117
124
|
|
|
125
|
+
/**
 * Build the spec for an `@xmlText` field: a field that is filled from its element's
 * TEXT CONTENT (see {@link FieldSpec.textContent}).
 *
 * The result is exactly what `scalar(name, kind, required)` produces, plus
 * `textContent: true`; the value is still coerced according to `kind`.
 *
 * @param name field name, forwarded to `scalar`
 * @param kind field kind, forwarded to `scalar`
 * @param required required flag, forwarded to `scalar`
 * @returns a new spec object with the `textContent` flag set
 */
export function textContentField(name: string, kind: FieldKind, required: boolean): FieldSpec {
  const base = scalar(name, kind, required);
  return { ...base, textContent: true };
}
|
|
132
|
+
|
|
118
133
|
export function enumField(
|
|
119
134
|
name: string,
|
|
120
135
|
required: boolean,
|
|
@@ -231,16 +246,23 @@ export type OnField = (fieldPath: string, rawValue: string, spec: FieldSpec) =>
|
|
|
231
246
|
/**
 * Bounded runtime override surface. `aliases` and `normalizers` are MERGED with the
 * schema's, the runtime entry winning on a key conflict. `onField` is the single hook.
 *
 * Mirrors Java ExtractOptions.
 */
export interface ExtractOptions {
  /** Tolerance level for the extraction; `defaults()` supplies `Tolerance.NORMAL`. */
  readonly tolerance: Tolerance;

  /** Runtime alias map (string → string), merged over the schema's aliases. */
  readonly aliases: Readonly<Record<string, string>>;

  /** Runtime normalizer functions keyed by string; each receives the raw string value. */
  readonly normalizers: Readonly<Record<string, (raw: string) => unknown | null>>;

  /** Optional per-field hook; `null` when no hook is installed. */
  readonly onField: OnField | null;

  /**
   * XML only. When `true`, the input has NO enclosing root element: the payload's fields
   * ARE the top-level elements (a flat sequence like `<a>..</a><b>..</b>`). The engine
   * parses those top-level elements directly instead of locating a `<rootName>` span, so
   * the caller need not synthesize a wrapper. No effect for JSON. Default `false`
   * (a single root element is expected).
   */
  readonly rootless: boolean;
}
|
|
241
263
|
|
|
242
264
|
/**
 * Produce a fresh options object carrying the default value of every setting:
 * normal tolerance, no aliases, no normalizers, no `onField` hook, and a rooted
 * (non-rootless) input. A new object is created on every call.
 */
export function defaults(): ExtractOptions {
  const options: ExtractOptions = {
    tolerance: Tolerance.NORMAL,
    aliases: {},
    normalizers: {},
    onField: null,
    rootless: false,
  };
  return options;
}
|
|
245
267
|
|
|
246
268
|
/** Normalize a partial / undefined options bag into a complete ExtractOptions. */
|
|
@@ -251,6 +273,7 @@ export function normalizeOptions(opts?: Partial<ExtractOptions> | null): Extract
|
|
|
251
273
|
aliases: opts.aliases == null ? {} : { ...opts.aliases },
|
|
252
274
|
normalizers: opts.normalizers == null ? {} : { ...opts.normalizers },
|
|
253
275
|
onField: opts.onField ?? null,
|
|
276
|
+
rootless: opts.rootless ?? false,
|
|
254
277
|
};
|
|
255
278
|
}
|
|
256
279
|
|
|
@@ -1,5 +1,21 @@
|
|
|
1
1
|
// Stage-4 tolerant XML reader for the bounded corpus malformation set. Never throws.
|
|
2
|
-
// Mirrors Java XmlForgivingReader
|
|
2
|
+
// Mirrors Java XmlForgivingReader: maps an element's child elements, text, AND attributes
|
|
3
|
+
// into the field map, and handles self-closing tags (<x a="1"/>). Must not index-out-of-range
|
|
4
|
+
// on a leading close tag.
|
|
5
|
+
//
|
|
6
|
+
// Representation:
|
|
7
|
+
// - text-only element, no attributes → its trimmed text (string) — unchanged
|
|
8
|
+
// - self-closing / attributes-only element → a record of attribute name→value ("" when none)
|
|
9
|
+
// - element with child elements (± attrs) → a record merging attributes + child entries
|
|
10
|
+
// (a child element wins a name collision)
|
|
11
|
+
// - element with text AND attributes → a record of the attributes plus the body text
|
|
12
|
+
// under TEXT_KEY (a scalar consumer unwraps it)
|
|
13
|
+
// - repeated sibling tags → an array (unchanged)
|
|
14
|
+
|
|
15
|
+
/** Reserved key holding an element's own text content when the element is represented as a
|
|
16
|
+
* record (because it also carries attributes). '#' is not a legal XML name char, so it never
|
|
17
|
+
* collides with a real attribute or child-element name. */
|
|
18
|
+
export const TEXT_KEY = "#text";
|
|
3
19
|
|
|
4
20
|
function quote(s: string): string {
|
|
5
21
|
return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
@@ -12,7 +28,11 @@ function matchFrom(source: string, flags: string, text: string, from: number): R
|
|
|
12
28
|
return g.exec(text);
|
|
13
29
|
}
|
|
14
30
|
|
|
15
|
-
|
|
31
|
+
// tag name + everything up to the closing '>' (attributes and/or a trailing '/' for a
|
|
32
|
+
// self-closing tag). Non-greedy so the first '>' closes the open tag.
|
|
33
|
+
const OPEN_TAG_SRC = "<([A-Za-z_][A-Za-z0-9_]*)([^>]*?)>";
|
|
34
|
+
// one attribute: name = "double" | 'single' | bareword.
|
|
35
|
+
const ATTR_SRC = "([A-Za-z_:][A-Za-z0-9_:.\\-]*)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s/>]+))";
|
|
16
36
|
|
|
17
37
|
export function readXml(span: string | null | undefined, caseInsensitive: boolean): Record<string, unknown> {
|
|
18
38
|
const out: Record<string, unknown> = {};
|
|
@@ -25,6 +45,21 @@ export function readXml(span: string | null | undefined, caseInsensitive: boolea
|
|
|
25
45
|
return out;
|
|
26
46
|
}
|
|
27
47
|
|
|
48
|
+
/**
 * Rootless read: hand the WHOLE text to the child-element parser, with no enclosing root
 * element to strip (a flat sequence like `<a>..</a><b>..</b>`). Used for
 * `ExtractOptions.rootless` responses. Mirrors Java readRootless.
 *
 * @param text the raw response text; `null`, `undefined`, or whitespace-only input
 *   yields an empty record
 * @param caseInsensitive forwarded to the child parser as its case-insensitivity flag
 * @returns a new record populated by `parseChildren`
 */
export function readXmlRootless(
  text: string | null | undefined,
  caseInsensitive: boolean,
): Record<string, unknown> {
  const fields: Record<string, unknown> = {};
  if (text != null && text.trim().length > 0) {
    parseChildren(text, caseInsensitive, fields);
  }
  return fields;
}
|
|
62
|
+
|
|
28
63
|
function parseChildren(inner: string, ci: boolean, out: Record<string, unknown>): void {
|
|
29
64
|
const flags = ci ? "i" : "";
|
|
30
65
|
let pos = 0;
|
|
@@ -33,8 +68,19 @@ function parseChildren(inner: string, ci: boolean, out: Record<string, unknown>)
|
|
|
33
68
|
if (m == null) break;
|
|
34
69
|
const tag = m[1] ?? "";
|
|
35
70
|
const key = ci ? tag.toLowerCase() : tag;
|
|
36
|
-
const contentStart = m.index + m[0].length;
|
|
37
71
|
|
|
72
|
+
let rawAttrs = (m[2] ?? "").trim();
|
|
73
|
+
const selfClosing = rawAttrs.endsWith("/");
|
|
74
|
+
if (selfClosing) rawAttrs = rawAttrs.slice(0, -1).trim();
|
|
75
|
+
const attrs = parseAttrs(rawAttrs, ci);
|
|
76
|
+
|
|
77
|
+
if (selfClosing) {
|
|
78
|
+
accumulate(out, key, Object.keys(attrs).length === 0 ? "" : attrs);
|
|
79
|
+
pos = m.index + m[0].length;
|
|
80
|
+
continue;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
const contentStart = m.index + m[0].length;
|
|
38
84
|
const closeRe = `</${quote(tag)}\\s*>`;
|
|
39
85
|
const close = matchFrom(closeRe, flags, inner, contentStart);
|
|
40
86
|
|
|
@@ -44,11 +90,25 @@ function parseChildren(inner: string, ci: boolean, out: Record<string, unknown>)
|
|
|
44
90
|
contentEnd = close.index;
|
|
45
91
|
next = close.index + close[0].length;
|
|
46
92
|
} else {
|
|
47
|
-
// unclosed tag: extract
|
|
93
|
+
// unclosed tag: extract content up to the next sibling open tag.
|
|
48
94
|
const sib = matchFrom(OPEN_TAG_SRC, flags, inner, contentStart);
|
|
49
95
|
if (sib != null) {
|
|
50
|
-
|
|
51
|
-
|
|
96
|
+
// When the unclosed element's content begins IMMEDIATELY with a child open tag
|
|
97
|
+
// (no leading text), that child was almost certainly meant to be NESTED, not a
|
|
98
|
+
// sibling — a common LLM malformation is dropping the parent's close tag while
|
|
99
|
+
// still emitting a real child element (e.g. <check ...><payoff>text). Absorb the
|
|
100
|
+
// remainder of this span as the unclosed element's content so the child nests
|
|
101
|
+
// under it. When there IS leading text before the first child tag (e.g. <t>hi<c>..),
|
|
102
|
+
// keep the sibling split — the leading text is the unclosed element's body and the
|
|
103
|
+
// following tag is its sibling. Mirrors Java XmlForgivingReader.
|
|
104
|
+
const noLeadingText = inner.substring(contentStart, sib.index).trim().length === 0;
|
|
105
|
+
if (noLeadingText) {
|
|
106
|
+
contentEnd = inner.length;
|
|
107
|
+
next = inner.length;
|
|
108
|
+
} else {
|
|
109
|
+
contentEnd = sib.index;
|
|
110
|
+
next = contentEnd;
|
|
111
|
+
}
|
|
52
112
|
} else {
|
|
53
113
|
contentEnd = inner.length;
|
|
54
114
|
next = inner.length;
|
|
@@ -56,16 +116,43 @@ function parseChildren(inner: string, ci: boolean, out: Record<string, unknown>)
|
|
|
56
116
|
}
|
|
57
117
|
|
|
58
118
|
const content = inner.substring(contentStart, contentEnd);
|
|
59
|
-
|
|
60
|
-
accumulate(out, key, value);
|
|
119
|
+
accumulate(out, key, combine(attrs, content, ci));
|
|
61
120
|
pos = next;
|
|
62
121
|
}
|
|
63
122
|
}
|
|
64
123
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
124
|
+
/**
 * Merge an element's attributes with its body.
 *
 * If the body contains a `<` and parsing it yields at least one child entry, the result
 * is a record of the attributes overlaid with the child entries (a child element wins a
 * name collision). Otherwise the body is treated as plain text and delegated to
 * `textValue`.
 *
 * @param attrs the element's already-parsed attributes
 * @param content the raw text between the element's open and close tags
 * @param ci case-insensitivity flag, forwarded to `parseChildren`
 */
function combine(attrs: Record<string, unknown>, content: string, ci: boolean): unknown {
  const mayHoldElements = content.includes("<");
  if (!mayHoldElements) return textValue(attrs, content);

  const children: Record<string, unknown> = {};
  parseChildren(content, ci, children);
  if (Object.keys(children).length === 0) {
    // A '<' was present but nothing parsed as an element: fall back to the text form.
    return textValue(attrs, content);
  }

  // Spread order matters: attributes first, so a same-named child overrides them.
  return { ...attrs, ...children };
}
|
|
136
|
+
|
|
137
|
+
/**
 * Represent a text-bodied element.
 *
 * With no attributes the result is simply the trimmed text. When attributes exist the
 * result is a record of those attributes plus the trimmed text stored under `TEXT_KEY`.
 *
 * @param attrs the element's already-parsed attributes
 * @param content the element's raw body text
 */
function textValue(attrs: Record<string, unknown>, content: string): unknown {
  const body = content.trim();
  const hasAttrs = Object.keys(attrs).length > 0;
  return hasAttrs ? { ...attrs, [TEXT_KEY]: body } : body;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Parse the attribute portion of an open tag into a name → value record.
 *
 * Each attribute must have the form `name = value`, where the value is double-quoted,
 * single-quoted, or a bareword (see `ATTR_SRC`); text that does not match that shape is
 * skipped. When the same name occurs more than once, the FIRST occurrence wins.
 *
 * @param rawAttrs the text between the tag name and the closing `>` (any trailing `/`
 *   already removed by the caller)
 * @param ci when `true`, attribute names are lower-cased before being stored
 * @returns a new plain object; empty when `rawAttrs` is empty or nothing matched
 */
function parseAttrs(rawAttrs: string, ci: boolean): Record<string, unknown> {
  const attrs: Record<string, unknown> = {};
  if (rawAttrs.length === 0) return attrs;

  for (const match of rawAttrs.matchAll(new RegExp(ATTR_SRC, "g"))) {
    const rawName = match[1];
    if (rawName === undefined) continue; // group 1 is mandatory in a match; guards strict TS
    const name = ci ? rawName.toLowerCase() : rawName;
    if (Object.prototype.hasOwnProperty.call(attrs, name)) continue; // first occurrence wins

    // Exactly one of the three value groups participates in a match.
    const val = match[2] ?? match[3] ?? match[4] ?? "";

    // Define an own data property rather than assigning: a plain `attrs[name] = val`
    // with name "__proto__" would hit the inherited accessor, which ignores a string,
    // and the attribute would be silently lost.
    Object.defineProperty(attrs, name, {
      value: val,
      writable: true,
      enumerable: true,
      configurable: true,
    });
  }
  return attrs;
}
|
|
70
157
|
|
|
71
158
|
function accumulate(out: Record<string, unknown>, key: string, value: unknown): void {
|
package/src/index.ts
CHANGED
|
@@ -3,11 +3,14 @@ export { type Provider, InMemoryProvider } from "./provider.js";
|
|
|
3
3
|
export { ESCAPERS, type RenderFormat } from "./escapers.js";
|
|
4
4
|
export {
|
|
5
5
|
verify,
|
|
6
|
+
resolveTemplateVariable,
|
|
7
|
+
parseTemplate,
|
|
6
8
|
ERR_VAR_NOT_ON_PAYLOAD,
|
|
7
9
|
ERR_PARTIAL_UNRESOLVED,
|
|
8
10
|
ERR_REQUIRED_SLOT_UNUSED,
|
|
9
11
|
ERR_OUTPUT_TAG_MISSING,
|
|
10
12
|
type PayloadField,
|
|
13
|
+
type ResolveStack,
|
|
11
14
|
type VerifyError,
|
|
12
15
|
type VerifyOptions,
|
|
13
16
|
} from "./verify.js";
|
|
@@ -21,6 +24,7 @@ export {
|
|
|
21
24
|
Tolerance,
|
|
22
25
|
ExtractionReport,
|
|
23
26
|
scalar,
|
|
27
|
+
textContentField,
|
|
24
28
|
enumField,
|
|
25
29
|
enumArray,
|
|
26
30
|
range,
|
package/src/verify.ts
CHANGED
|
@@ -54,20 +54,34 @@ const MAX_DEPTH = 32;
|
|
|
54
54
|
|
|
55
55
|
// A Mustache parse token: [type, value, start, end, subTokens?, ...].
|
|
56
56
|
type Token = readonly unknown[];
|
|
57
|
-
|
|
58
|
-
|
|
57
|
+
/**
 * The context stack used for variable resolution. The innermost context is the LAST
 * entry, mirroring Mustache lookup order.
 *
 * Generic over the field node so consumers (e.g. the docs annotator) can resolve an
 * ENRICHED field tree (carrying owner/type metadata) through the exact same walk that
 * verify uses, which keeps the two surfaces in agreement.
 */
export type ResolveStack<F extends PayloadField = PayloadField> = ReadonlyArray<F[]>;
|
|
59
64
|
|
|
60
|
-
function find(fields:
|
|
65
|
+
/**
 * Look up a field by exact name within one context level.
 *
 * The list is only read, never mutated, so it is accepted as `readonly`; both mutable
 * and readonly arrays may be passed.
 *
 * @param fields the fields of a single context level
 * @param name the field name to match (strict equality on `name`)
 * @returns the first field whose `name` equals `name`, or `undefined` if none does
 */
function find<F extends PayloadField>(fields: readonly F[], name: string): F | undefined {
  return fields.find((f) => f.name === name);
}
|
|
63
68
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
+
/**
|
|
70
|
+
* Resolve a (possibly dotted) variable path the way Mustache does: the FIRST
|
|
71
|
+
* segment is looked up through the context stack (innermost → outermost); each
|
|
72
|
+
* remaining segment is a direct descent into the resolved field's `fields`.
|
|
73
|
+
* Returns the resolved field, or undefined if any segment is missing.
|
|
74
|
+
*
|
|
75
|
+
* EXPORTED so the docs annotator can share this ONE resolution (annotator ⇆
|
|
76
|
+
* verify must agree). Generic over the node type: an enriched tree resolves the
|
|
77
|
+
* same way, since only `name`/`fields` drive the walk.
|
|
78
|
+
*/
|
|
79
|
+
export function resolveTemplateVariable<F extends PayloadField>(
|
|
80
|
+
stack: ResolveStack<F>,
|
|
81
|
+
path: string,
|
|
82
|
+
): F | undefined {
|
|
69
83
|
const segs = path.split(".");
|
|
70
|
-
let current:
|
|
84
|
+
let current: F | undefined;
|
|
71
85
|
for (let i = stack.length - 1; i >= 0; i--) {
|
|
72
86
|
const hit = find(stack[i]!, segs[0]!);
|
|
73
87
|
if (hit) {
|
|
@@ -76,15 +90,27 @@ function resolve(stack: Stack, path: string): PayloadField | undefined {
|
|
|
76
90
|
}
|
|
77
91
|
}
|
|
78
92
|
for (let i = 1; current && i < segs.length; i++) {
|
|
79
|
-
current = current.fields ? find(current.fields, segs[i]!) : undefined;
|
|
93
|
+
current = current.fields ? (find(current.fields, segs[i]!) as F | undefined) : undefined;
|
|
80
94
|
}
|
|
81
95
|
return current;
|
|
82
96
|
}
|
|
83
97
|
|
|
98
|
+
// Internal alias preserving the original call sites unchanged.
|
|
99
|
+
const resolve = resolveTemplateVariable;
|
|
100
|
+
|
|
84
101
|
/**
 * Tokenize a template with Mustache's own parser and view the result as the local
 * `Token` tuple type (`[type, value, start, end, subTokens?, ...]`).
 */
function parse(text: string): Token[] {
  const tokens: unknown = Mustache.parse(text);
  return tokens as Token[];
}
|
|
87
104
|
|
|
105
|
+
/**
 * Parse a template into Mustache tokens (`[type, value, start, end, subTokens?]`).
 *
 * This is the SAME parse that verify walks; it is exported so the docs annotator can
 * tokenize through one parser instead of re-tokenizing on its own.
 *
 * @param text the template source
 * @returns the token list produced by the internal `parse`, typed as readonly
 */
export function parseTemplate(text: string): readonly (readonly unknown[])[] {
  const tokens: readonly Token[] = parse(text);
  return tokens;
}
|
|
113
|
+
|
|
88
114
|
// An opening tag is `<tag` immediately followed by `>` or XML whitespace, so
|
|
89
115
|
// attributes are allowed (`<answer foo="1">`) but a longer name is not over-matched
|
|
90
116
|
// (`<answers>` does not satisfy `answer`).
|
|
@@ -124,7 +150,7 @@ export function verify(
|
|
|
124
150
|
// (no second resolution pass).
|
|
125
151
|
const staticTexts: string[] = [templateText];
|
|
126
152
|
|
|
127
|
-
function walk(tokens: Token[], stack:
|
|
153
|
+
function walk(tokens: Token[], stack: ResolveStack, seen: readonly string[]): void {
|
|
128
154
|
const atRoot = stack.length === 1 && stack[0] === root;
|
|
129
155
|
for (const tok of tokens) {
|
|
130
156
|
const type = tok[0] as string;
|