@metaobjectsdev/render 0.8.1-rc.1 → 0.9.0-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/email-document.d.ts +7 -0
- package/dist/email-document.d.ts.map +1 -0
- package/dist/email-document.js +2 -0
- package/dist/email-document.js.map +1 -0
- package/dist/extract/coerce.d.ts +15 -0
- package/dist/extract/coerce.d.ts.map +1 -0
- package/dist/{recover → extract}/coerce.js +87 -13
- package/dist/extract/coerce.js.map +1 -0
- package/dist/{recover/recover-map.d.ts → extract/extract-map.d.ts} +1 -1
- package/dist/{recover/recover-map.d.ts.map → extract/extract-map.d.ts.map} +1 -1
- package/dist/{recover/recover-map.js → extract/extract-map.js} +3 -3
- package/dist/{recover/recover-map.js.map → extract/extract-map.js.map} +1 -1
- package/dist/extract/extract.d.ts +4 -0
- package/dist/extract/extract.d.ts.map +1 -0
- package/dist/extract/extract.js +157 -0
- package/dist/extract/extract.js.map +1 -0
- package/dist/{recover → extract}/json-forgiving-reader.d.ts.map +1 -1
- package/dist/{recover → extract}/json-forgiving-reader.js +1 -1
- package/dist/{recover → extract}/json-forgiving-reader.js.map +1 -1
- package/dist/{recover → extract}/locate.d.ts.map +1 -1
- package/dist/{recover → extract}/locate.js.map +1 -1
- package/dist/extract/normalize.d.ts +4 -0
- package/dist/extract/normalize.d.ts.map +1 -0
- package/dist/extract/normalize.js +22 -0
- package/dist/extract/normalize.js.map +1 -0
- package/dist/extract/strip.d.ts.map +1 -0
- package/dist/{recover → extract}/strip.js.map +1 -1
- package/dist/extract/types.d.ts +160 -0
- package/dist/extract/types.d.ts.map +1 -0
- package/dist/extract/types.js +221 -0
- package/dist/extract/types.js.map +1 -0
- package/dist/{recover → extract}/xml-forgiving-reader.d.ts.map +1 -1
- package/dist/{recover → extract}/xml-forgiving-reader.js +1 -1
- package/dist/{recover → extract}/xml-forgiving-reader.js.map +1 -1
- package/dist/index.d.ts +4 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -4
- package/dist/index.js.map +1 -1
- package/dist/prompt/output-format-renderer.d.ts.map +1 -1
- package/dist/prompt/output-format-renderer.js +113 -59
- package/dist/prompt/output-format-renderer.js.map +1 -1
- package/dist/prompt/output-format-spec.d.ts +1 -1
- package/dist/prompt/prompt-field.d.ts +1 -1
- package/package.json +1 -1
- package/src/email-document.ts +6 -0
- package/src/extract/KNOWN_GAPS.md +59 -0
- package/src/extract/coerce.ts +224 -0
- package/src/{recover/recover-map.ts → extract/extract-map.ts} +2 -2
- package/src/extract/extract.ts +187 -0
- package/src/{recover → extract}/json-forgiving-reader.ts +1 -1
- package/src/extract/normalize.ts +23 -0
- package/src/extract/types.ts +346 -0
- package/src/{recover → extract}/xml-forgiving-reader.ts +1 -1
- package/src/index.ts +17 -11
- package/src/prompt/output-format-renderer.ts +140 -61
- package/src/prompt/output-format-spec.ts +1 -1
- package/src/prompt/prompt-field.ts +1 -1
- package/dist/recover/coerce.d.ts +0 -5
- package/dist/recover/coerce.d.ts.map +0 -1
- package/dist/recover/coerce.js.map +0 -1
- package/dist/recover/recover.d.ts +0 -4
- package/dist/recover/recover.d.ts.map +0 -1
- package/dist/recover/recover.js +0 -115
- package/dist/recover/recover.js.map +0 -1
- package/dist/recover/strip.d.ts.map +0 -1
- package/dist/recover/types.d.ts +0 -117
- package/dist/recover/types.d.ts.map +0 -1
- package/dist/recover/types.js +0 -124
- package/dist/recover/types.js.map +0 -1
- package/src/recover/KNOWN_GAPS.md +0 -35
- package/src/recover/coerce.ts +0 -141
- package/src/recover/recover.ts +0 -146
- package/src/recover/types.ts +0 -217
- /package/dist/{recover → extract}/json-forgiving-reader.d.ts +0 -0
- /package/dist/{recover → extract}/locate.d.ts +0 -0
- /package/dist/{recover → extract}/locate.js +0 -0
- /package/dist/{recover → extract}/strip.d.ts +0 -0
- /package/dist/{recover → extract}/strip.js +0 -0
- /package/dist/{recover → extract}/xml-forgiving-reader.d.ts +0 -0
- /package/src/{recover → extract}/locate.ts +0 -0
- /package/src/{recover → extract}/strip.ts +0 -0
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
// Public entry point. Runs the staged pipeline; NEVER throws. Mirrors Java Extract.
|
|
2
|
+
|
|
3
|
+
import {
|
|
4
|
+
Format,
|
|
5
|
+
FieldKind,
|
|
6
|
+
FieldExtraction,
|
|
7
|
+
Tolerance,
|
|
8
|
+
normalizeOptions,
|
|
9
|
+
} from "./types.js";
|
|
10
|
+
import type { FieldSpec, ExtractOptions, ExtractionOutcome, ExtractSchema } from "./types.js";
|
|
11
|
+
import { ExtractionReport } from "./types.js";
|
|
12
|
+
import { strip } from "./strip.js";
|
|
13
|
+
import { locateJson, locateXml } from "./locate.js";
|
|
14
|
+
import { readJson, TRUNCATED } from "./json-forgiving-reader.js";
|
|
15
|
+
import { readXml } from "./xml-forgiving-reader.js";
|
|
16
|
+
import { coerceValue, scalarCoerce, MALFORMED } from "./coerce.js";
|
|
17
|
+
|
|
18
|
+
/**
 * Forgiving entry point: pulls the fields described by `schema` out of dirty `text`.
 *
 * Stages, in order: complete the options, run `strip` over the text, locate the JSON or XML
 * span (by `schema.format`), read the span into a raw record with the matching forgiving
 * reader, then classify every schema field against that record. The report is flagged empty
 * when the raw record has no keys and either the stripped text is empty or no span was found.
 *
 * @param text   Raw text to extract from; may be null or undefined.
 * @param schema Format, root name and field descriptors to extract against.
 * @param opts   Optional partial overrides, completed by `normalizeOptions`.
 * @returns The best-effort `data` record together with the per-field `report`.
 *          Documented contract: this function never throws.
 */
export function extract(
  text: string | null | undefined,
  schema: ExtractSchema,
  opts?: Partial<ExtractOptions> | null,
): ExtractionOutcome {
  const options = normalizeOptions(opts);
  const report = new ExtractionReport();
  const data: Record<string, unknown> = {};

  const cleaned = strip(text);
  // Every tolerance other than STRICT matches keys/tags case-insensitively.
  const caseInsensitive = options.tolerance !== Tolerance.STRICT;
  const isJson = schema.format === Format.JSON;

  const span = isJson
    ? locateJson(cleaned)
    : locateXml(cleaned, schema.rootName, caseInsensitive);

  // No span located → nothing to read; every field will be classified as absent.
  let raw: Record<string, unknown> = {};
  if (span != null) {
    raw = isJson ? readJson(span) : readXml(span, caseInsensitive);
  }

  const nothingLocated = cleaned.length === 0 || span == null;
  if (nothingLocated && isEmptyRecord(raw)) {
    report.markEmpty();
  }

  extractFields(schema.fields, raw, "", data, report, options, caseInsensitive);
  return { data, report };
}
|
|
50
|
+
|
|
51
|
+
/**
 * Classifies every field of one schema level against the raw record.
 *
 * For each field exactly one state is recorded in `report` under its dotted path
 * (`prefix.name`), and a value is written to `data[name]` when one is available:
 *
 * - absent + usable `defaultValue` → value filled, a "default" coercion logged, DEFAULTED;
 * - absent otherwise               → LOST_REQUIRED / LOST_OPTIONAL, nothing written;
 * - TRUNCATED sentinel             → MALFORMED, nothing written;
 * - array field                    → each element coerced independently; the successfully
 *                                    coerced elements are always written (even when the field
 *                                    is MALFORMED because some element failed);
 * - list where a single value was expected → MALFORMED, nothing written;
 * - single value                   → coerced; MALFORMED on failure, else EXTRACTED/DEFAULTED.
 *
 * @param fields Field descriptors of this level.
 * @param raw    Raw record produced by the forgiving reader (or a nested object of it).
 * @param prefix Dotted path of the enclosing object; "" at the top level.
 * @param data   Output record, mutated in place.
 * @param report Accumulator for states and coercions, mutated in place.
 * @param o      Completed extract options, passed through to coercion.
 * @param ci     Whether key lookup is case-insensitive.
 */
function extractFields(
  fields: readonly FieldSpec[],
  raw: Record<string, unknown>,
  prefix: string,
  data: Record<string, unknown>,
  report: ExtractionReport,
  o: ExtractOptions,
  ci: boolean,
): void {
  for (const field of fields) {
    const path = prefix.length === 0 ? field.name : `${prefix}.${field.name}`;
    const found = lookup(raw, field.name, ci);

    // ---- Absent field: try the declared @default, otherwise the field is lost. ----
    if (found === undefined) {
      let filled = false;
      if (field.defaultValue != null) {
        // An enum default is its member string as-is; any other kind goes through the pure
        // scalar coerce. A default that does not coerce is treated as "no default".
        const fallback =
          field.kind === FieldKind.ENUM
            ? field.defaultValue
            : scalarCoerce(field.defaultValue, field);
        if (fallback !== MALFORMED) {
          data[field.name] = fallback;
          report.addCoercion({ fieldPath: path, from: "", to: field.defaultValue, kind: "default" });
          report.set(path, FieldExtraction.DEFAULTED);
          filled = true;
        }
      }
      if (!filled) {
        report.set(
          path,
          field.required ? FieldExtraction.LOST_REQUIRED : FieldExtraction.LOST_OPTIONAL,
        );
      }
      continue;
    }

    // ---- Present but garbled (empty / cut-off value). ----
    if (found === TRUNCATED) {
      report.set(path, FieldExtraction.MALFORMED);
      continue;
    }

    // ---- Array field: a lone non-list value counts as a one-element array. ----
    if (field.array) {
      const items: unknown[] = Array.isArray(found) ? found : [found];
      const kept: unknown[] = [];
      // Only enum elements get their own per-index state (tags[0], tags[1], …).
      const perElementStates = field.kind === FieldKind.ENUM;
      let sawMalformed = false;

      for (let i = 0; i < items.length; i += 1) {
        const itemPath = `${path}[${i}]`;
        const value = extractValue(field, items[i], itemPath, report, o, ci);
        if (value === MALFORMED) {
          sawMalformed = true;
          if (perElementStates) report.set(itemPath, FieldExtraction.MALFORMED);
        } else {
          kept.push(value);
          if (perElementStates) report.set(itemPath, classifyCoerced(itemPath, report));
        }
      }

      // Partial extraction: the good elements are kept even when the field is MALFORMED,
      // unlike a MALFORMED scalar, which is left out of `data`.
      data[field.name] = kept;
      report.set(path, sawMalformed ? FieldExtraction.MALFORMED : FieldExtraction.EXTRACTED);
      continue;
    }

    // ---- A list where a single value was expected. ----
    if (Array.isArray(found)) {
      report.set(path, FieldExtraction.MALFORMED);
      continue;
    }

    // ---- Single value. ----
    const value = extractValue(field, found, path, report, o, ci);
    if (value === MALFORMED) {
      report.set(path, FieldExtraction.MALFORMED);
      continue;
    }
    data[field.name] = value;
    // A value reached through a default-class fallback is DEFAULTED rather than EXTRACTED.
    report.set(path, classifyCoerced(path, report));
  }
}
|
|
131
|
+
|
|
132
|
+
/**
 * Classifies a successfully coerced field by the last coercion logged for its path.
 *
 * @param path   Field path whose coercion log entries are inspected.
 * @param report Report holding the coercion log.
 * @returns DEFAULTED when the last entry logged for `path` has kind "coerceDefault" or
 *          "default"; EXTRACTED otherwise, including when nothing was logged for `path`.
 */
function classifyCoerced(path: string, report: ExtractionReport): FieldExtraction {
  const terminalKind = report
    .coercions()
    .filter((entry) => entry.fieldPath === path)
    .map((entry) => entry.kind)
    .pop();

  switch (terminalKind) {
    case "coerceDefault":
    case "default":
      return FieldExtraction.DEFAULTED;
    default:
      return FieldExtraction.EXTRACTED;
  }
}
|
|
144
|
+
|
|
145
|
+
/**
 * Coerces one non-array element.
 *
 * For an OBJECT field the element must be a plain object and the field must carry a nested
 * schema; its fields are then extracted recursively under `path`. For every other kind the
 * element is turned into a string (unchanged when already a string) and handed to
 * `coerceValue`.
 *
 * @returns The coerced value (a fresh record for OBJECT fields), or MALFORMED when an object
 *          was expected but something else was present, or when `coerceValue` reports failure.
 */
function extractValue(
  f: FieldSpec,
  present: unknown,
  path: string,
  report: ExtractionReport,
  o: ExtractOptions,
  ci: boolean,
): unknown | typeof MALFORMED {
  if (f.kind !== FieldKind.OBJECT) {
    const text = typeof present === "string" ? present : stringifyScalar(present);
    return coerceValue(text, f, o, path, report);
  }

  // Object expected: anything that is not a plain object (or a missing nested schema) fails.
  if (f.nested == null || !isPlainObject(present)) {
    return MALFORMED;
  }

  const child: Record<string, unknown> = {};
  extractFields(f.nested.fields, present as Record<string, unknown>, path, child, report, o, ci);
  return child;
}
|
|
165
|
+
|
|
166
|
+
/**
 * Looks up `name` in `raw`, honoring the case-sensitivity setting.
 *
 * An exact own-property match always wins. Only when there is none and `ci` is true, the
 * first own enumerable key that equals `name` after lower-casing both sides is used.
 *
 * @returns The stored value, or `undefined` when the key is absent.
 */
function lookup(raw: Record<string, unknown>, name: string, ci: boolean): unknown {
  if (Object.prototype.hasOwnProperty.call(raw, name)) {
    return raw[name];
  }
  if (!ci) {
    return undefined;
  }
  const wanted = name.toLowerCase();
  const match = Object.keys(raw).find((key) => key.toLowerCase() === wanted);
  return match === undefined ? undefined : raw[match];
}
|
|
175
|
+
|
|
176
|
+
/**
 * Type guard: true for any non-null, non-array value whose `typeof` is "object".
 *
 * Declared as a type predicate so callers get `Record<string, unknown>` by narrowing
 * instead of needing an unchecked `as` cast. The runtime check is unchanged.
 */
function isPlainObject(o: unknown): o is Record<string, unknown> {
  return typeof o === "object" && o !== null && !Array.isArray(o);
}
|
|
179
|
+
|
|
180
|
+
/** True when the record has no own enumerable string keys. */
function isEmptyRecord(o: Record<string, unknown>): boolean {
  const ownKeys = Object.keys(o);
  return ownKeys.length === 0;
}
|
|
183
|
+
|
|
184
|
+
/**
 * Converts a non-string value from the forgiving reader to its string form via `String(...)`
 * (intended to mirror Java's `String.valueOf`).
 *
 * NOTE(review): a plain object passed here becomes "[object Object]" — confirm callers only
 * hand over scalars.
 */
function stringifyScalar(v: unknown): string {
  const text = String(v);
  return text;
}
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
// Mirrors Java JsonForgivingReader. The no-hang + TRUNCATED contracts are load-bearing.
|
|
3
3
|
|
|
4
4
|
/** Sentinel: a key appeared in the text but its value was empty/cut-off (present-but-garbled). */
|
|
5
|
-
export const TRUNCATED: unique symbol = Symbol("
|
|
5
|
+
export const TRUNCATED: unique symbol = Symbol("extract.json.TRUNCATED");
|
|
6
6
|
|
|
7
7
|
/** A character is JSON-insignificant whitespace. Mirrors Java Character.isWhitespace closely enough for the corpus. */
|
|
8
8
|
function isWhitespace(c: string): boolean {
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
// FR-011: enum-variant normalization for the Coerce stage.
|
|
2
|
+
// ASCII-only by design: enum members are ASCII identifiers, so a pure [A-Za-z0-9]
|
|
3
|
+
// transform is byte-identical across ports and sidesteps locale case-folding (Turkish-İ).
|
|
4
|
+
// Mode comes from the @normalize attr (none|collapse|strip; default strip).
|
|
5
|
+
|
|
6
|
+
/**
 * Enum normalization mode, as applied by `normalizeEnum`:
 * - "none":     the input is returned unchanged;
 * - "collapse": trimmed, ASCII-upper-cased, runs of whitespace/underscore/hyphen become one "_";
 * - "strip":    trimmed, ASCII-upper-cased, every character outside A-Z / 0-9 removed.
 */
export type NormalizeMode = "none" | "collapse" | "strip";
|
|
7
|
+
|
|
8
|
+
/**
 * Normalizes an enum candidate string according to `mode`.
 *
 * - "none":     returns `s` untouched (no trim, no case change).
 * - "collapse": trims, upper-cases ASCII letters, then replaces each run of whitespace,
 *               underscores or hyphens with a single "_".
 * - any other mode ("strip"): trims, upper-cases ASCII letters, then drops every character
 *               that is not A-Z or 0-9.
 *
 * Case mapping is ASCII-only so results do not depend on locale case-folding rules.
 * NOTE(review): `trim()` and `\s` are Unicode-aware in JavaScript — confirm the other ports
 * treat non-ASCII whitespace the same way in "collapse" mode.
 */
export function normalizeEnum(s: string, mode: NormalizeMode): string {
  switch (mode) {
    case "none":
      return s;
    case "collapse":
      return asciiUpper(s.trim()).replace(/[\s_-]+/g, "_");
    default:
      // "strip"
      return asciiUpper(s.trim()).replace(/[^A-Z0-9]/g, "");
  }
}
|
|
15
|
+
|
|
16
|
+
/**
 * Upper-cases only the ASCII letters a–z (code units 97–122) by subtracting 32 from the code
 * unit; every other character is copied through unchanged.
 */
function asciiUpper(s: string): string {
  return s.replace(/[a-z]/g, (letter) => String.fromCharCode(letter.charCodeAt(0) - 32));
}
|
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
import type { NormalizeMode } from "./normalize.js";
|
|
2
|
+
|
|
3
|
+
// FR-010 extract engine — types & model (Tier-2 idiomatic TS port).
|
|
4
|
+
//
|
|
5
|
+
// Cross-port REFERENCE is the Java engine
|
|
6
|
+
// (server/java/render/.../extract/). This file ports the Java records/enums to
|
|
7
|
+
// idiomatic TS: enums become string-union `as const` objects (values match the
|
|
8
|
+
// corpus / Java enum names exactly), records become readonly interfaces +
|
|
9
|
+
// factory functions, and the mutable ExtractionReport is a class.
|
|
10
|
+
|
|
11
|
+
/**
 * Output format the dirty text claims to be.
 * The string values ("JSON" / "XML") are the ones used by the corpus `schema.json`.
 */
export const Format = {
  JSON: "JSON",
  XML: "XML",
} as const;

/** Union of the {@link Format} values: `"JSON" | "XML"`. */
export type Format = (typeof Format)[keyof typeof Format];
|
|
17
|
+
|
|
18
|
+
/**
 * Coercion target kinds understood by the engine.
 * OBJECT marks a field whose value is described by a nested ExtractSchema.
 */
export const FieldKind = {
  STRING: "STRING",
  INT: "INT",
  LONG: "LONG",
  DOUBLE: "DOUBLE",
  BOOLEAN: "BOOLEAN",
  ENUM: "ENUM",
  OBJECT: "OBJECT",
} as const;

/** Union of the {@link FieldKind} values. */
export type FieldKind = (typeof FieldKind)[keyof typeof FieldKind];
|
|
29
|
+
|
|
30
|
+
/**
 * Per-field extraction classification.
 *
 * FROZEN across ports: do not reorder or add members without an ADR. The string values are
 * serialized in the conformance corpus.
 */
export const FieldExtraction = {
  EXTRACTED: "EXTRACTED",
  // Value backed by `@default` / `@coerceDefault`: either an absent field that was filled, or
  // a present-but-uncoercible value that fell back.
  DEFAULTED: "DEFAULTED",
  LOST_OPTIONAL: "LOST_OPTIONAL",
  LOST_REQUIRED: "LOST_REQUIRED",
  MALFORMED: "MALFORMED",
} as const;

/** Union of the {@link FieldExtraction} values. */
export type FieldExtraction = (typeof FieldExtraction)[keyof typeof FieldExtraction];
|
|
43
|
+
|
|
44
|
+
/**
 * How forgiving the engine is.
 *
 * - STRICT: case-sensitive, minimal repair.
 * - NORMAL: case-insensitive keys/tags (the default).
 * - LOOSE:  maximal repair — reserved; currently identical to NORMAL.
 */
export const Tolerance = {
  STRICT: "STRICT",
  NORMAL: "NORMAL",
  LOOSE: "LOOSE",
} as const;

/** Union of the {@link Tolerance} values. */
export type Tolerance = (typeof Tolerance)[keyof typeof Tolerance];
|
|
54
|
+
|
|
55
|
+
/**
 * One recorded normalization/coercion step.
 *
 * `kind` names the step, e.g. "normalize", "alias", "runtime-alias-override", "clamp",
 * "coerceDefault" or "default".
 */
export interface Coercion {
  /** Path of the field the step applied to. */
  readonly fieldPath: string;
  /** Value before the step. */
  readonly from: string;
  /** Value after the step. */
  readonly to: string;
  /** Name of the step. */
  readonly kind: string;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Extract descriptor for a single field.
 *
 * `enumValues` / `enumAlias` are non-null only for ENUM fields, `min` / `max` only for
 * numeric range constraints, and `nested` only for OBJECT fields.
 */
export interface FieldSpec {
  readonly name: string;
  readonly kind: FieldKind;
  readonly required: boolean;
  readonly array: boolean;
  readonly enumValues: readonly string[] | null;
  readonly enumAlias: Readonly<Record<string, string>> | null;
  readonly min: number | null;
  readonly max: number | null;
  readonly nested: ExtractSchema | null;
  /**
   * FR-011: fallback member used when a value is present but cannot be coerced
   * (from `@coerceDefault`). ENUM-only; null means no fallback.
   */
  readonly coerceDefault: string | null;
  /**
   * Absent-fill default (from `@default`). When the field is ABSENT, extract fills this value
   * and classifies the field DEFAULTED, which satisfies `@required`. Applies to all field
   * kinds (Phase B): an enum takes the member string verbatim; any other kind is coerced to
   * `kind` via the pure scalar coerce (so `@default "0"` on `field.int` yields integer 0).
   * null means no default.
   */
  readonly defaultValue: string | null;
  /** FR-011: resolved enum normalization mode (from `@normalize`; default `"strip"`). */
  readonly normalize: NormalizeMode;
}
|
|
89
|
+
|
|
90
|
+
/**
 * Builds the descriptor for a scalar (non-array, non-enum-specific) field.
 *
 * @param name         Field name.
 * @param kind         Coercion target kind.
 * @param required     Whether the field is `@required`.
 * @param defaultValue Optional absent-fill `@default` (Phase B). When the field is absent,
 *                     extract coerces it to `kind` and classifies the field DEFAULTED, which
 *                     satisfies `@required`. Omitted / null means no default, matching the
 *                     original three-argument call.
 * @returns A FieldSpec with `array: false`, no enum data, no range, no nested schema,
 *          no `coerceDefault`, and `normalize: "strip"`.
 */
export function scalar(
  name: string,
  kind: FieldKind,
  required: boolean,
  defaultValue?: string | null,
): FieldSpec {
  const spec: FieldSpec = {
    name,
    kind,
    required,
    array: false,
    enumValues: null,
    enumAlias: null,
    min: null,
    max: null,
    nested: null,
    coerceDefault: null,
    defaultValue: defaultValue == null ? null : defaultValue,
    normalize: "strip",
  };
  return spec;
}
|
|
117
|
+
|
|
118
|
+
/**
 * Builds the descriptor for a single-valued ENUM field.
 *
 * @param name          Field name.
 * @param required      Whether the field is `@required`.
 * @param values        Allowed members; copied. null is kept as null.
 * @param aliases       Alias map; copied. null becomes an empty map.
 * @param coerceDefault Fallback member for present-but-uncoercible values; omitted → null.
 * @param normalize     Enum normalization mode; defaults to "strip".
 * @param defaultValue  Absent-fill default member; omitted → null.
 * @returns A FieldSpec with `kind: ENUM` and `array: false`.
 */
export function enumField(
  name: string,
  required: boolean,
  values: readonly string[] | null,
  aliases: Readonly<Record<string, string>> | null,
  coerceDefault?: string | null,
  normalize: NormalizeMode = "strip",
  defaultValue?: string | null,
): FieldSpec {
  // Defensive copies so later changes to the caller's inputs do not leak into the spec.
  const members = values == null ? null : Array.from(values);
  const aliasTable = { ...(aliases ?? {}) };

  return {
    name,
    kind: FieldKind.ENUM,
    required,
    array: false,
    enumValues: members,
    enumAlias: aliasTable,
    min: null,
    max: null,
    nested: null,
    coerceDefault: coerceDefault == null ? null : coerceDefault,
    defaultValue: defaultValue == null ? null : defaultValue,
    normalize,
  };
}
|
|
142
|
+
|
|
143
|
+
/**
 * Phase B (array-of-enum): builds the descriptor for an ENUM field that is a list
 * (`array === true`).
 *
 * Each element goes through the same enum coercion pipeline a scalar enum uses
 * (exact → normalize → `@enumAlias` → `@coerceDefault` → MALFORMED) and is classified
 * independently by indexed path (`tags[0]`, `tags[1]`, …).
 *
 * The result is exactly what {@link enumField} produces for the same arguments, with
 * `array` switched to `true`.
 */
export function enumArray(
  name: string,
  required: boolean,
  values: readonly string[] | null,
  aliases: Readonly<Record<string, string>> | null,
  coerceDefault?: string | null,
  normalize: NormalizeMode = "strip",
  defaultValue?: string | null,
): FieldSpec {
  const single = enumField(name, required, values, aliases, coerceDefault, normalize, defaultValue);
  // Overriding an existing key in a spread keeps its position, so key order is unchanged.
  return { ...single, array: true };
}
|
|
173
|
+
|
|
174
|
+
/**
 * Builds the descriptor for a field with a numeric range constraint.
 *
 * @param name     Field name.
 * @param kind     Coercion target kind.
 * @param required Whether the field is `@required`.
 * @param min      Lower bound, or null for none.
 * @param max      Upper bound, or null for none.
 * @returns A FieldSpec with `array: false`, the given bounds, no enum data, no nested schema,
 *          no defaults, and `normalize: "strip"`.
 */
export function range(
  name: string,
  kind: FieldKind,
  required: boolean,
  min: number | null,
  max: number | null,
): FieldSpec {
  const spec: FieldSpec = {
    name,
    kind,
    required,
    array: false,
    enumValues: null,
    enumAlias: null,
    min,
    max,
    nested: null,
    coerceDefault: null,
    defaultValue: null,
    normalize: "strip",
  };
  return spec;
}
|
|
196
|
+
|
|
197
|
+
/**
 * Builds the descriptor for an OBJECT field whose value is described by a nested schema.
 *
 * @param name     Field name.
 * @param required Whether the field is `@required`.
 * @param array    Whether the field is a list of such objects.
 * @param nested   Schema of the nested object, stored as given (may be null).
 * @returns A FieldSpec with `kind: OBJECT`, no enum data, no range, no defaults, and
 *          `normalize: "strip"`.
 */
export function object(
  name: string,
  required: boolean,
  array: boolean,
  nested: ExtractSchema | null,
): FieldSpec {
  const spec: FieldSpec = {
    name,
    kind: FieldKind.OBJECT,
    required,
    array,
    enumValues: null,
    enumAlias: null,
    min: null,
    max: null,
    nested,
    coerceDefault: null,
    defaultValue: null,
    normalize: "strip",
  };
  return spec;
}
|
|
213
|
+
|
|
214
|
+
/** Top-level extract descriptor. */
export interface ExtractSchema {
  /** Format the text is expected to be in. */
  readonly format: Format;
  /** The XML root tag / logical JSON root name. */
  readonly rootName: string;
  /** Descriptors of the fields to extract. */
  readonly fields: readonly FieldSpec[];
}
|
|
220
|
+
|
|
221
|
+
/**
 * Builds an ExtractSchema.
 *
 * @param format   Expected format.
 * @param rootName XML root tag / logical JSON root name.
 * @param fields   Field descriptors; copied into a new array. null yields an empty list.
 */
export function extractSchema(
  format: Format,
  rootName: string,
  fields: readonly FieldSpec[] | null,
): ExtractSchema {
  const ownFields = Array.from(fields ?? []);
  return { format, rootName, fields: ownFields };
}
|
|
224
|
+
|
|
225
|
+
/**
 * The single bespoke-coercion hook (the bounded "20%").
 *
 * Receives the field path, the raw string value and the field's FieldSpec. Returning null
 * falls through to the default coercion.
 */
export type OnField = (fieldPath: string, rawValue: string, spec: FieldSpec) => unknown | null;
|
|
230
|
+
|
|
231
|
+
/**
 * Bounded runtime override surface.
 *
 * `aliases` and `normalizers` are MERGED with the schema's, with the runtime entry winning
 * on a key conflict. `onField` is the single custom-coercion hook.
 */
export interface ExtractOptions {
  readonly tolerance: Tolerance;
  readonly aliases: Readonly<Record<string, string>>;
  readonly normalizers: Readonly<Record<string, (raw: string) => unknown | null>>;
  readonly onField: OnField | null;
}
|
|
241
|
+
|
|
242
|
+
/**
 * Returns a fresh default options bag: NORMAL tolerance, no aliases, no normalizers and no
 * `onField` hook.
 */
export function defaults(): ExtractOptions {
  const options: ExtractOptions = {
    tolerance: Tolerance.NORMAL,
    aliases: {},
    normalizers: {},
    onField: null,
  };
  return options;
}
|
|
245
|
+
|
|
246
|
+
/**
 * Completes a partial (or missing) options bag into a full ExtractOptions.
 *
 * A null/undefined bag yields `defaults()`. Otherwise each missing or null member falls back
 * to its default (NORMAL tolerance, empty maps, no hook); `aliases` and `normalizers` are
 * shallow-copied so the result does not share them with the caller's bag.
 */
export function normalizeOptions(opts?: Partial<ExtractOptions> | null): ExtractOptions {
  if (opts == null) {
    return defaults();
  }
  const { tolerance, aliases, normalizers, onField } = opts;
  return {
    tolerance: tolerance ?? Tolerance.NORMAL,
    aliases: { ...(aliases ?? {}) },
    normalizers: { ...(normalizers ?? {}) },
    onField: onField ?? null,
  };
}
|
|
256
|
+
|
|
257
|
+
/**
 * What the engine returns. `data` is a forgiving record; Phase-2 codegen wraps it into a
 * typed ExtractionResult<T>.
 */
export interface ExtractionOutcome {
  /** Best-effort values keyed by field name. */
  readonly data: Record<string, unknown>;
  /** Per-field states and coercion notes. */
  readonly report: ExtractionReport;
}
|
|
262
|
+
|
|
263
|
+
/** Typed result of a generated `extract(...)`. */
export interface ExtractionResult<T> {
  /** Best-effort value (null where lost/malformed). */
  readonly data: T | null;
  /** Per-field states and coercion notes. */
  readonly report: ExtractionReport;
}
|
|
268
|
+
|
|
269
|
+
/**
 * Thrown by {@link orThrow} when an {@link ExtractionResult} lost a `@required` field.
 * Mirrors Java's `ExtractException`. Carries the list of lost-required field paths.
 */
export class ExtractError extends Error {
  /** Paths of the lost `@required` fields (a copy of the constructor argument). */
  readonly lostRequired: readonly string[];

  /**
   * @param lostRequired Paths of the lost required fields; joined with ", " into the message.
   */
  constructor(lostRequired: readonly string[]) {
    super(`extract: required field(s) lost: ${lostRequired.join(", ")}`);
    // When compiled down to ES5, `super(...)` on Error returns a plain Error and drops the
    // subclass prototype, so `instanceof ExtractError` fails. Restoring it is a no-op on
    // ES2015+ targets.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = "ExtractError";
    this.lostRequired = [...lostRequired];
  }
}
|
|
281
|
+
|
|
282
|
+
/**
 * Opt-in strictness over a never-throwing {@link ExtractionResult}. Mirrors Java
 * `ExtractionResult.orThrow()`.
 *
 * TS divergence from Java (documented): `ExtractionResult` is a plain interface (the
 * generated output-parsers build it as an object literal), so `orThrow` is a free function
 * rather than a method on the result. Semantics are identical.
 *
 * @param result The result to check.
 * @returns `result.data` when no `@required` field was lost.
 * @throws ExtractError carrying the lost paths when the report has a lost `@required` field.
 */
export function orThrow<T>(result: ExtractionResult<T>): T | null {
  const { report } = result;
  if (!report.hasLostRequired()) {
    return result.data;
  }
  throw new ExtractError(report.lostRequired());
}
|
|
297
|
+
|
|
298
|
+
/**
 * Mutable accumulator for one extraction: the per-field classification, the
 * degenerate-response ("empty") flag, and the list of coercion notes.
 */
export class ExtractionReport {
  // A Map keeps insertion order (mirroring Java's LinkedHashMap), so paths are reported in
  // the order they were first classified.
  private readonly _states = new Map<string, FieldExtraction>();
  private readonly _coercions: Coercion[] = [];
  private _empty = false;

  /** Records (or overwrites) the state of `fieldPath`. */
  set(fieldPath: string, state: FieldExtraction): void {
    this._states.set(fieldPath, state);
  }

  /** Appends a coercion note to the log. */
  addCoercion(c: Coercion): void {
    this._coercions.push(c);
  }

  /** Flags the response as empty. */
  markEmpty(): void {
    this._empty = true;
  }

  /** Whether {@link markEmpty} has been called. */
  isEmpty(): boolean {
    return this._empty;
  }

  /** Snapshot copy of the path → state map, in insertion order. */
  states(): ReadonlyMap<string, FieldExtraction> {
    return new Map(this._states);
  }

  /** Snapshot copy of the coercion log, in logging order. */
  coercions(): readonly Coercion[] {
    return this._coercions.slice();
  }

  /** Paths currently classified LOST_REQUIRED. */
  lostRequired(): string[] {
    return this.byState(FieldExtraction.LOST_REQUIRED);
  }

  /** Paths currently classified MALFORMED. */
  malformed(): string[] {
    return this.byState(FieldExtraction.MALFORMED);
  }

  /** True when at least one path is classified LOST_REQUIRED. */
  hasLostRequired(): boolean {
    return this.lostRequired().length !== 0;
  }

  /** Paths whose recorded state equals `s`, in insertion order. */
  private byState(s: FieldExtraction): string[] {
    const matching: string[] = [];
    this._states.forEach((state, fieldPath) => {
      if (state === s) matching.push(fieldPath);
    });
    return matching;
  }
}
|
|
@@ -44,7 +44,7 @@ function parseChildren(inner: string, ci: boolean, out: Record<string, unknown>)
|
|
|
44
44
|
contentEnd = close.index;
|
|
45
45
|
next = close.index + close[0].length;
|
|
46
46
|
} else {
|
|
47
|
-
// unclosed tag:
|
|
47
|
+
// unclosed tag: extract text up to the next sibling open tag
|
|
48
48
|
const sib = matchFrom(OPEN_TAG_SRC, flags, inner, contentStart);
|
|
49
49
|
if (sib != null) {
|
|
50
50
|
contentEnd = sib.index;
|
package/src/index.ts
CHANGED
|
@@ -12,28 +12,31 @@ export {
|
|
|
12
12
|
type VerifyOptions,
|
|
13
13
|
} from "./verify.js";
|
|
14
14
|
|
|
15
|
-
// FR-010 tolerant
|
|
16
|
-
export {
|
|
15
|
+
// FR-010 tolerant extract engine (Tier-2 forgiving parser).
|
|
16
|
+
export { extract } from "./extract/extract.js";
|
|
17
17
|
export {
|
|
18
18
|
Format,
|
|
19
19
|
FieldKind,
|
|
20
|
-
|
|
20
|
+
FieldExtraction,
|
|
21
21
|
Tolerance,
|
|
22
|
-
|
|
22
|
+
ExtractionReport,
|
|
23
23
|
scalar,
|
|
24
24
|
enumField,
|
|
25
|
+
enumArray,
|
|
25
26
|
range,
|
|
26
27
|
object,
|
|
27
|
-
|
|
28
|
+
extractSchema,
|
|
28
29
|
defaults,
|
|
30
|
+
orThrow,
|
|
31
|
+
ExtractError,
|
|
29
32
|
type FieldSpec,
|
|
30
|
-
type
|
|
31
|
-
type
|
|
32
|
-
type
|
|
33
|
-
type
|
|
33
|
+
type ExtractSchema,
|
|
34
|
+
type ExtractOptions,
|
|
35
|
+
type ExtractionOutcome,
|
|
36
|
+
type ExtractionResult,
|
|
34
37
|
type Coercion,
|
|
35
38
|
type OnField,
|
|
36
|
-
} from "./
|
|
39
|
+
} from "./extract/types.js";
|
|
37
40
|
export {
|
|
38
41
|
asString,
|
|
39
42
|
asInt,
|
|
@@ -41,7 +44,7 @@ export {
|
|
|
41
44
|
asDouble,
|
|
42
45
|
asBool,
|
|
43
46
|
asStringList,
|
|
44
|
-
} from "./
|
|
47
|
+
} from "./extract/extract-map.js";
|
|
45
48
|
|
|
46
49
|
// FR-010 artifact 1 — output-format prompt renderer ("produce your answer like this").
|
|
47
50
|
export { renderOutputFormat } from "./prompt/output-format-renderer.js";
|
|
@@ -53,3 +56,6 @@ export {
|
|
|
53
56
|
} from "./prompt/prompt-overrides.js";
|
|
54
57
|
export type { OutputFormatSpec } from "./prompt/output-format-spec.js";
|
|
55
58
|
export type { PromptField } from "./prompt/prompt-field.js";
|
|
59
|
+
|
|
60
|
+
// template.output render-helper result shape (shared per port).
|
|
61
|
+
export type { EmailDocument } from "./email-document.js";
|