@sidub-inc/docuoria.cli 1.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1056 -0
- package/package.json +56 -0
- package/payload/.claude-plugin/plugin.json +21 -0
- package/payload/MANIFEST.json +322 -0
- package/payload/SKILL.md +88 -0
- package/payload/assets/lib/Docuoria.dll +0 -0
- package/payload/assets/schemas/template-schema.json +413 -0
- package/payload/commands/classify.md +11 -0
- package/payload/commands/diagnose.md +11 -0
- package/payload/commands/extract.md +11 -0
- package/payload/commands/inspect.md +11 -0
- package/payload/commands/validate-template.md +11 -0
- package/payload/examples/01-extract-to-csv.md +49 -0
- package/payload/examples/02-classify-unknown-pdf.md +102 -0
- package/payload/examples/03-diagnose-failed-result.md +68 -0
- package/payload/references/classification.md +363 -0
- package/payload/references/decision-tree.md +43 -0
- package/payload/references/failure-tree.md +169 -0
- package/payload/references/pattern-authoring.md +40 -0
- package/payload/references/patterns.md +97 -0
- package/payload/references/privacy.md +36 -0
- package/payload/references/scripts.md +361 -0
- package/payload/references/template-reference.md +606 -0
- package/payload/references/workflow.md +163 -0
- package/payload/scripts/_common.csx +250 -0
- package/payload/scripts/classify.csx +53 -0
- package/payload/scripts/dry-run.csx +85 -0
- package/payload/scripts/evaluate-match.csx +72 -0
- package/payload/scripts/execute.csx +89 -0
- package/payload/scripts/inspect.csx +43 -0
- package/payload/scripts/list-templates.csx +34 -0
- package/payload/scripts/load-template.csx +54 -0
- package/payload/scripts/save-template.csx +53 -0
- package/payload/scripts/schema-info.csx +84 -0
- package/payload/scripts/test-groups.csx +44 -0
- package/payload/scripts/test-pattern.csx +61 -0
- package/payload/scripts/validate-template.csx +54 -0
- package/payload/skill/SKILL.md +88 -0
- package/payload/skill/assets/lib/Docuoria.dll +0 -0
- package/payload/skill/assets/schemas/template-schema.json +413 -0
- package/payload/skill/examples/01-extract-to-csv.md +49 -0
- package/payload/skill/examples/02-classify-unknown-pdf.md +102 -0
- package/payload/skill/examples/03-diagnose-failed-result.md +68 -0
- package/payload/skill/references/classification.md +363 -0
- package/payload/skill/references/decision-tree.md +43 -0
- package/payload/skill/references/failure-tree.md +169 -0
- package/payload/skill/references/pattern-authoring.md +40 -0
- package/payload/skill/references/patterns.md +97 -0
- package/payload/skill/references/privacy.md +36 -0
- package/payload/skill/references/scripts.md +361 -0
- package/payload/skill/references/template-reference.md +606 -0
- package/payload/skill/references/workflow.md +163 -0
- package/payload/skill/scripts/_common.csx +250 -0
- package/payload/skill/scripts/classify.csx +53 -0
- package/payload/skill/scripts/dry-run.csx +85 -0
- package/payload/skill/scripts/evaluate-match.csx +72 -0
- package/payload/skill/scripts/execute.csx +89 -0
- package/payload/skill/scripts/inspect.csx +43 -0
- package/payload/skill/scripts/list-templates.csx +34 -0
- package/payload/skill/scripts/load-template.csx +54 -0
- package/payload/skill/scripts/save-template.csx +53 -0
- package/payload/skill/scripts/schema-info.csx +84 -0
- package/payload/skill/scripts/test-groups.csx +44 -0
- package/payload/skill/scripts/test-pattern.csx +61 -0
- package/payload/skill/scripts/validate-template.csx +54 -0
|
@@ -0,0 +1,606 @@
|
|
|
1
|
+
# Template JSON Reference
|
|
2
|
+
|
|
3
|
+
Complete reference for authoring `Docuoria` template JSON files. Every property name, `$kind` discriminator, and enum value in this document matches the SDK's actual serialization output — copy them verbatim.
|
|
4
|
+
|
|
5
|
+
> **Important:** `fieldType` serializes as an **integer** (0–5), not a string. **Match rule** enums (`TextPatternMatchRule.mode`, `FileNameMatchRule.mode`, `CompositeMatchRule.operator`) also serialize as **integers**. **Extraction source** enums (`TextPatternExtractionSource.mode`, `TableRowsExtractionSource.mode`, `MetadataFieldExtractionSource.standardField`) serialize as **strings** (handled by a custom converter). All field mappings require a `$kind` discriminator (`"FieldMapping"` or `"RepeatingFieldMapping"`). See the [Enum Reference](#enum-reference) section for the canonical values.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Minimal template (single scalar field)
|
|
10
|
+
|
|
11
|
+
```json
|
|
12
|
+
{
|
|
13
|
+
"identifier": "simple-invoice",
|
|
14
|
+
"rootMatchRule": {
|
|
15
|
+
"$kind": "TextPatternMatchRule",
|
|
16
|
+
"tokens": ["Invoice"],
|
|
17
|
+
"mode": 0,
|
|
18
|
+
"threshold": 0.5
|
|
19
|
+
},
|
|
20
|
+
"dataModel": {
|
|
21
|
+
"schema": {
|
|
22
|
+
"name": "Invoice",
|
|
23
|
+
"fields": [
|
|
24
|
+
{
|
|
25
|
+
"$kind": "PrimitiveFieldDefinition",
|
|
26
|
+
"name": "invoiceNumber",
|
|
27
|
+
"fieldType": 0
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
}
|
|
31
|
+
},
|
|
32
|
+
"extractionStep": {
|
|
33
|
+
"mappings": [
|
|
34
|
+
{
|
|
35
|
+
"$kind": "FieldMapping",
|
|
36
|
+
"fieldName": "invoiceNumber",
|
|
37
|
+
"fieldType": 0,
|
|
38
|
+
"source": {
|
|
39
|
+
"$kind": "TextPatternExtractionSource",
|
|
40
|
+
"mode": "Pattern",
|
|
41
|
+
"regexPattern": "Invoice\\s*#?:?\\s*(?<value>\\S+)"
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
]
|
|
45
|
+
},
|
|
46
|
+
"intermediateSteps": [],
|
|
47
|
+
"publishStep": {}
|
|
48
|
+
}
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Complex template (scalar fields + repeating collection)
|
|
52
|
+
|
|
53
|
+
```json
|
|
54
|
+
{
|
|
55
|
+
"identifier": "invoice-with-line-items",
|
|
56
|
+
"rootMatchRule": {
|
|
57
|
+
"$kind": "CompositeMatchRule",
|
|
58
|
+
"operator": 0,
|
|
59
|
+
"children": [
|
|
60
|
+
{
|
|
61
|
+
"rule": {
|
|
62
|
+
"$kind": "TextPatternMatchRule",
|
|
63
|
+
"tokens": ["Invoice", "Total"],
|
|
64
|
+
"mode": 1,
|
|
65
|
+
"threshold": 1.0
|
|
66
|
+
},
|
|
67
|
+
"weight": 1.0
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
"rule": {
|
|
71
|
+
"$kind": "FileNameMatchRule",
|
|
72
|
+
"pattern": "*.pdf",
|
|
73
|
+
"mode": 0,
|
|
74
|
+
"threshold": 0.5
|
|
75
|
+
},
|
|
76
|
+
"weight": 0.5
|
|
77
|
+
}
|
|
78
|
+
],
|
|
79
|
+
"threshold": 0.8
|
|
80
|
+
},
|
|
81
|
+
"dataModel": {
|
|
82
|
+
"schema": {
|
|
83
|
+
"name": "InvoiceData",
|
|
84
|
+
"fields": [
|
|
85
|
+
{
|
|
86
|
+
"$kind": "PrimitiveFieldDefinition",
|
|
87
|
+
"name": "invoiceNumber",
|
|
88
|
+
"fieldType": 0
|
|
89
|
+
},
|
|
90
|
+
{
|
|
91
|
+
"$kind": "PrimitiveFieldDefinition",
|
|
92
|
+
"name": "invoiceDate",
|
|
93
|
+
"fieldType": 4
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
"$kind": "PrimitiveFieldDefinition",
|
|
97
|
+
"name": "totalAmount",
|
|
98
|
+
"fieldType": 1
|
|
99
|
+
},
|
|
100
|
+
{
|
|
101
|
+
"$kind": "RecordFieldDefinition",
|
|
102
|
+
"name": "lineItems",
|
|
103
|
+
"isCollection": true,
|
|
104
|
+
"record": {
|
|
105
|
+
"name": "LineItem",
|
|
106
|
+
"fields": [
|
|
107
|
+
{
|
|
108
|
+
"$kind": "PrimitiveFieldDefinition",
|
|
109
|
+
"name": "productCode",
|
|
110
|
+
"fieldType": 0
|
|
111
|
+
},
|
|
112
|
+
{
|
|
113
|
+
"$kind": "PrimitiveFieldDefinition",
|
|
114
|
+
"name": "description",
|
|
115
|
+
"fieldType": 0
|
|
116
|
+
},
|
|
117
|
+
{
|
|
118
|
+
"$kind": "PrimitiveFieldDefinition",
|
|
119
|
+
"name": "qty",
|
|
120
|
+
"fieldType": 1
|
|
121
|
+
},
|
|
122
|
+
{
|
|
123
|
+
"$kind": "PrimitiveFieldDefinition",
|
|
124
|
+
"name": "unitPrice",
|
|
125
|
+
"fieldType": 1
|
|
126
|
+
},
|
|
127
|
+
{
|
|
128
|
+
"$kind": "PrimitiveFieldDefinition",
|
|
129
|
+
"name": "amount",
|
|
130
|
+
"fieldType": 1
|
|
131
|
+
}
|
|
132
|
+
]
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
]
|
|
136
|
+
}
|
|
137
|
+
},
|
|
138
|
+
"extractionStep": {
|
|
139
|
+
"mappings": [
|
|
140
|
+
{
|
|
141
|
+
"$kind": "FieldMapping",
|
|
142
|
+
"fieldName": "invoiceNumber",
|
|
143
|
+
"fieldType": 0,
|
|
144
|
+
"source": {
|
|
145
|
+
"$kind": "TextPatternExtractionSource",
|
|
146
|
+
"mode": "Pattern",
|
|
147
|
+
"regexPattern": "Invoice\\s*#:?\\s*(?<value>\\d+)"
|
|
148
|
+
}
|
|
149
|
+
},
|
|
150
|
+
{
|
|
151
|
+
"$kind": "FieldMapping",
|
|
152
|
+
"fieldName": "invoiceDate",
|
|
153
|
+
"fieldType": 4,
|
|
154
|
+
"parseFormat": "MM/dd/yyyy",
|
|
155
|
+
"source": {
|
|
156
|
+
"$kind": "TextPatternExtractionSource",
|
|
157
|
+
"mode": "Pattern",
|
|
158
|
+
"regexPattern": "Date:?\\s*(?<value>\\d{2}/\\d{2}/\\d{4})"
|
|
159
|
+
}
|
|
160
|
+
},
|
|
161
|
+
{
|
|
162
|
+
"$kind": "FieldMapping",
|
|
163
|
+
"fieldName": "totalAmount",
|
|
164
|
+
"fieldType": 1,
|
|
165
|
+
"source": {
|
|
166
|
+
"$kind": "TextPatternExtractionSource",
|
|
167
|
+
"mode": "Pattern",
|
|
168
|
+
"regexPattern": "Total:?\\s*\\$?(?<value>[\\d,]+\\.\\d{2})"
|
|
169
|
+
}
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
"$kind": "RepeatingFieldMapping",
|
|
173
|
+
"collectionFieldName": "lineItems",
|
|
174
|
+
"elementDefinition": {
|
|
175
|
+
"name": "LineItem",
|
|
176
|
+
"fields": [
|
|
177
|
+
{ "$kind": "PrimitiveFieldDefinition", "name": "productCode", "fieldType": 0 },
|
|
178
|
+
{ "$kind": "PrimitiveFieldDefinition", "name": "description", "fieldType": 0 },
|
|
179
|
+
{ "$kind": "PrimitiveFieldDefinition", "name": "qty", "fieldType": 1 },
|
|
180
|
+
{ "$kind": "PrimitiveFieldDefinition", "name": "unitPrice", "fieldType": 1 },
|
|
181
|
+
{ "$kind": "PrimitiveFieldDefinition", "name": "amount", "fieldType": 1 }
|
|
182
|
+
]
|
|
183
|
+
},
|
|
184
|
+
"source": {
|
|
185
|
+
"$kind": "TextPatternExtractionSource",
|
|
186
|
+
"mode": "AllMatches",
|
|
187
|
+
"regexPattern": "(?<productCode>\\S+)\\s+(?<description>[^\\n]+?)\\s+(?<qty>[\\d.]+)\\s+(?<unitPrice>[\\d,.]+)\\s+(?<amount>[\\d,.]+)"
|
|
188
|
+
},
|
|
189
|
+
"subFields": [
|
|
190
|
+
{ "$kind": "NamedGroupSubFieldMapping", "fieldName": "productCode", "fieldType": 0, "groupName": "productCode" },
|
|
191
|
+
{ "$kind": "NamedGroupSubFieldMapping", "fieldName": "description", "fieldType": 0, "groupName": "description" },
|
|
192
|
+
{ "$kind": "NamedGroupSubFieldMapping", "fieldName": "qty", "fieldType": 1, "groupName": "qty" },
|
|
193
|
+
{ "$kind": "NamedGroupSubFieldMapping", "fieldName": "unitPrice", "fieldType": 1, "groupName": "unitPrice" },
|
|
194
|
+
{ "$kind": "NamedGroupSubFieldMapping", "fieldName": "amount", "fieldType": 1, "groupName": "amount" }
|
|
195
|
+
]
|
|
196
|
+
}
|
|
197
|
+
]
|
|
198
|
+
},
|
|
199
|
+
"intermediateSteps": [],
|
|
200
|
+
"publishStep": {}
|
|
201
|
+
}
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## Top-level template structure
|
|
207
|
+
|
|
208
|
+
| Property | Type | Required | Description |
|
|
209
|
+
|---|---|---|---|
|
|
210
|
+
| `identifier` | string | yes | Unique template ID (min 1 char) |
|
|
211
|
+
| `rootMatchRule` | object | yes | Gates template execution; uses `$kind` discriminator |
|
|
212
|
+
| `dataModel` | object | yes | Output schema definition with `schema` property |
|
|
213
|
+
| `extractionStep` | object | yes | Contains `mappings` array of field extraction declarations |
|
|
214
|
+
| `intermediateSteps` | array | no | Ordered transformation steps (may be empty `[]`) |
|
|
215
|
+
| `publishStep` | object | yes | Publish step configuration (can be empty `{}`) |
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Extraction source types (`$kind` discriminator)
|
|
220
|
+
|
|
221
|
+
| `$kind` value | Variant | Description |
|
|
222
|
+
|---|---|---|
|
|
223
|
+
| `TextPatternExtractionSource` | `mode: "Token"` | Literal token match (returns first occurrence) |
|
|
224
|
+
| `TextPatternExtractionSource` | `mode: "Pattern"` | Regex match with capture group (returns first match) |
|
|
225
|
+
| `TextPatternExtractionSource` | `mode: "AllMatches"` | Regex match (returns all matches → collection) |
|
|
226
|
+
| `TextAnchorExtractionSource` | — | Spatial region + token/regex within that region |
|
|
227
|
+
| `TableCellExtractionSource` | — | Single table cell by row/column |
|
|
228
|
+
| `TableRowsExtractionSource` | — | All data rows from a PDF table → collection |
|
|
229
|
+
| `MetadataFieldExtractionSource` | — | PDF metadata field (Title, Author, etc.) |
|
|
230
|
+
| `FallbackExtractionSource` | — | Tries primary source, then fallback |
|
|
231
|
+
|
|
232
|
+
### `TextPatternExtractionSource` properties
|
|
233
|
+
|
|
234
|
+
| Property | Type | Required | Default | Description |
|
|
235
|
+
|---|---|---|---|---|
|
|
236
|
+
| `$kind` | `"TextPatternExtractionSource"` | yes | — | Discriminator |
|
|
237
|
+
| `mode` | string | yes | — | `"Token"`, `"Pattern"`, or `"AllMatches"` |
|
|
238
|
+
| `literalToken` | string | when mode = `"Token"` | — | Exact text to find |
|
|
239
|
+
| `regexPattern` | string | when mode = `"Pattern"` or `"AllMatches"` | — | .NET regex pattern (must include a capture group; in `"Pattern"` mode the group must be named `value`) |
|
|
240
|
+
| `pageNumber` | integer (1-based) | no | `null` (all pages) | Restrict extraction to a single page |
|
|
241
|
+
| `caseSensitive` | boolean | no | `false` | Case-sensitive matching |
|
|
242
|
+
| `blockSeparator` | string | no | `"\n"` | Separator between text blocks during flattening |
|
|
243
|
+
| `startAnchorPattern` | string | no | `null` | Regex bounding start of search region (AllMatches only) |
|
|
244
|
+
| `endAnchorPattern` | string | no | `null` | Regex bounding end of search region (AllMatches only) |
|
|
245
|
+
|
|
246
|
+
> **Anti-pattern:** Do NOT use `regexPattern` with `"Token"` mode or `literalToken` with `"Pattern"` / `"AllMatches"` mode. Each mode requires its specific property — mixing them will fail validation.
|
|
247
|
+
|
|
248
|
+
### `TextAnchorExtractionSource` properties
|
|
249
|
+
|
|
250
|
+
| Property | Type | Required | Default | Description |
|
|
251
|
+
|---|---|---|---|---|
|
|
252
|
+
| `$kind` | `"TextAnchorExtractionSource"` | yes | — | Discriminator |
|
|
253
|
+
| `region` | PdfBounds | yes | — | Spatial bounding box (see PdfBounds below) |
|
|
254
|
+
| `literalToken` | string | one of `literalToken` / `regexPattern` | — | Exact text to find within region |
|
|
255
|
+
| `regexPattern` | string | one of `literalToken` / `regexPattern` | — | Regex to match within region |
|
|
256
|
+
| `pageNumber` | integer (1-based) | no | `null` | Restrict to a single page |
|
|
257
|
+
| `caseSensitive` | boolean | no | `false` | Case-sensitive matching |
|
|
258
|
+
|
|
259
|
+
**PdfBounds** (nested object):
|
|
260
|
+
|
|
261
|
+
| Property | Type | Description |
|
|
262
|
+
|---|---|---|
|
|
263
|
+
| `left` | number | Left edge in PDF points |
|
|
264
|
+
| `top` | number | Top edge in PDF points |
|
|
265
|
+
| `width` | number | Width in PDF points |
|
|
266
|
+
| `height` | number | Height in PDF points |
|
|
267
|
+
|
|
268
|
+
### `TableCellExtractionSource` properties
|
|
269
|
+
|
|
270
|
+
| Property | Type | Required | Default | Description |
|
|
271
|
+
|---|---|---|---|---|
|
|
272
|
+
| `$kind` | `"TableCellExtractionSource"` | yes | — | Discriminator |
|
|
273
|
+
| `rowIndex` | integer (0-based) | yes | — | Data row index |
|
|
274
|
+
| `columnIndex` | integer (0-based) | when no `headerToken` | — | Column by ordinal position |
|
|
275
|
+
| `headerToken` | string | when no `columnIndex` | — | Column by header text match |
|
|
276
|
+
| `pageNumber` | integer (1-based) | no | `null` | Restrict to a single page |
|
|
277
|
+
| `tableIndex` | integer (0-based) | no | `0` | Which table on the page |
|
|
278
|
+
| `caseSensitiveHeader` | boolean | no | `false` | Case-sensitive header matching |
|
|
279
|
+
|
|
280
|
+
> **Mutual exclusivity:** Provide `columnIndex` OR `headerToken`, not both.
|
|
281
|
+
|
|
282
|
+
### `TableRowsExtractionSource` properties
|
|
283
|
+
|
|
284
|
+
| Property | Type | Required | Default | Description |
|
|
285
|
+
|---|---|---|---|---|
|
|
286
|
+
| `$kind` | `"TableRowsExtractionSource"` | yes | — | Discriminator |
|
|
287
|
+
| `mode` | string | yes | — | `"ByHeader"` or `"Ordinal"` |
|
|
288
|
+
| `pageNumber` | integer (1-based) | no | `null` | Restrict to a single page |
|
|
289
|
+
| `tableIndex` | integer (0-based) | no | `0` | Which table on the page |
|
|
290
|
+
| `headerRowIndex` | integer (0-based) | no (ByHeader only) | `0` | Row containing column headers |
|
|
291
|
+
| `skipRows` | integer | no (Ordinal only) | `0` | Number of header/non-data rows to skip |
|
|
292
|
+
| `caseSensitiveHeader` | boolean | no | `false` | Case-sensitive header matching |
|
|
293
|
+
|
|
294
|
+
### `MetadataFieldExtractionSource` properties
|
|
295
|
+
|
|
296
|
+
| Property | Type | Required | Default | Description |
|
|
297
|
+
|---|---|---|---|---|
|
|
298
|
+
| `$kind` | `"MetadataFieldExtractionSource"` | yes | — | Discriminator |
|
|
299
|
+
| `standardField` | string | one of these | — | Standard PDF field: `"Title"`, `"Author"`, `"Subject"`, `"Keywords"`, `"Creator"`, `"Producer"`, `"CreationDate"`, `"ModifiedDate"` |
|
|
300
|
+
| `rawKey` | string | one of these | — | Arbitrary PDF metadata key name |
|
|
301
|
+
|
|
302
|
+
> **Mutual exclusivity:** Provide `standardField` OR `rawKey`, not both.
|
|
303
|
+
|
|
304
|
+
### `FallbackExtractionSource` properties
|
|
305
|
+
|
|
306
|
+
| Property | Type | Required | Default | Description |
|
|
307
|
+
|---|---|---|---|---|
|
|
308
|
+
| `$kind` | `"FallbackExtractionSource"` | yes | — | Discriminator |
|
|
309
|
+
| `primary` | ExtractionSource | yes | — | Primary extraction source (nested object with `$kind`) |
|
|
310
|
+
| `fallback` | ExtractionSource | yes | — | Fallback if primary yields no result |
|
|
311
|
+
|
|
312
|
+
---
|
|
313
|
+
|
|
314
|
+
## Field mapping types
|
|
315
|
+
|
|
316
|
+
### Scalar field mapping (`FieldMapping`)
|
|
317
|
+
|
|
318
|
+
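Used for single-value extractions. Every mapping of this type must also carry the discriminator `"$kind": "FieldMapping"`, in addition to the properties listed below.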
|
|
319
|
+
|
|
320
|
+
| Property | Type | Required | Description |
|
|
321
|
+
|---|---|---|---|
|
|
322
|
+
| `fieldName` | string | yes | Schema field name (must match `dataModel` field) |
|
|
323
|
+
| `fieldType` | integer | yes | Target type (see [FieldType enum](#fieldtype)) |
|
|
324
|
+
| `source` | ExtractionSource | yes | Extraction source object with `$kind` |
|
|
325
|
+
| `parseFormat` | string | no | .NET format string for Date/Timestamp coercion |
|
|
326
|
+
| `cultureName` | string | no | Culture name for locale-aware coercion (e.g. `"en-US"`) |
|
|
327
|
+
|
|
328
|
+
### Repeating field mapping (`RepeatingFieldMapping`)
|
|
329
|
+
|
|
330
|
+
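Used for collection extractions (line items, rows). Every mapping of this type must also carry the discriminator `"$kind": "RepeatingFieldMapping"`, in addition to the properties listed below.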
|
|
331
|
+
|
|
332
|
+
| Property | Type | Required | Description |
|
|
333
|
+
|---|---|---|---|
|
|
334
|
+
| `collectionFieldName` | string | yes | Schema collection field name |
|
|
335
|
+
| `elementDefinition` | RecordDefinition | yes | Inline schema for each element (see below) |
|
|
336
|
+
| `source` | ExtractionSource | yes | Collection-capable source: `TextPatternExtractionSource` with `mode: "AllMatches"`, or `TableRowsExtractionSource` |
|
|
337
|
+
| `subFields` | SubFieldMapping[] | yes | How to project source output into element fields |
|
|
338
|
+
|
|
339
|
+
### `elementDefinition` structure
|
|
340
|
+
|
|
341
|
+
```json
|
|
342
|
+
{
|
|
343
|
+
"name": "ElementName",
|
|
344
|
+
"fields": [
|
|
345
|
+
{ "$kind": "PrimitiveFieldDefinition", "name": "fieldA", "fieldType": 0 },
|
|
346
|
+
{ "$kind": "PrimitiveFieldDefinition", "name": "fieldB", "fieldType": 1 }
|
|
347
|
+
]
|
|
348
|
+
}
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
---
|
|
352
|
+
|
|
353
|
+
## Sub-field mapping types (`$kind` discriminator)
|
|
354
|
+
|
|
355
|
+
| `$kind` value | Key property | Use with | Description |
|
|
356
|
+
|---|---|---|---|
|
|
357
|
+
| `NamedGroupSubFieldMapping` | `groupName` (string) | `TextPatternExtractionSource` (AllMatches) | Maps a named regex capture group `(?<name>...)` |
|
|
358
|
+
| `RegexGroupSubFieldMapping` | `groupIndex` (integer, 1-based) | `TextPatternExtractionSource` (AllMatches) | Maps a numbered regex capture group |
|
|
359
|
+
| `HeaderSubFieldMapping` | `headerToken` (string) | `TableRowsExtractionSource` (ByHeader) | Maps a table column by header text match |
|
|
360
|
+
| `OrdinalSubFieldMapping` | `columnIndex` (integer, 0-based) | `TableRowsExtractionSource` (Ordinal) | Maps a table column by index |
|
|
361
|
+
|
|
362
|
+
All sub-field mappings share:
|
|
363
|
+
|
|
364
|
+
| Property | Type | Required | Description |
|
|
365
|
+
|---|---|---|---|
|
|
366
|
+
| `fieldName` | string | yes | Element-level field name |
|
|
367
|
+
| `fieldType` | integer | yes | Target primitive type (see [FieldType enum](#fieldtype)) |
|
|
368
|
+
|
|
369
|
+
---
|
|
370
|
+
|
|
371
|
+
## Match rule types (`$kind` discriminator)
|
|
372
|
+
|
|
373
|
+
All match rules share: `threshold` (number, 0–1, default 0.5).
|
|
374
|
+
|
|
375
|
+
### `TextPatternMatchRule` properties
|
|
376
|
+
|
|
377
|
+
| Property | Type | Required | Default | Description |
|
|
378
|
+
|---|---|---|---|---|
|
|
379
|
+
| `$kind` | `"TextPatternMatchRule"` | yes | — | Discriminator |
|
|
380
|
+
| `tokens` | string[] | one of these | — | Text tokens to search for |
|
|
381
|
+
| `regexPattern` | string | one of these | — | Regex pattern (alternative to tokens) |
|
|
382
|
+
| `mode` | integer | no | `0` (AnyToken) | `0` (AnyToken) or `1` (AllTokens); ignored when using `regexPattern` |
|
|
383
|
+
| `caseSensitive` | boolean | no | `false` | Case-sensitive matching |
|
|
384
|
+
| `pageNumber` | integer (1-based) | no | `null` | Restrict to a single page |
|
|
385
|
+
| `threshold` | number | no | `0.5` | Confidence threshold |
|
|
386
|
+
|
|
387
|
+
> **Mutual exclusivity:** Provide `tokens` OR `regexPattern`, not both.
|
|
388
|
+
|
|
389
|
+
### `FileNameMatchRule` properties
|
|
390
|
+
|
|
391
|
+
| Property | Type | Required | Default | Description |
|
|
392
|
+
|---|---|---|---|---|
|
|
393
|
+
| `$kind` | `"FileNameMatchRule"` | yes | — | Discriminator |
|
|
394
|
+
| `pattern` | string | yes | — | Glob or regex pattern |
|
|
395
|
+
| `mode` | integer | no | `0` (Glob) | `0` (Glob) or `1` (Regex) |
|
|
396
|
+
| `caseSensitive` | boolean | no | `false` | Case-sensitive matching |
|
|
397
|
+
| `threshold` | number | no | `0.5` | Confidence threshold |
|
|
398
|
+
|
|
399
|
+
### `TextAnchorMatchRule` properties
|
|
400
|
+
|
|
401
|
+
| Property | Type | Required | Default | Description |
|
|
402
|
+
|---|---|---|---|---|
|
|
403
|
+
| `$kind` | `"TextAnchorMatchRule"` | yes | — | Discriminator |
|
|
404
|
+
| `expectedContent` | string | yes | — | Expected text substring |
|
|
405
|
+
| `region` | PdfBounds | yes | — | Spatial bounding box |
|
|
406
|
+
| `pageNumber` | integer (1-based) | no | `null` | Restrict to a single page |
|
|
407
|
+
| `caseSensitive` | boolean | no | `false` | Case-sensitive matching |
|
|
408
|
+
| `threshold` | number | no | `0.5` | Confidence threshold |
|
|
409
|
+
|
|
410
|
+
### `MetadataMatchRule` properties
|
|
411
|
+
|
|
412
|
+
| Property | Type | Required | Default | Description |
|
|
413
|
+
|---|---|---|---|---|
|
|
414
|
+
| `$kind` | `"MetadataMatchRule"` | yes | — | Discriminator |
|
|
415
|
+
| `expectedProperties` | object | yes | — | Key-value pairs of expected metadata (e.g. `{ "Author": "Acme Corp" }`) |
|
|
416
|
+
| `caseSensitive` | boolean | no | `false` | Case-sensitive value matching |
|
|
417
|
+
| `threshold` | number | no | `0.5` | Confidence threshold; confidence = matched properties / total expected |
|
|
418
|
+
|
|
419
|
+
### `PageGeometryMatchRule` properties
|
|
420
|
+
|
|
421
|
+
| Property | Type | Required | Default | Description |
|
|
422
|
+
|---|---|---|---|---|
|
|
423
|
+
| `$kind` | `"PageGeometryMatchRule"` | yes | — | Discriminator |
|
|
424
|
+
| `expectedWidth` | number | no | `null` | Expected page width in PDF points |
|
|
425
|
+
| `expectedHeight` | number | no | `null` | Expected page height in PDF points |
|
|
426
|
+
| `expectedPageCount` | integer | no | `null` | Expected total page count |
|
|
427
|
+
| `expectedOrientation` | integer | no | `null` | `0` (Portrait) or `1` (Landscape) |
|
|
428
|
+
| `toleranceInPoints` | number | no | `0.0` | Tolerance for width/height comparison |
|
|
429
|
+
| `threshold` | number | no | `0.5` | Confidence threshold |
|
|
430
|
+
|
|
431
|
+
### `TableMatchRule` properties
|
|
432
|
+
|
|
433
|
+
| Property | Type | Required | Default | Description |
|
|
434
|
+
|---|---|---|---|---|
|
|
435
|
+
| `$kind` | `"TableMatchRule"` | yes | — | Discriminator |
|
|
436
|
+
| `minRows` | integer | no | `null` | Minimum required row count |
|
|
437
|
+
| `minColumns` | integer | no | `null` | Minimum required column count |
|
|
438
|
+
| `requiredHeaderTokens` | string[] | no | `null` | Headers that must be present |
|
|
439
|
+
| `cellContentTokens` | string[] | no | `null` | Text that must appear in any cell |
|
|
440
|
+
| `caseSensitive` | boolean | no | `false` | Case-sensitive matching |
|
|
441
|
+
| `pageNumber` | integer (1-based) | no | `null` | Restrict to a single page |
|
|
442
|
+
| `threshold` | number | no | `0.5` | Confidence threshold |
|
|
443
|
+
|
|
444
|
+
### `CompositeMatchRule` properties
|
|
445
|
+
|
|
446
|
+
| Property | Type | Required | Default | Description |
|
|
447
|
+
|---|---|---|---|---|
|
|
448
|
+
| `$kind` | `"CompositeMatchRule"` | yes | — | Discriminator |
|
|
449
|
+
| `operator` | integer | no | `0` (And) | `0` (And), `1` (Or), or `2` (Not) |
|
|
450
|
+
| `children` | CompositeChildEntry[] | yes | — | Array of child entries (min 1; exactly 1 for `2` / Not) |
|
|
451
|
+
| `threshold` | number | no | `0.5` | Confidence threshold |
|
|
452
|
+
|
|
453
|
+
**Each `children` entry:**
|
|
454
|
+
|
|
455
|
+
| Property | Type | Required | Default | Description |
|
|
456
|
+
|---|---|---|---|---|
|
|
457
|
+
| `rule` | MatchRule | yes | — | Nested match rule object (with `$kind`) |
|
|
458
|
+
| `weight` | number | no | `1.0` | Relative weight in confidence calculation |
|
|
459
|
+
|
|
460
|
+
---
|
|
461
|
+
|
|
462
|
+
## Enum reference
|
|
463
|
+
|
|
464
|
+
### `FieldType`
|
|
465
|
+
|
|
466
|
+
**Serializes as integer.** Use the integer value in JSON, not the string name.
|
|
467
|
+
|
|
468
|
+
| Integer | Name | Description |
|
|
469
|
+
|---|---|---|
|
|
470
|
+
| `0` | String | Text (default) |
|
|
471
|
+
| `1` | Number | Decimal / floating-point |
|
|
472
|
+
| `2` | Integer | Whole number |
|
|
473
|
+
| `3` | Boolean | true / false |
|
|
474
|
+
| `4` | Date | Calendar date (use `parseFormat` for non-ISO formats, e.g. `"MM/dd/yyyy"`) |
|
|
475
|
+
| `5` | Timestamp | Date + time (UTC, use `parseFormat` if needed) |
|
|
476
|
+
|
|
477
|
+
> **Common mistake:** Writing `"fieldType": "String"` instead of `"fieldType": 0`. The string form will fail with a `JsonException` during `Template.FromJson()`.
|
|
478
|
+
|
|
479
|
+
### `TextPatternExtractionSource.mode`
|
|
480
|
+
|
|
481
|
+
Serializes as **string**.
|
|
482
|
+
|
|
483
|
+
| Value | Required property | Description |
|
|
484
|
+
|---|---|---|
|
|
485
|
+
| `"Token"` | `literalToken` | Exact literal match — no regex |
|
|
486
|
+
| `"Pattern"` | `regexPattern` | First regex match with a capture group named `value` |
|
|
487
|
+
| `"AllMatches"` | `regexPattern` | All regex matches — use for collections with named groups |
|
|
488
|
+
|
|
489
|
+
### `TextMatchMode` (for `TextPatternMatchRule.mode`)
|
|
490
|
+
|
|
491
|
+
**Serializes as integer.**
|
|
492
|
+
|
|
493
|
+
| Integer | Name | Description |
|
|
494
|
+
|---|---|---|
|
|
495
|
+
| `0` | AnyToken | Confidence = matched tokens / total tokens |
|
|
496
|
+
| `1` | AllTokens | Confidence = 1.0 only if all tokens match |
|
|
497
|
+
|
|
498
|
+
### `PatternMode` (for `FileNameMatchRule.mode`)
|
|
499
|
+
|
|
500
|
+
**Serializes as integer.**
|
|
501
|
+
|
|
502
|
+
| Integer | Name | Description |
|
|
503
|
+
|---|---|---|
|
|
504
|
+
| `0` | Glob | Glob pattern (`*`, `?`, character classes) |
|
|
505
|
+
| `1` | Regex | .NET regex pattern |
|
|
506
|
+
|
|
507
|
+
### `CompositeOperator` (for `CompositeMatchRule.operator`)
|
|
508
|
+
|
|
509
|
+
**Serializes as integer.**
|
|
510
|
+
|
|
511
|
+
| Integer | Name | Description |
|
|
512
|
+
|---|---|---|
|
|
513
|
+
| `0` | And | Weighted average of children's confidences |
|
|
514
|
+
| `1` | Or | Max-weighted child confidence |
|
|
515
|
+
| `2` | Not | Negation: 1 − child.Confidence (exactly 1 child required) |
|
|
516
|
+
|
|
517
|
+
### `PageOrientation` (for `PageGeometryMatchRule.expectedOrientation`)
|
|
518
|
+
|
|
519
|
+
**Serializes as integer.**
|
|
520
|
+
|
|
521
|
+
| Integer | Name | Description |
|
|
522
|
+
|---|---|---|
|
|
523
|
+
| `0` | Portrait | Height ≥ Width |
|
|
524
|
+
| `1` | Landscape | Width > Height |
|
|
525
|
+
|
|
526
|
+
### `MetadataField` (for `MetadataFieldExtractionSource.standardField`)
|
|
527
|
+
|
|
528
|
+
| Value | Description |
|
|
529
|
+
|---|---|
|
|
530
|
+
| `"Title"` | PDF title |
|
|
531
|
+
| `"Author"` | PDF author |
|
|
532
|
+
| `"Subject"` | PDF subject |
|
|
533
|
+
| `"Keywords"` | PDF keywords |
|
|
534
|
+
| `"Creator"` | Creator application |
|
|
535
|
+
| `"Producer"` | PDF producer |
|
|
536
|
+
| `"CreationDate"` | Creation date |
|
|
537
|
+
| `"ModifiedDate"` | Last modified date |
|
|
538
|
+
|
|
539
|
+
### `ValidationSeverity` (in validation errors)
|
|
540
|
+
|
|
541
|
+
| Value | Description |
|
|
542
|
+
|---|---|
|
|
543
|
+
| `"Info"` | Informational only |
|
|
544
|
+
| `"Warning"` | Non-blocking concern |
|
|
545
|
+
| `"Error"` | Blocking issue |
|
|
546
|
+
|
|
547
|
+
---
|
|
548
|
+
|
|
549
|
+
## Field definition types (`$kind` discriminator)
|
|
550
|
+
|
|
551
|
+
### `PrimitiveFieldDefinition`
|
|
552
|
+
|
|
553
|
+
| Property | Type | Required | Default | Description |
|
|
554
|
+
|---|---|---|---|---|
|
|
555
|
+
| `$kind` | `"PrimitiveFieldDefinition"` | yes | — | Discriminator |
|
|
556
|
+
| `name` | string | yes | — | Field name |
|
|
557
|
+
| `fieldType` | integer | no | `0` (String) | See [FieldType enum](#fieldtype) |
|
|
558
|
+
| `isRequired` | boolean | no | `false` | Whether the field must have a value |
|
|
559
|
+
| `isCollection` | boolean | no | `false` | Whether this is a collection of primitives |
|
|
560
|
+
|
|
561
|
+
> **Note:** `parseFormat` and `cultureName` belong on the **field mapping** (in `extractionStep.mappings`), not on the field definition (in `dataModel.schema.fields`). See [Scalar field mapping](#scalar-field-mapping-fieldmapping).
|
|
562
|
+
|
|
563
|
+
### `RecordFieldDefinition`
|
|
564
|
+
|
|
565
|
+
| Property | Type | Required | Default | Description |
|
|
566
|
+
|---|---|---|---|---|
|
|
567
|
+
| `$kind` | `"RecordFieldDefinition"` | yes | — | Discriminator |
|
|
568
|
+
| `name` | string | yes | — | Field name |
|
|
569
|
+
| `isRequired` | boolean | no | `false` | Whether the record must be present |
|
|
570
|
+
| `isCollection` | boolean | no | `false` | Set `true` for repeating data (e.g. line items) |
|
|
571
|
+
| `record` | RecordDefinition | yes | — | Nested record schema (with `name` and `fields`) |
|
|
572
|
+
|
|
573
|
+
---
|
|
574
|
+
|
|
575
|
+
## FieldType coercion behavior
|
|
576
|
+
|
|
577
|
+
When the engine extracts a string value and the `fieldType` specifies a non-String type, the engine coerces the value:
|
|
578
|
+
|
|
579
|
+
| FieldType | Coercion | `parseFormat` needed? | Example input → output |
|
|
580
|
+
|---|---|---|---|
|
|
581
|
+
| `0` (String) | None (passthrough) | No | `"ABC-123"` → `"ABC-123"` |
|
|
582
|
+
| `1` (Number) | Parse as decimal | No (invariant culture) | `"1,234.56"` → `1234.56` |
|
|
583
|
+
| `2` (Integer) | Parse as long | No | `"42"` → `42` |
|
|
584
|
+
| `3` (Boolean) | Parse true/false | No | `"true"` → `true` |
|
|
585
|
+
| `4` (Date) | Parse as DateOnly | Yes, if non-ISO | `"05/26/2026"` with `"parseFormat": "MM/dd/yyyy"` → `2026-05-26` |
|
|
586
|
+
| `5` (Timestamp) | Parse as DateTime (UTC) | Yes, if non-ISO | `"2026-05-26T14:30:00Z"` → `2026-05-26T14:30:00.0000000Z` |
|
|
587
|
+
|
|
588
|
+
When coercion fails, the engine produces a `FailedResult` with `Step = "FieldCoercion"` and structured diagnostics including `FieldPath`, `SourceText`, and `TargetTypeName`.
|
|
589
|
+
|
|
590
|
+
---
|
|
591
|
+
|
|
592
|
+
## CSV output behavior
|
|
593
|
+
|
|
594
|
+
When executing with `--format csv`:
|
|
595
|
+
|
|
596
|
+
- **Denormalization**: If the template has one collection field, the CSV is fully denormalized — one row per collection element, with scalar values repeated on every row.
|
|
597
|
+
- **Header naming**: Nested fields use dot-notation (e.g. `lineItems.productCode`).
|
|
598
|
+
- **Single collection only**: Multiple collections in one template are rejected (`RejectionReason.GeneratorRejected`). Split into separate templates.
|
|
599
|
+
- **Empty collection**: Produces header row only, zero data rows.
|
|
600
|
+
- **Null values**: Rendered as empty cells.
|
|
601
|
+
|
|
602
|
+
---
|
|
603
|
+
|
|
604
|
+
## JSON Schema
|
|
605
|
+
|
|
606
|
+
The SDK ships a JSON Schema (Draft 2020-12) accessible via `Template.GetJsonSchema()` or the bundled file `assets/schemas/template-schema.json`. Use it for editor autocompletion and pre-validation. Note: the schema is permissive with `additionalProperties: true` — always run `validate-template.csx` for full validation.
|