@sidub-inc/docuoria.cli 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/dist/index.js +1056 -0
  2. package/package.json +56 -0
  3. package/payload/.claude-plugin/plugin.json +21 -0
  4. package/payload/MANIFEST.json +322 -0
  5. package/payload/SKILL.md +88 -0
  6. package/payload/assets/lib/Docuoria.dll +0 -0
  7. package/payload/assets/schemas/template-schema.json +413 -0
  8. package/payload/commands/classify.md +11 -0
  9. package/payload/commands/diagnose.md +11 -0
  10. package/payload/commands/extract.md +11 -0
  11. package/payload/commands/inspect.md +11 -0
  12. package/payload/commands/validate-template.md +11 -0
  13. package/payload/examples/01-extract-to-csv.md +49 -0
  14. package/payload/examples/02-classify-unknown-pdf.md +102 -0
  15. package/payload/examples/03-diagnose-failed-result.md +68 -0
  16. package/payload/references/classification.md +363 -0
  17. package/payload/references/decision-tree.md +43 -0
  18. package/payload/references/failure-tree.md +169 -0
  19. package/payload/references/pattern-authoring.md +40 -0
  20. package/payload/references/patterns.md +97 -0
  21. package/payload/references/privacy.md +36 -0
  22. package/payload/references/scripts.md +361 -0
  23. package/payload/references/template-reference.md +606 -0
  24. package/payload/references/workflow.md +163 -0
  25. package/payload/scripts/_common.csx +250 -0
  26. package/payload/scripts/classify.csx +53 -0
  27. package/payload/scripts/dry-run.csx +85 -0
  28. package/payload/scripts/evaluate-match.csx +72 -0
  29. package/payload/scripts/execute.csx +89 -0
  30. package/payload/scripts/inspect.csx +43 -0
  31. package/payload/scripts/list-templates.csx +34 -0
  32. package/payload/scripts/load-template.csx +54 -0
  33. package/payload/scripts/save-template.csx +53 -0
  34. package/payload/scripts/schema-info.csx +84 -0
  35. package/payload/scripts/test-groups.csx +44 -0
  36. package/payload/scripts/test-pattern.csx +61 -0
  37. package/payload/scripts/validate-template.csx +54 -0
  38. package/payload/skill/SKILL.md +88 -0
  39. package/payload/skill/assets/lib/Docuoria.dll +0 -0
  40. package/payload/skill/assets/schemas/template-schema.json +413 -0
  41. package/payload/skill/examples/01-extract-to-csv.md +49 -0
  42. package/payload/skill/examples/02-classify-unknown-pdf.md +102 -0
  43. package/payload/skill/examples/03-diagnose-failed-result.md +68 -0
  44. package/payload/skill/references/classification.md +363 -0
  45. package/payload/skill/references/decision-tree.md +43 -0
  46. package/payload/skill/references/failure-tree.md +169 -0
  47. package/payload/skill/references/pattern-authoring.md +40 -0
  48. package/payload/skill/references/patterns.md +97 -0
  49. package/payload/skill/references/privacy.md +36 -0
  50. package/payload/skill/references/scripts.md +361 -0
  51. package/payload/skill/references/template-reference.md +606 -0
  52. package/payload/skill/references/workflow.md +163 -0
  53. package/payload/skill/scripts/_common.csx +250 -0
  54. package/payload/skill/scripts/classify.csx +53 -0
  55. package/payload/skill/scripts/dry-run.csx +85 -0
  56. package/payload/skill/scripts/evaluate-match.csx +72 -0
  57. package/payload/skill/scripts/execute.csx +89 -0
  58. package/payload/skill/scripts/inspect.csx +43 -0
  59. package/payload/skill/scripts/list-templates.csx +34 -0
  60. package/payload/skill/scripts/load-template.csx +54 -0
  61. package/payload/skill/scripts/save-template.csx +53 -0
  62. package/payload/skill/scripts/schema-info.csx +84 -0
  63. package/payload/skill/scripts/test-groups.csx +44 -0
  64. package/payload/skill/scripts/test-pattern.csx +61 -0
  65. package/payload/skill/scripts/validate-template.csx +54 -0
@@ -0,0 +1,413 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://schemas.sidub.net/docuoria/template-schema.json",
4
+ "title": "Sidub Docuoria Template",
5
+ "description": "JSON Schema (Draft 2020-12) for the Sidub Docuoria Template document. Polymorphic nodes carry a `$kind` discriminator whose value matches the corresponding CLR simple type name. This schema constrains structure only; deeper semantic checks (field references, regex validity, record cycles) are enforced by Template.Validate() at runtime.",
6
+ "type": "object",
7
+ "required": ["identifier", "rootMatchRule", "dataModel", "extractionStep", "publishStep"],
8
+ "properties": {
9
+ "identifier": { "type": "string", "minLength": 1 },
10
+ "rootMatchRule": { "$ref": "#/$defs/MatchRuleConfigurationBase" },
11
+ "dataModel": { "$ref": "#/$defs/DataModel" },
12
+ "extractionStep": { "$ref": "#/$defs/ExtractionStep" },
13
+ "intermediateSteps": {
14
+ "type": "array",
15
+ "items": { "type": "object" }
16
+ },
17
+ "publishStep": { "$ref": "#/$defs/PublishStep" }
18
+ },
19
+ "additionalProperties": true,
20
+ "$defs": {
21
+ "MatchRuleConfigurationBase": {
22
+ "oneOf": [
23
+ { "$ref": "#/$defs/FileNameMatchRule" },
24
+ { "$ref": "#/$defs/TextPatternMatchRule" },
25
+ { "$ref": "#/$defs/TextAnchorMatchRule" },
26
+ { "$ref": "#/$defs/MetadataMatchRule" },
27
+ { "$ref": "#/$defs/PageGeometryMatchRule" },
28
+ { "$ref": "#/$defs/TableMatchRule" },
29
+ { "$ref": "#/$defs/CompositeMatchRule" }
30
+ ]
31
+ },
32
+ "FileNameMatchRule": {
33
+ "type": "object",
34
+ "required": ["$kind", "pattern"],
35
+ "properties": {
36
+ "$kind": { "const": "FileNameMatchRule" },
37
+ "pattern": { "type": "string" },
38
+ "mode": { "type": "integer", "minimum": 0, "maximum": 1, "description": "0=Glob, 1=Regex" },
39
+ "caseSensitive": { "type": "boolean" },
40
+ "threshold": { "type": "number", "minimum": 0, "maximum": 1 }
41
+ },
42
+ "additionalProperties": true
43
+ },
44
+ "TextPatternMatchRule": {
45
+ "type": "object",
46
+ "required": ["$kind"],
47
+ "properties": {
48
+ "$kind": { "const": "TextPatternMatchRule" },
49
+ "tokens": { "type": "array", "items": { "type": "string" } },
50
+ "regexPattern": { "type": "string" },
51
+ "mode": { "type": "integer", "minimum": 0, "maximum": 1, "description": "0=AnyToken, 1=AllTokens" },
52
+ "caseSensitive": { "type": "boolean" },
53
+ "pageNumber": { "type": "integer", "minimum": 1 },
54
+ "threshold": { "type": "number", "minimum": 0, "maximum": 1 }
55
+ },
56
+ "additionalProperties": true
57
+ },
58
+ "TextAnchorMatchRule": {
59
+ "type": "object",
60
+ "required": ["$kind"],
61
+ "properties": {
62
+ "$kind": { "const": "TextAnchorMatchRule" },
63
+ "threshold": { "type": "number", "minimum": 0, "maximum": 1 }
64
+ },
65
+ "additionalProperties": true
66
+ },
67
+ "MetadataMatchRule": {
68
+ "type": "object",
69
+ "required": ["$kind"],
70
+ "properties": {
71
+ "$kind": { "const": "MetadataMatchRule" },
72
+ "threshold": { "type": "number", "minimum": 0, "maximum": 1 }
73
+ },
74
+ "additionalProperties": true
75
+ },
76
+ "PageGeometryMatchRule": {
77
+ "type": "object",
78
+ "required": ["$kind"],
79
+ "properties": {
80
+ "$kind": { "const": "PageGeometryMatchRule" },
81
+ "threshold": { "type": "number", "minimum": 0, "maximum": 1 }
82
+ },
83
+ "additionalProperties": true
84
+ },
85
+ "TableMatchRule": {
86
+ "type": "object",
87
+ "required": ["$kind"],
88
+ "properties": {
89
+ "$kind": { "const": "TableMatchRule" },
90
+ "threshold": { "type": "number", "minimum": 0, "maximum": 1 }
91
+ },
92
+ "additionalProperties": true
93
+ },
94
+ "CompositeMatchRule": {
95
+ "type": "object",
96
+ "required": ["$kind", "children"],
97
+ "properties": {
98
+ "$kind": { "const": "CompositeMatchRule" },
99
+ "operator": { "type": "integer", "minimum": 0, "maximum": 2, "description": "0=And, 1=Or, 2=Not" },
100
+ "children": {
101
+ "type": "array",
102
+ "items": { "$ref": "#/$defs/CompositeChildEntry" },
103
+ "minItems": 1
104
+ },
105
+ "threshold": { "type": "number", "minimum": 0, "maximum": 1 }
106
+ },
107
+ "additionalProperties": true
108
+ },
109
+ "CompositeChildEntry": {
110
+ "type": "object",
111
+ "required": ["rule"],
112
+ "properties": {
113
+ "rule": { "$ref": "#/$defs/MatchRuleConfigurationBase" },
114
+ "weight": { "type": "number" }
115
+ },
116
+ "additionalProperties": true
117
+ },
118
+ "DataModel": {
119
+ "type": "object",
120
+ "required": ["schema"],
121
+ "properties": {
122
+ "schema": { "$ref": "#/$defs/RecordDefinition" }
123
+ },
124
+ "additionalProperties": true
125
+ },
126
+ "RecordDefinition": {
127
+ "type": "object",
128
+ "required": ["name", "fields"],
129
+ "properties": {
130
+ "name": { "type": "string", "minLength": 1 },
131
+ "fields": {
132
+ "type": "array",
133
+ "items": { "$ref": "#/$defs/FieldDefinition" }
134
+ }
135
+ },
136
+ "additionalProperties": true
137
+ },
138
+ "FieldDefinition": {
139
+ "oneOf": [
140
+ { "$ref": "#/$defs/PrimitiveFieldDefinition" },
141
+ { "$ref": "#/$defs/RecordFieldDefinition" }
142
+ ]
143
+ },
144
+ "PrimitiveFieldDefinition": {
145
+ "type": "object",
146
+ "required": ["$kind", "name"],
147
+ "properties": {
148
+ "$kind": { "const": "PrimitiveFieldDefinition" },
149
+ "name": { "type": "string", "minLength": 1 },
150
+ "fieldType": { "type": "integer", "minimum": 0, "maximum": 5, "description": "0=String, 1=Number, 2=Integer, 3=Boolean, 4=Date, 5=Timestamp" },
151
+ "isRequired": { "type": "boolean" },
152
+ "isCollection": { "type": "boolean" }
153
+ },
154
+ "additionalProperties": true
155
+ },
156
+ "RecordFieldDefinition": {
157
+ "type": "object",
158
+ "required": ["$kind", "name"],
159
+ "properties": {
160
+ "$kind": { "const": "RecordFieldDefinition" },
161
+ "name": { "type": "string", "minLength": 1 },
162
+ "isRequired": { "type": "boolean" },
163
+ "isCollection": { "type": "boolean" },
164
+ "record": { "$ref": "#/$defs/RecordDefinition" }
165
+ },
166
+ "additionalProperties": true
167
+ },
168
+ "ExtractionStep": {
169
+ "type": "object",
170
+ "properties": {
171
+ "configuration": { "type": "object" },
172
+ "mappings": {
173
+ "type": "array",
174
+ "items": {
175
+ "oneOf": [
176
+ { "$ref": "#/$defs/FieldMapping" },
177
+ { "$ref": "#/$defs/RepeatingFieldMapping" }
178
+ ]
179
+ }
180
+ }
181
+ },
182
+ "additionalProperties": true
183
+ },
184
+ "FieldMapping": {
185
+ "type": "object",
186
+ "required": ["$kind", "fieldName", "fieldType", "source"],
187
+ "properties": {
188
+ "$kind": { "const": "FieldMapping" },
189
+ "fieldName": { "type": "string", "minLength": 1 },
190
+ "fieldType": { "type": "integer", "minimum": 0, "maximum": 5, "description": "0=String, 1=Number, 2=Integer, 3=Boolean, 4=Date, 5=Timestamp" },
191
+ "source": { "$ref": "#/$defs/ExtractionSource" },
192
+ "parseFormat": { "type": ["string", "null"] },
193
+ "cultureName": { "type": ["string", "null"] }
194
+ },
195
+ "additionalProperties": true
196
+ },
197
+ "RepeatingFieldMapping": {
198
+ "type": "object",
199
+ "required": ["$kind", "collectionFieldName", "elementDefinition", "source", "subFields"],
200
+ "properties": {
201
+ "$kind": { "const": "RepeatingFieldMapping" },
202
+ "collectionFieldName": { "type": "string", "minLength": 1 },
203
+ "elementDefinition": { "$ref": "#/$defs/RecordDefinition" },
204
+ "source": { "$ref": "#/$defs/ExtractionSource" },
205
+ "subFields": {
206
+ "type": "array",
207
+ "items": { "$ref": "#/$defs/SubFieldMapping" }
208
+ }
209
+ },
210
+ "additionalProperties": true
211
+ },
212
+ "ExtractionSource": {
213
+ "oneOf": [
214
+ { "$ref": "#/$defs/TextPatternExtractionSource" },
215
+ { "$ref": "#/$defs/TextAnchorExtractionSource" },
216
+ { "$ref": "#/$defs/TableCellExtractionSource" },
217
+ { "$ref": "#/$defs/TableRowsExtractionSource" },
218
+ { "$ref": "#/$defs/MetadataFieldExtractionSource" },
219
+ { "$ref": "#/$defs/FallbackExtractionSource" }
220
+ ]
221
+ },
222
+ "TextPatternExtractionSource": {
223
+ "type": "object",
224
+ "required": ["$kind", "mode"],
225
+ "properties": {
226
+ "$kind": { "const": "TextPatternExtractionSource" },
227
+ "mode": { "type": "string", "enum": ["Token", "Pattern", "AllMatches"] },
228
+ "literalToken": { "type": "string" },
229
+ "regexPattern": { "type": "string" },
230
+ "pageNumber": { "type": "integer", "minimum": 1 },
231
+ "caseSensitive": { "type": "boolean" },
232
+ "blockSeparator": { "type": "string" },
233
+ "startAnchorPattern": { "type": ["string", "null"] },
234
+ "endAnchorPattern": { "type": ["string", "null"] }
235
+ },
236
+ "additionalProperties": true
237
+ },
238
+ "TextAnchorExtractionSource": {
239
+ "type": "object",
240
+ "required": ["$kind", "region"],
241
+ "properties": {
242
+ "$kind": { "const": "TextAnchorExtractionSource" },
243
+ "region": { "$ref": "#/$defs/PdfBounds" },
244
+ "literalToken": { "type": "string" },
245
+ "regexPattern": { "type": "string" },
246
+ "pageNumber": { "type": "integer", "minimum": 1 },
247
+ "caseSensitive": { "type": "boolean" }
248
+ },
249
+ "additionalProperties": true
250
+ },
251
+ "PdfBounds": {
252
+ "type": "object",
253
+ "required": ["left", "top", "width", "height"],
254
+ "properties": {
255
+ "left": { "type": "number" },
256
+ "top": { "type": "number" },
257
+ "width": { "type": "number" },
258
+ "height": { "type": "number" }
259
+ },
260
+ "additionalProperties": true
261
+ },
262
+ "TableCellExtractionSource": {
263
+ "type": "object",
264
+ "required": ["$kind", "rowIndex"],
265
+ "properties": {
266
+ "$kind": { "const": "TableCellExtractionSource" },
267
+ "rowIndex": { "type": "integer", "minimum": 0 },
268
+ "columnIndex": { "type": "integer", "minimum": 0 },
269
+ "headerToken": { "type": "string" },
270
+ "pageNumber": { "type": "integer", "minimum": 1 },
271
+ "tableIndex": { "type": "integer", "minimum": 0 },
272
+ "caseSensitiveHeader": { "type": "boolean" }
273
+ },
274
+ "additionalProperties": true
275
+ },
276
+ "TableRowsExtractionSource": {
277
+ "type": "object",
278
+ "required": ["$kind", "mode"],
279
+ "properties": {
280
+ "$kind": { "const": "TableRowsExtractionSource" },
281
+ "mode": { "type": "string", "enum": ["ByHeader", "Ordinal"] },
282
+ "pageNumber": { "type": "integer", "minimum": 1 },
283
+ "tableIndex": { "type": "integer", "minimum": 0 },
284
+ "headerRowIndex": { "type": "integer", "minimum": 0 },
285
+ "skipRows": { "type": "integer", "minimum": 0 },
286
+ "caseSensitiveHeader": { "type": "boolean" }
287
+ },
288
+ "additionalProperties": true
289
+ },
290
+ "MetadataFieldExtractionSource": {
291
+ "type": "object",
292
+ "required": ["$kind"],
293
+ "properties": {
294
+ "$kind": { "const": "MetadataFieldExtractionSource" },
295
+ "standardField": { "type": "string", "enum": ["Title", "Author", "Subject", "Keywords", "Creator", "Producer", "CreationDate", "ModifiedDate"] },
296
+ "rawKey": { "type": "string" }
297
+ },
298
+ "additionalProperties": true
299
+ },
300
+ "FallbackExtractionSource": {
301
+ "type": "object",
302
+ "required": ["$kind", "primary", "fallback"],
303
+ "properties": {
304
+ "$kind": { "const": "FallbackExtractionSource" },
305
+ "primary": { "$ref": "#/$defs/ExtractionSource" },
306
+ "fallback": { "$ref": "#/$defs/ExtractionSource" }
307
+ },
308
+ "additionalProperties": true
309
+ },
310
+ "SubFieldMapping": {
311
+ "oneOf": [
312
+ { "$ref": "#/$defs/HeaderSubFieldMapping" },
313
+ { "$ref": "#/$defs/OrdinalSubFieldMapping" },
314
+ { "$ref": "#/$defs/RegexGroupSubFieldMapping" },
315
+ { "$ref": "#/$defs/NamedGroupSubFieldMapping" }
316
+ ]
317
+ },
318
+ "HeaderSubFieldMapping": {
319
+ "type": "object",
320
+ "required": ["$kind", "fieldName"],
321
+ "properties": {
322
+ "$kind": { "const": "HeaderSubFieldMapping" },
323
+ "fieldName": { "type": "string", "minLength": 1 },
324
+ "headerToken": { "type": "string" },
325
+ "caseSensitive": { "type": "boolean" }
326
+ },
327
+ "additionalProperties": true
328
+ },
329
+ "OrdinalSubFieldMapping": {
330
+ "type": "object",
331
+ "required": ["$kind", "fieldName"],
332
+ "properties": {
333
+ "$kind": { "const": "OrdinalSubFieldMapping" },
334
+ "fieldName": { "type": "string", "minLength": 1 },
335
+ "columnIndex": { "type": "integer", "minimum": 0 }
336
+ },
337
+ "additionalProperties": true
338
+ },
339
+ "RegexGroupSubFieldMapping": {
340
+ "type": "object",
341
+ "required": ["$kind", "fieldName"],
342
+ "properties": {
343
+ "$kind": { "const": "RegexGroupSubFieldMapping" },
344
+ "fieldName": { "type": "string", "minLength": 1 },
345
+ "groupIndex": { "type": "integer", "minimum": 1 }
346
+ },
347
+ "additionalProperties": true
348
+ },
349
+ "NamedGroupSubFieldMapping": {
350
+ "type": "object",
351
+ "required": ["$kind", "fieldName"],
352
+ "properties": {
353
+ "$kind": { "const": "NamedGroupSubFieldMapping" },
354
+ "fieldName": { "type": "string", "minLength": 1 },
355
+ "groupName": { "type": "string", "minLength": 1 }
356
+ },
357
+ "additionalProperties": true
358
+ },
359
+ "FieldTransform": {
360
+ "oneOf": [
361
+ { "$ref": "#/$defs/TrimTransform" },
362
+ { "$ref": "#/$defs/CastTransform" },
363
+ { "$ref": "#/$defs/FormatTransform" },
364
+ { "$ref": "#/$defs/RenameTransform" },
365
+ { "$ref": "#/$defs/ComputeTransform" },
366
+ { "$ref": "#/$defs/CollectionElementTransform" }
367
+ ]
368
+ },
369
+ "TrimTransform": {
370
+ "type": "object",
371
+ "required": ["$kind"],
372
+ "properties": { "$kind": { "const": "TrimTransform" } },
373
+ "additionalProperties": true
374
+ },
375
+ "CastTransform": {
376
+ "type": "object",
377
+ "required": ["$kind"],
378
+ "properties": { "$kind": { "const": "CastTransform" } },
379
+ "additionalProperties": true
380
+ },
381
+ "FormatTransform": {
382
+ "type": "object",
383
+ "required": ["$kind"],
384
+ "properties": { "$kind": { "const": "FormatTransform" } },
385
+ "additionalProperties": true
386
+ },
387
+ "RenameTransform": {
388
+ "type": "object",
389
+ "required": ["$kind"],
390
+ "properties": { "$kind": { "const": "RenameTransform" } },
391
+ "additionalProperties": true
392
+ },
393
+ "ComputeTransform": {
394
+ "type": "object",
395
+ "required": ["$kind"],
396
+ "properties": { "$kind": { "const": "ComputeTransform" } },
397
+ "additionalProperties": true
398
+ },
399
+ "CollectionElementTransform": {
400
+ "type": "object",
401
+ "required": ["$kind"],
402
+ "properties": { "$kind": { "const": "CollectionElementTransform" } },
403
+ "additionalProperties": true
404
+ },
405
+ "PublishStep": {
406
+ "type": "object",
407
+ "properties": {
408
+ "configuration": { "type": "object" }
409
+ },
410
+ "additionalProperties": true
411
+ }
412
+ }
413
+ }
@@ -0,0 +1,11 @@
1
+ # Docuoria -- Classify
2
+
3
+ Match a PDF against all templates in the store and report which template applies.
4
+
5
+ ## Invocation
6
+
7
+ ```bash
8
+ dotnet script scripts/classify.csx -- --help
9
+ ```
10
+
11
+ Refer to `SKILL.md` for the full argument reference and worked examples.
@@ -0,0 +1,11 @@
1
+ # Docuoria -- Diagnose
2
+
3
+ Run a template end-to-end without producing output; surface extraction warnings and mismatches.
4
+
5
+ ## Invocation
6
+
7
+ ```bash
8
+ dotnet script scripts/dry-run.csx -- --help
9
+ ```
10
+
11
+ Refer to `SKILL.md` for the full argument reference and worked examples.
@@ -0,0 +1,11 @@
1
+ # Docuoria -- Extract
2
+
3
+ Execute a template against a PDF and produce structured output (CSV or JSON).
4
+
5
+ ## Invocation
6
+
7
+ ```bash
8
+ dotnet script scripts/execute.csx -- --help
9
+ ```
10
+
11
+ Refer to `SKILL.md` for the full argument reference and worked examples.
@@ -0,0 +1,11 @@
1
+ # Docuoria -- Inspect
2
+
3
+ Read a PDF and display the text extracted from it so you can author or debug patterns.
4
+
5
+ ## Invocation
6
+
7
+ ```bash
8
+ dotnet script scripts/inspect.csx -- --help
9
+ ```
10
+
11
+ Refer to `SKILL.md` for the full argument reference and worked examples.
@@ -0,0 +1,11 @@
1
+ # Docuoria -- Validate Template
2
+
3
+ Validate a template JSON file against the schema and report structural errors.
4
+
5
+ ## Invocation
6
+
7
+ ```bash
8
+ dotnet script scripts/validate-template.csx -- --help
9
+ ```
10
+
11
+ Refer to `SKILL.md` for the full argument reference and worked examples.
@@ -0,0 +1,49 @@
1
+ # Example 1 — Extract to CSV (known template)
2
+
3
+ ## Scenario
4
+
5
+ The input is a PDF document with a header (a date and a reference number) and a repeating list of line items. A `Template` already exists in the local template store (a `templates/` directory at the workspace root); the goal is to produce a CSV from this PDF without authoring anything new.
6
+
7
+ ## Template excerpt
8
+
9
+ The full template shape for this scenario (`PrimitiveFieldDefinition` for `refNumber` + `invoiceDate`, `RecordFieldDefinition` with `isCollection: true` for `lineItems`, three `FieldMapping`/`RepeatingFieldMapping` extraction mappings) is documented in [`../references/template-reference.md` § Complex template](../references/template-reference.md#complex-template-scalar-fields--repeating-collection). The single mapping that drives the repeating CSV rows looks like:
10
+
11
+ ```json
12
+ {
13
+ "$kind": "RepeatingFieldMapping",
14
+ "collectionFieldName": "lineItems",
15
+ "source": {
16
+ "$kind": "TextPatternExtractionSource",
17
+ "mode": "AllMatches",
18
+ "regexPattern": "(?<description>[A-Za-z ]+?)\\s+\\$(?<amount>[\\d,.]+)"
19
+ },
20
+ "subFields": [
21
+ { "$kind": "NamedGroupSubFieldMapping", "fieldName": "description", "fieldType": 0, "groupName": "description" },
22
+ { "$kind": "NamedGroupSubFieldMapping", "fieldName": "amount", "fieldType": 1, "groupName": "amount" }
23
+ ]
24
+ }
25
+ ```
26
+
27
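+ Note: `fieldType` is an **integer** (0 = String, 1 = Number, 4 = Date). See the template reference for the canonical enum table. The excerpt above is abbreviated: the template schema also requires an `elementDefinition` on every `RepeatingFieldMapping`, so copy the full mapping from the template reference rather than this snippet.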
28
+
29
+ ## Steps
30
+
31
+ 1. `dotnet script scripts/list-templates.csx -- --store-path ./templates` — find the template ID that should apply.
32
+ 2. `dotnet script scripts/load-template.csx -- --id <id> --store-path ./templates` — confirm the template loaded and inspect its fields. Add `--output <path>` to write the JSON to a file.
33
+ 3. `dotnet script scripts/dry-run.csx -- --pdf <pdf> --template <template.json>` — confirm a `DryRunSucceeded` outcome. Inspect the extracted fields and (if needed) the `ExtractionDiagnostics` snapshot before publishing.
34
+ 4. `dotnet script scripts/execute.csx -- --pdf <pdf> --template <template.json> --format csv --output output.csv` — runs the full pipeline through the registered CSV generator. Engine API used: `IDocuoriaEngine.ExecuteTemplateAsync<CsvOutputGenerator, CsvGeneratorOptions>`.
35
+
36
+ ## Expected outcome
37
+
38
+ `ProcessingResult` is `SucceededResult`; `output.csv` exists with rows matching the document's line items, in document order, with the header columns implied by the template's schema.
39
+
40
+ ## If it fails
41
+
42
+ Go to [`../references/failure-tree.md`](../references/failure-tree.md). Map the script's stderr `error.code` to a branch via [§ Stderr error.code → Branch routing](../references/failure-tree.md#stderr-errorcode--branch-routing): `rejected` → Branch A (read `RejectionReason` in `detail`); `failed` → Branch B (read `StepIdentifier` in `detail`). If `dry-run.csx` already returned `DryRunSucceeded` but `execute.csx` then returned `rejected` with `RejectionReason.GeneratorRejected`, the generator is rejecting the shape — Branch A's `GeneratorRejected` row covers the remediation.
43
+
44
+ ## See also
45
+
46
+ - [`../references/template-reference.md` § Complex template](../references/template-reference.md#complex-template-scalar-fields--repeating-collection) — full template JSON shape used in this example.
47
+ - [`../references/workflow.md`](../references/workflow.md) Steps 5–7 — this example starts with the store lookup, then exercises dry-run and execute.
48
+ - [`../references/scripts.md`](../references/scripts.md) — full flag list and output envelope for every script invoked here.
49
+
@@ -0,0 +1,102 @@
1
+ # Example 2 — Classify an unknown PDF
2
+
3
+ ## Scenario
4
+
5
+ A PDF document arrives. You do not know whether any stored template matches it. The templates are in a local store directory (`./templates`). The goal is to route the PDF to a matching template, identify a partial match to refine, or recognise it needs a new template authored.
6
+
7
+ ## How classification works
8
+
9
+ The `classify.csx` script evaluates every stored template's `rootMatchRule` against the PDF and returns the top-N templates ranked by `confidence` — an aggregated score (`ruleConfidence × extractionProbeScore`) that reflects both rule match strength and extraction viability. This gradient lets you distinguish between strong matches, partial matches worth refining, and complete misses.
10
+
11
+ ## Example match rules — weak vs. strong
12
+
13
+ ### Weak — vendor tokens only (poor discrimination)
14
+
15
+ ```json
16
+ {
17
+ "rootMatchRule": {
18
+ "$kind": "TextPatternMatchRule",
19
+ "tokens": ["Microsoft", "Invoice", "Bill To"],
20
+ "mode": 1,
21
+ "threshold": 0.5
22
+ }
23
+ }
24
+ ```
25
+
26
+ Every Microsoft invoice (subscription, Azure, support) contains these tokens. All would score `confidence: 1.0` — no discrimination.
27
+
28
+ ### Strong — composite with discriminator (produces meaningful gradients)
29
+
30
+ ```json
31
+ {
32
+ "rootMatchRule": {
33
+ "$kind": "CompositeMatchRule",
34
+ "operator": 0,
35
+ "threshold": 0.8,
36
+ "children": [
37
+ {
38
+ "rule": {
39
+ "$kind": "TextPatternMatchRule",
40
+ "tokens": ["Microsoft", "Invoice"],
41
+ "mode": 1,
42
+ "threshold": 0.5
43
+ },
44
+ "weight": 1.0
45
+ },
46
+ {
47
+ "rule": {
48
+ "$kind": "TextPatternMatchRule",
49
+ "tokens": ["Subscription", "License Qty", "Seats"],
50
+ "mode": 0,
51
+ "threshold": 0.6
52
+ },
53
+ "weight": 2.0
54
+ }
55
+ ]
56
+ }
57
+ }
58
+ ```
59
+
60
+ The first child is a broad gate (must be a Microsoft Invoice). The second child — weighted at 2.0 — is the **discriminator** that targets subscription invoices specifically. A subscription invoice scores `confidence: 1.0`. An Azure usage invoice scores `confidence: 0.33` (discriminator fails: `(1.0×1 + 0.0×2) / 3`). The gradient clearly separates the two.
61
+
62
+ ## Steps
63
+
64
+ 1. **Classify:** `dotnet script scripts/classify.csx -- --pdf <pdf> --store-path ./templates`
65
+
66
+ Example output:
67
+ ```json
68
+ {
69
+ "matches": [
70
+ { "templateId": "ms-subscription-invoice", "confidence": 0.92 },
71
+ { "templateId": "ms-azure-invoice", "confidence": 0.35 },
72
+ { "templateId": "generic-invoice", "confidence": 0.12 }
73
+ ]
74
+ }
75
+ ```
76
+
77
+ 2. **Interpret the results:** map the top match's `confidence` to an action using the canonical gradient table in [`../references/classification.md` § Interpreting the gradient](../references/classification.md#interpreting-the-gradient). An empty `matches` array means no templates are stored — author from scratch.
78
+
79
+ 3. **For a strong match** — verify correctness:
80
+ - Run `dotnet script scripts/dry-run.csx -- --pdf <pdf> --template <matched-id-or-path>`
81
+ - If extraction produces expected data → done.
82
+ - If extraction produces empty collections or nonsensical data → **misclassification**. See [`../references/failure-tree.md`](../references/failure-tree.md) Branch C.
83
+
84
+ 4. **For a partial match** — try extraction, then iterate:
85
+ - Run dry-run with the top-scoring template. Even if some fields are incomplete, others may transfer.
86
+ - Load the template (`load-template.csx -- --id <matched-id> --store-path ./templates`), adjust the match rules and extraction sources for the new document type.
87
+ - Validate the updated template with positive + negative testing before storing.
88
+
89
+ 5. **When authoring a new template** — validate classification before storing:
90
+ - **Positive:** `dotnet script scripts/evaluate-match.csx -- --pdf <target.pdf> --template <new-template.json>` → high `confidence`.
91
+ - **Negative:** repeat with PDFs from the same vendor that should NOT match → low `confidence`.
92
+ - **Ranked:** `dotnet script scripts/classify.csx -- --pdf <target.pdf> --store-path ./templates` → new template must rank #1 with a clear gap over siblings.
93
+
94
+ ## Expected outcome
95
+
96
+ One of three outcomes: a high-confidence match → proceed to extraction; a partial match → refine an existing template; or no meaningful match → author a new template, with clear diagnostic insight into why existing templates scored low.
97
+
98
+ ## See also
99
+
100
+ - [`../references/classification.md`](../references/classification.md) — full guide to designing discriminating match rules and interpreting the confidence gradient.
101
+ - [`../references/workflow.md`](../references/workflow.md) Step 1 — classify is the entry point; confidence routing determines the next step.
102
+ - [`../references/failure-tree.md`](../references/failure-tree.md) Branch C — diagnosing classification issues with ranked output.