@sidub-inc/docuoria.cli 1.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1056 -0
- package/package.json +56 -0
- package/payload/.claude-plugin/plugin.json +21 -0
- package/payload/MANIFEST.json +322 -0
- package/payload/SKILL.md +88 -0
- package/payload/assets/lib/Docuoria.dll +0 -0
- package/payload/assets/schemas/template-schema.json +413 -0
- package/payload/commands/classify.md +11 -0
- package/payload/commands/diagnose.md +11 -0
- package/payload/commands/extract.md +11 -0
- package/payload/commands/inspect.md +11 -0
- package/payload/commands/validate-template.md +11 -0
- package/payload/examples/01-extract-to-csv.md +49 -0
- package/payload/examples/02-classify-unknown-pdf.md +102 -0
- package/payload/examples/03-diagnose-failed-result.md +68 -0
- package/payload/references/classification.md +363 -0
- package/payload/references/decision-tree.md +43 -0
- package/payload/references/failure-tree.md +169 -0
- package/payload/references/pattern-authoring.md +40 -0
- package/payload/references/patterns.md +97 -0
- package/payload/references/privacy.md +36 -0
- package/payload/references/scripts.md +361 -0
- package/payload/references/template-reference.md +606 -0
- package/payload/references/workflow.md +163 -0
- package/payload/scripts/_common.csx +250 -0
- package/payload/scripts/classify.csx +53 -0
- package/payload/scripts/dry-run.csx +85 -0
- package/payload/scripts/evaluate-match.csx +72 -0
- package/payload/scripts/execute.csx +89 -0
- package/payload/scripts/inspect.csx +43 -0
- package/payload/scripts/list-templates.csx +34 -0
- package/payload/scripts/load-template.csx +54 -0
- package/payload/scripts/save-template.csx +53 -0
- package/payload/scripts/schema-info.csx +84 -0
- package/payload/scripts/test-groups.csx +44 -0
- package/payload/scripts/test-pattern.csx +61 -0
- package/payload/scripts/validate-template.csx +54 -0
- package/payload/skill/SKILL.md +88 -0
- package/payload/skill/assets/lib/Docuoria.dll +0 -0
- package/payload/skill/assets/schemas/template-schema.json +413 -0
- package/payload/skill/examples/01-extract-to-csv.md +49 -0
- package/payload/skill/examples/02-classify-unknown-pdf.md +102 -0
- package/payload/skill/examples/03-diagnose-failed-result.md +68 -0
- package/payload/skill/references/classification.md +363 -0
- package/payload/skill/references/decision-tree.md +43 -0
- package/payload/skill/references/failure-tree.md +169 -0
- package/payload/skill/references/pattern-authoring.md +40 -0
- package/payload/skill/references/patterns.md +97 -0
- package/payload/skill/references/privacy.md +36 -0
- package/payload/skill/references/scripts.md +361 -0
- package/payload/skill/references/template-reference.md +606 -0
- package/payload/skill/references/workflow.md +163 -0
- package/payload/skill/scripts/_common.csx +250 -0
- package/payload/skill/scripts/classify.csx +53 -0
- package/payload/skill/scripts/dry-run.csx +85 -0
- package/payload/skill/scripts/evaluate-match.csx +72 -0
- package/payload/skill/scripts/execute.csx +89 -0
- package/payload/skill/scripts/inspect.csx +43 -0
- package/payload/skill/scripts/list-templates.csx +34 -0
- package/payload/skill/scripts/load-template.csx +54 -0
- package/payload/skill/scripts/save-template.csx +53 -0
- package/payload/skill/scripts/schema-info.csx +84 -0
- package/payload/skill/scripts/test-groups.csx +44 -0
- package/payload/skill/scripts/test-pattern.csx +61 -0
- package/payload/skill/scripts/validate-template.csx +54 -0
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://schemas.sidub.net/docuoria/template-schema.json",
|
|
4
|
+
"title": "Sidub PDF Pipeline Template",
|
|
5
|
+
"description": "JSON Schema (Draft 2020-12) for the Sidub Docuoria Template document. Polymorphic nodes carry a `$kind` discriminator whose value matches the corresponding CLR simple type name. This schema constrains structure only; deeper semantic checks (field references, regex validity, record cycles) are enforced by Template.Validate() at runtime.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"required": ["identifier", "rootMatchRule", "dataModel", "extractionStep", "publishStep"],
|
|
8
|
+
"properties": {
|
|
9
|
+
"identifier": { "type": "string", "minLength": 1 },
|
|
10
|
+
"rootMatchRule": { "$ref": "#/$defs/MatchRuleConfigurationBase" },
|
|
11
|
+
"dataModel": { "$ref": "#/$defs/DataModel" },
|
|
12
|
+
"extractionStep": { "$ref": "#/$defs/ExtractionStep" },
|
|
13
|
+
"intermediateSteps": {
|
|
14
|
+
"type": "array",
|
|
15
|
+
"items": { "type": "object" }
|
|
16
|
+
},
|
|
17
|
+
"publishStep": { "$ref": "#/$defs/PublishStep" }
|
|
18
|
+
},
|
|
19
|
+
"additionalProperties": true,
|
|
20
|
+
"$defs": {
|
|
21
|
+
"MatchRuleConfigurationBase": {
|
|
22
|
+
"oneOf": [
|
|
23
|
+
{ "$ref": "#/$defs/FileNameMatchRule" },
|
|
24
|
+
{ "$ref": "#/$defs/TextPatternMatchRule" },
|
|
25
|
+
{ "$ref": "#/$defs/TextAnchorMatchRule" },
|
|
26
|
+
{ "$ref": "#/$defs/MetadataMatchRule" },
|
|
27
|
+
{ "$ref": "#/$defs/PageGeometryMatchRule" },
|
|
28
|
+
{ "$ref": "#/$defs/TableMatchRule" },
|
|
29
|
+
{ "$ref": "#/$defs/CompositeMatchRule" }
|
|
30
|
+
]
|
|
31
|
+
},
|
|
32
|
+
"FileNameMatchRule": {
|
|
33
|
+
"type": "object",
|
|
34
|
+
"required": ["$kind", "pattern"],
|
|
35
|
+
"properties": {
|
|
36
|
+
"$kind": { "const": "FileNameMatchRule" },
|
|
37
|
+
"pattern": { "type": "string" },
|
|
38
|
+
"mode": { "type": "integer", "minimum": 0, "maximum": 1, "description": "0=Glob, 1=Regex" },
|
|
39
|
+
"caseSensitive": { "type": "boolean" },
|
|
40
|
+
"threshold": { "type": "number", "minimum": 0, "maximum": 1 }
|
|
41
|
+
},
|
|
42
|
+
"additionalProperties": true
|
|
43
|
+
},
|
|
44
|
+
"TextPatternMatchRule": {
|
|
45
|
+
"type": "object",
|
|
46
|
+
"required": ["$kind"],
|
|
47
|
+
"properties": {
|
|
48
|
+
"$kind": { "const": "TextPatternMatchRule" },
|
|
49
|
+
"tokens": { "type": "array", "items": { "type": "string" } },
|
|
50
|
+
"regexPattern": { "type": "string" },
|
|
51
|
+
"mode": { "type": "integer", "minimum": 0, "maximum": 1, "description": "0=AnyToken, 1=AllTokens" },
|
|
52
|
+
"caseSensitive": { "type": "boolean" },
|
|
53
|
+
"pageNumber": { "type": "integer", "minimum": 1 },
|
|
54
|
+
"threshold": { "type": "number", "minimum": 0, "maximum": 1 }
|
|
55
|
+
},
|
|
56
|
+
"additionalProperties": true
|
|
57
|
+
},
|
|
58
|
+
"TextAnchorMatchRule": {
|
|
59
|
+
"type": "object",
|
|
60
|
+
"required": ["$kind"],
|
|
61
|
+
"properties": {
|
|
62
|
+
"$kind": { "const": "TextAnchorMatchRule" },
|
|
63
|
+
"threshold": { "type": "number", "minimum": 0, "maximum": 1 }
|
|
64
|
+
},
|
|
65
|
+
"additionalProperties": true
|
|
66
|
+
},
|
|
67
|
+
"MetadataMatchRule": {
|
|
68
|
+
"type": "object",
|
|
69
|
+
"required": ["$kind"],
|
|
70
|
+
"properties": {
|
|
71
|
+
"$kind": { "const": "MetadataMatchRule" },
|
|
72
|
+
"threshold": { "type": "number", "minimum": 0, "maximum": 1 }
|
|
73
|
+
},
|
|
74
|
+
"additionalProperties": true
|
|
75
|
+
},
|
|
76
|
+
"PageGeometryMatchRule": {
|
|
77
|
+
"type": "object",
|
|
78
|
+
"required": ["$kind"],
|
|
79
|
+
"properties": {
|
|
80
|
+
"$kind": { "const": "PageGeometryMatchRule" },
|
|
81
|
+
"threshold": { "type": "number", "minimum": 0, "maximum": 1 }
|
|
82
|
+
},
|
|
83
|
+
"additionalProperties": true
|
|
84
|
+
},
|
|
85
|
+
"TableMatchRule": {
|
|
86
|
+
"type": "object",
|
|
87
|
+
"required": ["$kind"],
|
|
88
|
+
"properties": {
|
|
89
|
+
"$kind": { "const": "TableMatchRule" },
|
|
90
|
+
"threshold": { "type": "number", "minimum": 0, "maximum": 1 }
|
|
91
|
+
},
|
|
92
|
+
"additionalProperties": true
|
|
93
|
+
},
|
|
94
|
+
"CompositeMatchRule": {
|
|
95
|
+
"type": "object",
|
|
96
|
+
"required": ["$kind", "children"],
|
|
97
|
+
"properties": {
|
|
98
|
+
"$kind": { "const": "CompositeMatchRule" },
|
|
99
|
+
"operator": { "type": "integer", "minimum": 0, "maximum": 2, "description": "0=And, 1=Or, 2=Not" },
|
|
100
|
+
"children": {
|
|
101
|
+
"type": "array",
|
|
102
|
+
"items": { "$ref": "#/$defs/CompositeChildEntry" },
|
|
103
|
+
"minItems": 1
|
|
104
|
+
},
|
|
105
|
+
"threshold": { "type": "number", "minimum": 0, "maximum": 1 }
|
|
106
|
+
},
|
|
107
|
+
"additionalProperties": true
|
|
108
|
+
},
|
|
109
|
+
"CompositeChildEntry": {
|
|
110
|
+
"type": "object",
|
|
111
|
+
"required": ["rule"],
|
|
112
|
+
"properties": {
|
|
113
|
+
"rule": { "$ref": "#/$defs/MatchRuleConfigurationBase" },
|
|
114
|
+
"weight": { "type": "number" }
|
|
115
|
+
},
|
|
116
|
+
"additionalProperties": true
|
|
117
|
+
},
|
|
118
|
+
"DataModel": {
|
|
119
|
+
"type": "object",
|
|
120
|
+
"required": ["schema"],
|
|
121
|
+
"properties": {
|
|
122
|
+
"schema": { "$ref": "#/$defs/RecordDefinition" }
|
|
123
|
+
},
|
|
124
|
+
"additionalProperties": true
|
|
125
|
+
},
|
|
126
|
+
"RecordDefinition": {
|
|
127
|
+
"type": "object",
|
|
128
|
+
"required": ["name", "fields"],
|
|
129
|
+
"properties": {
|
|
130
|
+
"name": { "type": "string", "minLength": 1 },
|
|
131
|
+
"fields": {
|
|
132
|
+
"type": "array",
|
|
133
|
+
"items": { "$ref": "#/$defs/FieldDefinition" }
|
|
134
|
+
}
|
|
135
|
+
},
|
|
136
|
+
"additionalProperties": true
|
|
137
|
+
},
|
|
138
|
+
"FieldDefinition": {
|
|
139
|
+
"oneOf": [
|
|
140
|
+
{ "$ref": "#/$defs/PrimitiveFieldDefinition" },
|
|
141
|
+
{ "$ref": "#/$defs/RecordFieldDefinition" }
|
|
142
|
+
]
|
|
143
|
+
},
|
|
144
|
+
"PrimitiveFieldDefinition": {
|
|
145
|
+
"type": "object",
|
|
146
|
+
"required": ["$kind", "name"],
|
|
147
|
+
"properties": {
|
|
148
|
+
"$kind": { "const": "PrimitiveFieldDefinition" },
|
|
149
|
+
"name": { "type": "string", "minLength": 1 },
|
|
150
|
+
"fieldType": { "type": "integer", "minimum": 0, "maximum": 5, "description": "0=String, 1=Number, 2=Integer, 3=Boolean, 4=Date, 5=Timestamp" },
|
|
151
|
+
"isRequired": { "type": "boolean" },
|
|
152
|
+
"isCollection": { "type": "boolean" }
|
|
153
|
+
},
|
|
154
|
+
"additionalProperties": true
|
|
155
|
+
},
|
|
156
|
+
"RecordFieldDefinition": {
|
|
157
|
+
"type": "object",
|
|
158
|
+
"required": ["$kind", "name"],
|
|
159
|
+
"properties": {
|
|
160
|
+
"$kind": { "const": "RecordFieldDefinition" },
|
|
161
|
+
"name": { "type": "string", "minLength": 1 },
|
|
162
|
+
"isRequired": { "type": "boolean" },
|
|
163
|
+
"isCollection": { "type": "boolean" },
|
|
164
|
+
"record": { "$ref": "#/$defs/RecordDefinition" }
|
|
165
|
+
},
|
|
166
|
+
"additionalProperties": true
|
|
167
|
+
},
|
|
168
|
+
"ExtractionStep": {
|
|
169
|
+
"type": "object",
|
|
170
|
+
"properties": {
|
|
171
|
+
"configuration": { "type": "object" },
|
|
172
|
+
"mappings": {
|
|
173
|
+
"type": "array",
|
|
174
|
+
"items": {
|
|
175
|
+
"oneOf": [
|
|
176
|
+
{ "$ref": "#/$defs/FieldMapping" },
|
|
177
|
+
{ "$ref": "#/$defs/RepeatingFieldMapping" }
|
|
178
|
+
]
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
},
|
|
182
|
+
"additionalProperties": true
|
|
183
|
+
},
|
|
184
|
+
"FieldMapping": {
|
|
185
|
+
"type": "object",
|
|
186
|
+
"required": ["$kind", "fieldName", "fieldType", "source"],
|
|
187
|
+
"properties": {
|
|
188
|
+
"$kind": { "const": "FieldMapping" },
|
|
189
|
+
"fieldName": { "type": "string", "minLength": 1 },
|
|
190
|
+
"fieldType": { "type": "integer", "minimum": 0, "maximum": 5, "description": "0=String, 1=Number, 2=Integer, 3=Boolean, 4=Date, 5=Timestamp" },
|
|
191
|
+
"source": { "$ref": "#/$defs/ExtractionSource" },
|
|
192
|
+
"parseFormat": { "type": ["string", "null"] },
|
|
193
|
+
"cultureName": { "type": ["string", "null"] }
|
|
194
|
+
},
|
|
195
|
+
"additionalProperties": true
|
|
196
|
+
},
|
|
197
|
+
"RepeatingFieldMapping": {
|
|
198
|
+
"type": "object",
|
|
199
|
+
"required": ["$kind", "collectionFieldName", "elementDefinition", "source", "subFields"],
|
|
200
|
+
"properties": {
|
|
201
|
+
"$kind": { "const": "RepeatingFieldMapping" },
|
|
202
|
+
"collectionFieldName": { "type": "string", "minLength": 1 },
|
|
203
|
+
"elementDefinition": { "$ref": "#/$defs/RecordDefinition" },
|
|
204
|
+
"source": { "$ref": "#/$defs/ExtractionSource" },
|
|
205
|
+
"subFields": {
|
|
206
|
+
"type": "array",
|
|
207
|
+
"items": { "$ref": "#/$defs/SubFieldMapping" }
|
|
208
|
+
}
|
|
209
|
+
},
|
|
210
|
+
"additionalProperties": true
|
|
211
|
+
},
|
|
212
|
+
"ExtractionSource": {
|
|
213
|
+
"oneOf": [
|
|
214
|
+
{ "$ref": "#/$defs/TextPatternExtractionSource" },
|
|
215
|
+
{ "$ref": "#/$defs/TextAnchorExtractionSource" },
|
|
216
|
+
{ "$ref": "#/$defs/TableCellExtractionSource" },
|
|
217
|
+
{ "$ref": "#/$defs/TableRowsExtractionSource" },
|
|
218
|
+
{ "$ref": "#/$defs/MetadataFieldExtractionSource" },
|
|
219
|
+
{ "$ref": "#/$defs/FallbackExtractionSource" }
|
|
220
|
+
]
|
|
221
|
+
},
|
|
222
|
+
"TextPatternExtractionSource": {
|
|
223
|
+
"type": "object",
|
|
224
|
+
"required": ["$kind", "mode"],
|
|
225
|
+
"properties": {
|
|
226
|
+
"$kind": { "const": "TextPatternExtractionSource" },
|
|
227
|
+
"mode": { "type": "string", "enum": ["Token", "Pattern", "AllMatches"] },
|
|
228
|
+
"literalToken": { "type": "string" },
|
|
229
|
+
"regexPattern": { "type": "string" },
|
|
230
|
+
"pageNumber": { "type": "integer", "minimum": 1 },
|
|
231
|
+
"caseSensitive": { "type": "boolean" },
|
|
232
|
+
"blockSeparator": { "type": "string" },
|
|
233
|
+
"startAnchorPattern": { "type": ["string", "null"] },
|
|
234
|
+
"endAnchorPattern": { "type": ["string", "null"] }
|
|
235
|
+
},
|
|
236
|
+
"additionalProperties": true
|
|
237
|
+
},
|
|
238
|
+
"TextAnchorExtractionSource": {
|
|
239
|
+
"type": "object",
|
|
240
|
+
"required": ["$kind", "region"],
|
|
241
|
+
"properties": {
|
|
242
|
+
"$kind": { "const": "TextAnchorExtractionSource" },
|
|
243
|
+
"region": { "$ref": "#/$defs/PdfBounds" },
|
|
244
|
+
"literalToken": { "type": "string" },
|
|
245
|
+
"regexPattern": { "type": "string" },
|
|
246
|
+
"pageNumber": { "type": "integer", "minimum": 1 },
|
|
247
|
+
"caseSensitive": { "type": "boolean" }
|
|
248
|
+
},
|
|
249
|
+
"additionalProperties": true
|
|
250
|
+
},
|
|
251
|
+
"PdfBounds": {
|
|
252
|
+
"type": "object",
|
|
253
|
+
"required": ["left", "top", "width", "height"],
|
|
254
|
+
"properties": {
|
|
255
|
+
"left": { "type": "number" },
|
|
256
|
+
"top": { "type": "number" },
|
|
257
|
+
"width": { "type": "number" },
|
|
258
|
+
"height": { "type": "number" }
|
|
259
|
+
},
|
|
260
|
+
"additionalProperties": true
|
|
261
|
+
},
|
|
262
|
+
"TableCellExtractionSource": {
|
|
263
|
+
"type": "object",
|
|
264
|
+
"required": ["$kind", "rowIndex"],
|
|
265
|
+
"properties": {
|
|
266
|
+
"$kind": { "const": "TableCellExtractionSource" },
|
|
267
|
+
"rowIndex": { "type": "integer", "minimum": 0 },
|
|
268
|
+
"columnIndex": { "type": "integer", "minimum": 0 },
|
|
269
|
+
"headerToken": { "type": "string" },
|
|
270
|
+
"pageNumber": { "type": "integer", "minimum": 1 },
|
|
271
|
+
"tableIndex": { "type": "integer", "minimum": 0 },
|
|
272
|
+
"caseSensitiveHeader": { "type": "boolean" }
|
|
273
|
+
},
|
|
274
|
+
"additionalProperties": true
|
|
275
|
+
},
|
|
276
|
+
"TableRowsExtractionSource": {
|
|
277
|
+
"type": "object",
|
|
278
|
+
"required": ["$kind", "mode"],
|
|
279
|
+
"properties": {
|
|
280
|
+
"$kind": { "const": "TableRowsExtractionSource" },
|
|
281
|
+
"mode": { "type": "string", "enum": ["ByHeader", "Ordinal"] },
|
|
282
|
+
"pageNumber": { "type": "integer", "minimum": 1 },
|
|
283
|
+
"tableIndex": { "type": "integer", "minimum": 0 },
|
|
284
|
+
"headerRowIndex": { "type": "integer", "minimum": 0 },
|
|
285
|
+
"skipRows": { "type": "integer", "minimum": 0 },
|
|
286
|
+
"caseSensitiveHeader": { "type": "boolean" }
|
|
287
|
+
},
|
|
288
|
+
"additionalProperties": true
|
|
289
|
+
},
|
|
290
|
+
"MetadataFieldExtractionSource": {
|
|
291
|
+
"type": "object",
|
|
292
|
+
"required": ["$kind"],
|
|
293
|
+
"properties": {
|
|
294
|
+
"$kind": { "const": "MetadataFieldExtractionSource" },
|
|
295
|
+
"standardField": { "type": "string", "enum": ["Title", "Author", "Subject", "Keywords", "Creator", "Producer", "CreationDate", "ModifiedDate"] },
|
|
296
|
+
"rawKey": { "type": "string" }
|
|
297
|
+
},
|
|
298
|
+
"additionalProperties": true
|
|
299
|
+
},
|
|
300
|
+
"FallbackExtractionSource": {
|
|
301
|
+
"type": "object",
|
|
302
|
+
"required": ["$kind", "primary", "fallback"],
|
|
303
|
+
"properties": {
|
|
304
|
+
"$kind": { "const": "FallbackExtractionSource" },
|
|
305
|
+
"primary": { "$ref": "#/$defs/ExtractionSource" },
|
|
306
|
+
"fallback": { "$ref": "#/$defs/ExtractionSource" }
|
|
307
|
+
},
|
|
308
|
+
"additionalProperties": true
|
|
309
|
+
},
|
|
310
|
+
"SubFieldMapping": {
|
|
311
|
+
"oneOf": [
|
|
312
|
+
{ "$ref": "#/$defs/HeaderSubFieldMapping" },
|
|
313
|
+
{ "$ref": "#/$defs/OrdinalSubFieldMapping" },
|
|
314
|
+
{ "$ref": "#/$defs/RegexGroupSubFieldMapping" },
|
|
315
|
+
{ "$ref": "#/$defs/NamedGroupSubFieldMapping" }
|
|
316
|
+
]
|
|
317
|
+
},
|
|
318
|
+
"HeaderSubFieldMapping": {
|
|
319
|
+
"type": "object",
|
|
320
|
+
"required": ["$kind", "fieldName"],
|
|
321
|
+
"properties": {
|
|
322
|
+
"$kind": { "const": "HeaderSubFieldMapping" },
|
|
323
|
+
"fieldName": { "type": "string", "minLength": 1 },
|
|
324
|
+
"headerToken": { "type": "string" },
|
|
325
|
+
"caseSensitive": { "type": "boolean" }
|
|
326
|
+
},
|
|
327
|
+
"additionalProperties": true
|
|
328
|
+
},
|
|
329
|
+
"OrdinalSubFieldMapping": {
|
|
330
|
+
"type": "object",
|
|
331
|
+
"required": ["$kind", "fieldName"],
|
|
332
|
+
"properties": {
|
|
333
|
+
"$kind": { "const": "OrdinalSubFieldMapping" },
|
|
334
|
+
"fieldName": { "type": "string", "minLength": 1 },
|
|
335
|
+
"columnIndex": { "type": "integer", "minimum": 0 }
|
|
336
|
+
},
|
|
337
|
+
"additionalProperties": true
|
|
338
|
+
},
|
|
339
|
+
"RegexGroupSubFieldMapping": {
|
|
340
|
+
"type": "object",
|
|
341
|
+
"required": ["$kind", "fieldName"],
|
|
342
|
+
"properties": {
|
|
343
|
+
"$kind": { "const": "RegexGroupSubFieldMapping" },
|
|
344
|
+
"fieldName": { "type": "string", "minLength": 1 },
|
|
345
|
+
"groupIndex": { "type": "integer", "minimum": 1 }
|
|
346
|
+
},
|
|
347
|
+
"additionalProperties": true
|
|
348
|
+
},
|
|
349
|
+
"NamedGroupSubFieldMapping": {
|
|
350
|
+
"type": "object",
|
|
351
|
+
"required": ["$kind", "fieldName"],
|
|
352
|
+
"properties": {
|
|
353
|
+
"$kind": { "const": "NamedGroupSubFieldMapping" },
|
|
354
|
+
"fieldName": { "type": "string", "minLength": 1 },
|
|
355
|
+
"groupName": { "type": "string", "minLength": 1 }
|
|
356
|
+
},
|
|
357
|
+
"additionalProperties": true
|
|
358
|
+
},
|
|
359
|
+
"FieldTransform": {
|
|
360
|
+
"oneOf": [
|
|
361
|
+
{ "$ref": "#/$defs/TrimTransform" },
|
|
362
|
+
{ "$ref": "#/$defs/CastTransform" },
|
|
363
|
+
{ "$ref": "#/$defs/FormatTransform" },
|
|
364
|
+
{ "$ref": "#/$defs/RenameTransform" },
|
|
365
|
+
{ "$ref": "#/$defs/ComputeTransform" },
|
|
366
|
+
{ "$ref": "#/$defs/CollectionElementTransform" }
|
|
367
|
+
]
|
|
368
|
+
},
|
|
369
|
+
"TrimTransform": {
|
|
370
|
+
"type": "object",
|
|
371
|
+
"required": ["$kind"],
|
|
372
|
+
"properties": { "$kind": { "const": "TrimTransform" } },
|
|
373
|
+
"additionalProperties": true
|
|
374
|
+
},
|
|
375
|
+
"CastTransform": {
|
|
376
|
+
"type": "object",
|
|
377
|
+
"required": ["$kind"],
|
|
378
|
+
"properties": { "$kind": { "const": "CastTransform" } },
|
|
379
|
+
"additionalProperties": true
|
|
380
|
+
},
|
|
381
|
+
"FormatTransform": {
|
|
382
|
+
"type": "object",
|
|
383
|
+
"required": ["$kind"],
|
|
384
|
+
"properties": { "$kind": { "const": "FormatTransform" } },
|
|
385
|
+
"additionalProperties": true
|
|
386
|
+
},
|
|
387
|
+
"RenameTransform": {
|
|
388
|
+
"type": "object",
|
|
389
|
+
"required": ["$kind"],
|
|
390
|
+
"properties": { "$kind": { "const": "RenameTransform" } },
|
|
391
|
+
"additionalProperties": true
|
|
392
|
+
},
|
|
393
|
+
"ComputeTransform": {
|
|
394
|
+
"type": "object",
|
|
395
|
+
"required": ["$kind"],
|
|
396
|
+
"properties": { "$kind": { "const": "ComputeTransform" } },
|
|
397
|
+
"additionalProperties": true
|
|
398
|
+
},
|
|
399
|
+
"CollectionElementTransform": {
|
|
400
|
+
"type": "object",
|
|
401
|
+
"required": ["$kind"],
|
|
402
|
+
"properties": { "$kind": { "const": "CollectionElementTransform" } },
|
|
403
|
+
"additionalProperties": true
|
|
404
|
+
},
|
|
405
|
+
"PublishStep": {
|
|
406
|
+
"type": "object",
|
|
407
|
+
"properties": {
|
|
408
|
+
"configuration": { "type": "object" }
|
|
409
|
+
},
|
|
410
|
+
"additionalProperties": true
|
|
411
|
+
}
|
|
412
|
+
}
|
|
413
|
+
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Docuoria -- Classify
|
|
2
|
+
|
|
3
|
+
Match a PDF against all templates in the store and report which template applies.
|
|
4
|
+
|
|
5
|
+
## Invocation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
dotnet script scripts/classify.csx -- --help
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Refer to `SKILL.md` for the full argument reference and worked examples.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Docuoria -- Diagnose
|
|
2
|
+
|
|
3
|
+
Run a template end-to-end without producing output; surface extraction warnings and mismatches.
|
|
4
|
+
|
|
5
|
+
## Invocation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
dotnet script scripts/dry-run.csx -- --help
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Refer to `SKILL.md` for the full argument reference and worked examples.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Docuoria -- Extract
|
|
2
|
+
|
|
3
|
+
Execute a template against a PDF and produce structured output (CSV or JSON).
|
|
4
|
+
|
|
5
|
+
## Invocation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
dotnet script scripts/execute.csx -- --help
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Refer to `SKILL.md` for the full argument reference and worked examples.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Docuoria -- Inspect
|
|
2
|
+
|
|
3
|
+
Read and display the text extracted from a PDF so you can author or debug patterns.
|
|
4
|
+
|
|
5
|
+
## Invocation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
dotnet script scripts/inspect.csx -- --help
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Refer to `SKILL.md` for the full argument reference and worked examples.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Docuoria -- Validate Template
|
|
2
|
+
|
|
3
|
+
Validate a template JSON file against the schema and report structural errors.
|
|
4
|
+
|
|
5
|
+
## Invocation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
dotnet script scripts/validate-template.csx -- --help
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Refer to `SKILL.md` for the full argument reference and worked examples.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Example 1 — Extract to CSV (known template)
|
|
2
|
+
|
|
3
|
+
## Scenario
|
|
4
|
+
|
|
5
|
+
The input is a PDF document with a header (a date and a reference number) and a repeating list of line items. A `Template` already exists in the local template store (a `templates/` directory at the workspace root); the goal is to produce a CSV from this PDF without authoring anything new.
|
|
6
|
+
|
|
7
|
+
## Template excerpt
|
|
8
|
+
|
|
9
|
+
The full template shape for this scenario (`PrimitiveFieldDefinition` for `refNumber` + `invoiceDate`, `RecordFieldDefinition` with `isCollection: true` for `lineItems`, three `FieldMapping`/`RepeatingFieldMapping` extraction mappings) is documented in [`../references/template-reference.md` § Complex template](../references/template-reference.md#complex-template-scalar-fields--repeating-collection). The single mapping that drives the repeating CSV rows looks like:
|
|
10
|
+
|
|
11
|
+
```json
|
|
12
|
+
{
|
|
13
|
+
"$kind": "RepeatingFieldMapping",
|
|
14
|
+
"collectionFieldName": "lineItems",
|
|
15
|
+
"source": {
|
|
16
|
+
"$kind": "TextPatternExtractionSource",
|
|
17
|
+
"mode": "AllMatches",
|
|
18
|
+
"regexPattern": "(?<description>[A-Za-z ]+?)\\s+\\$(?<amount>[\\d,.]+)"
|
|
19
|
+
},
|
|
20
|
+
"subFields": [
|
|
21
|
+
{ "$kind": "NamedGroupSubFieldMapping", "fieldName": "description", "fieldType": 0, "groupName": "description" },
|
|
22
|
+
{ "$kind": "NamedGroupSubFieldMapping", "fieldName": "amount", "fieldType": 1, "groupName": "amount" }
|
|
23
|
+
]
|
|
24
|
+
}
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
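Note: `fieldType` is an **integer** (0 = String, 1 = Number, 4 = Date). See the template reference for the canonical enum table. This excerpt omits the `elementDefinition` property, which the template schema requires on every `RepeatingFieldMapping`; a complete mapping must include it to pass schema validation.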
|
|
28
|
+
|
|
29
|
+
## Steps
|
|
30
|
+
|
|
31
|
+
1. `dotnet script scripts/list-templates.csx -- --store-path ./templates` — find the template ID that should apply.
|
|
32
|
+
2. `dotnet script scripts/load-template.csx -- --id <id> --store-path ./templates` — confirm the template loaded and inspect its fields. Add `--output <path>` to write the JSON to a file.
|
|
33
|
+
3. `dotnet script scripts/dry-run.csx -- --pdf <pdf> --template <template.json>` — confirm a `DryRunSucceeded` outcome. Inspect the extracted fields and (if needed) the `ExtractionDiagnostics` snapshot before publishing.
|
|
34
|
+
4. `dotnet script scripts/execute.csx -- --pdf <pdf> --template <template.json> --format csv --output output.csv` — runs the full pipeline through the registered CSV generator. Engine API used: `IDocuoriaEngine.ExecuteTemplateAsync<CsvOutputGenerator, CsvGeneratorOptions>`.
|
|
35
|
+
|
|
36
|
+
## Expected outcome
|
|
37
|
+
|
|
38
|
+
`ProcessingResult` is `SucceededResult`; `output.csv` exists with rows matching the document's line items, in document order, with the header columns implied by the template's schema.
|
|
39
|
+
|
|
40
|
+
## If it fails
|
|
41
|
+
|
|
42
|
+
Go to [`../references/failure-tree.md`](../references/failure-tree.md). Map the script's stderr `error.code` to a branch via [§ Stderr error.code → Branch routing](../references/failure-tree.md#stderr-errorcode--branch-routing): `rejected` → Branch A (read `RejectionReason` in `detail`); `failed` → Branch B (read `StepIdentifier` in `detail`). If `dry-run.csx` already returned `DryRunSucceeded` but `execute.csx` then returned `rejected` with `RejectionReason.GeneratorRejected`, the generator is rejecting the shape — Branch A's `GeneratorRejected` row covers the remediation.
|
|
43
|
+
|
|
44
|
+
## See also
|
|
45
|
+
|
|
46
|
+
- [`../references/template-reference.md` § Complex template](../references/template-reference.md#complex-template-scalar-fields--repeating-collection) — full template JSON shape used in this example.
|
|
47
|
+
- [`../references/workflow.md`](../references/workflow.md) Steps 5–7 — this example exercises dry-run, execute, and the store lookup at the start.
|
|
48
|
+
- [`../references/scripts.md`](../references/scripts.md) — full flag list and output envelope for every script invoked here.
|
|
49
|
+
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Example 2 — Classify an unknown PDF
|
|
2
|
+
|
|
3
|
+
## Scenario
|
|
4
|
+
|
|
5
|
+
A PDF document arrives. You do not know whether any stored template matches it. The templates are in a local store directory (`./templates`). The goal is to route the PDF to a matching template, identify a partial match to refine, or recognise it needs a new template authored.
|
|
6
|
+
|
|
7
|
+
## How classification works
|
|
8
|
+
|
|
9
|
+
The `classify.csx` script evaluates every stored template's `rootMatchRule` against the PDF and returns the top-N templates ranked by `confidence` — an aggregated score (`ruleConfidence × extractionProbeScore`) that reflects both rule match strength and extraction viability. This gradient lets you distinguish between strong matches, partial matches worth refining, and complete misses.
|
|
10
|
+
|
|
11
|
+
## Example match rules — weak vs. strong
|
|
12
|
+
|
|
13
|
+
### Weak — vendor tokens only (poor discrimination)
|
|
14
|
+
|
|
15
|
+
```json
|
|
16
|
+
{
|
|
17
|
+
"rootMatchRule": {
|
|
18
|
+
"$kind": "TextPatternMatchRule",
|
|
19
|
+
"tokens": ["Microsoft", "Invoice", "Bill To"],
|
|
20
|
+
"mode": 1,
|
|
21
|
+
"threshold": 0.5
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Every Microsoft invoice (subscription, Azure, support) contains these tokens. All would score `confidence: 1.0` — no discrimination.
|
|
27
|
+
|
|
28
|
+
### Strong — composite with discriminator (produces meaningful gradients)
|
|
29
|
+
|
|
30
|
+
```json
|
|
31
|
+
{
|
|
32
|
+
"rootMatchRule": {
|
|
33
|
+
"$kind": "CompositeMatchRule",
|
|
34
|
+
"operator": 0,
|
|
35
|
+
"threshold": 0.8,
|
|
36
|
+
"children": [
|
|
37
|
+
{
|
|
38
|
+
"rule": {
|
|
39
|
+
"$kind": "TextPatternMatchRule",
|
|
40
|
+
"tokens": ["Microsoft", "Invoice"],
|
|
41
|
+
"mode": 1,
|
|
42
|
+
"threshold": 0.5
|
|
43
|
+
},
|
|
44
|
+
"weight": 1.0
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
"rule": {
|
|
48
|
+
"$kind": "TextPatternMatchRule",
|
|
49
|
+
"tokens": ["Subscription", "License Qty", "Seats"],
|
|
50
|
+
"mode": 0,
|
|
51
|
+
"threshold": 0.6
|
|
52
|
+
},
|
|
53
|
+
"weight": 2.0
|
|
54
|
+
}
|
|
55
|
+
]
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
The first child is a broad gate (must be a Microsoft Invoice). The second child — weighted at 2.0 — is the **discriminator** that targets subscription invoices specifically. A subscription invoice scores `confidence: 1.0`. An Azure usage invoice scores `confidence: 0.33` (discriminator fails: `(1.0×1 + 0.0×2) / 3`). The gradient clearly separates the two.
|
|
61
|
+
|
|
62
|
+
## Steps
|
|
63
|
+
|
|
64
|
+
1. **Classify:** `dotnet script scripts/classify.csx -- --pdf <pdf> --store-path ./templates`
|
|
65
|
+
|
|
66
|
+
Example output:
|
|
67
|
+
```json
|
|
68
|
+
{
|
|
69
|
+
"matches": [
|
|
70
|
+
{ "templateId": "ms-subscription-invoice", "confidence": 0.92 },
|
|
71
|
+
{ "templateId": "ms-azure-invoice", "confidence": 0.35 },
|
|
72
|
+
{ "templateId": "generic-invoice", "confidence": 0.12 }
|
|
73
|
+
]
|
|
74
|
+
}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
2. **Interpret the results:** map the top match's `confidence` to an action using the canonical gradient table in [`../references/classification.md` § Interpreting the gradient](../references/classification.md#interpreting-the-gradient). An empty `matches` array means no templates are stored — author from scratch.
|
|
78
|
+
|
|
79
|
+
3. **For a strong match** — verify correctness:
|
|
80
|
+
- Run `dotnet script scripts/dry-run.csx -- --pdf <pdf> --template <matched-id-or-path>`
|
|
81
|
+
- If extraction produces expected data → done.
|
|
82
|
+
- If extraction produces empty collections or nonsensical data → **misclassification**. See [`../references/failure-tree.md`](../references/failure-tree.md) Branch C.
|
|
83
|
+
|
|
84
|
+
4. **For a partial match** — try extraction, then iterate:
|
|
85
|
+
- Run dry-run with the top-scoring template. Even if some fields are incomplete, others may transfer.
|
|
86
|
+
- Load the template (`dotnet script scripts/load-template.csx -- --id <matched-id> --store-path ./templates`), then adjust the match rules and extraction sources for the new document type.
|
|
87
|
+
- Validate the updated template with positive + negative testing before storing.
|
|
88
|
+
|
|
89
|
+
5. **When authoring a new template** — validate classification before storing:
|
|
90
|
+
- **Positive:** `dotnet script scripts/evaluate-match.csx -- --pdf <target.pdf> --template <new-template.json>` → high `confidence`.
|
|
91
|
+
- **Negative:** repeat with PDFs from the same vendor that should NOT match → low `confidence`.
|
|
92
|
+
- **Ranked:** `dotnet script scripts/classify.csx -- --pdf <target.pdf> --store-path ./templates` → new template must rank #1 with a clear gap over siblings.
|
|
93
|
+
|
|
94
|
+
## Expected outcome
|
|
95
|
+
|
|
96
|
+
Either: a high-confidence match → proceed to extraction. Or: a partial match → refine an existing template. Or: no meaningful match → author a new template with clear diagnostic insight into why existing templates scored low.
|
|
97
|
+
|
|
98
|
+
## See also
|
|
99
|
+
|
|
100
|
+
- [`../references/classification.md`](../references/classification.md) — full guide to designing discriminating match rules and interpreting the confidence gradient.
|
|
101
|
+
- [`../references/workflow.md`](../references/workflow.md) Step 1 — classify is the entry point; confidence routing determines the next step.
|
|
102
|
+
- [`../references/failure-tree.md`](../references/failure-tree.md) Branch C — diagnosing classification issues with ranked output.
|