@sidub-inc/docuoria.cli 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/dist/index.js +1056 -0
  2. package/package.json +56 -0
  3. package/payload/.claude-plugin/plugin.json +21 -0
  4. package/payload/MANIFEST.json +322 -0
  5. package/payload/SKILL.md +88 -0
  6. package/payload/assets/lib/Docuoria.dll +0 -0
  7. package/payload/assets/schemas/template-schema.json +413 -0
  8. package/payload/commands/classify.md +11 -0
  9. package/payload/commands/diagnose.md +11 -0
  10. package/payload/commands/extract.md +11 -0
  11. package/payload/commands/inspect.md +11 -0
  12. package/payload/commands/validate-template.md +11 -0
  13. package/payload/examples/01-extract-to-csv.md +49 -0
  14. package/payload/examples/02-classify-unknown-pdf.md +102 -0
  15. package/payload/examples/03-diagnose-failed-result.md +68 -0
  16. package/payload/references/classification.md +363 -0
  17. package/payload/references/decision-tree.md +43 -0
  18. package/payload/references/failure-tree.md +169 -0
  19. package/payload/references/pattern-authoring.md +40 -0
  20. package/payload/references/patterns.md +97 -0
  21. package/payload/references/privacy.md +36 -0
  22. package/payload/references/scripts.md +361 -0
  23. package/payload/references/template-reference.md +606 -0
  24. package/payload/references/workflow.md +163 -0
  25. package/payload/scripts/_common.csx +250 -0
  26. package/payload/scripts/classify.csx +53 -0
  27. package/payload/scripts/dry-run.csx +85 -0
  28. package/payload/scripts/evaluate-match.csx +72 -0
  29. package/payload/scripts/execute.csx +89 -0
  30. package/payload/scripts/inspect.csx +43 -0
  31. package/payload/scripts/list-templates.csx +34 -0
  32. package/payload/scripts/load-template.csx +54 -0
  33. package/payload/scripts/save-template.csx +53 -0
  34. package/payload/scripts/schema-info.csx +84 -0
  35. package/payload/scripts/test-groups.csx +44 -0
  36. package/payload/scripts/test-pattern.csx +61 -0
  37. package/payload/scripts/validate-template.csx +54 -0
  38. package/payload/skill/SKILL.md +88 -0
  39. package/payload/skill/assets/lib/Docuoria.dll +0 -0
  40. package/payload/skill/assets/schemas/template-schema.json +413 -0
  41. package/payload/skill/examples/01-extract-to-csv.md +49 -0
  42. package/payload/skill/examples/02-classify-unknown-pdf.md +102 -0
  43. package/payload/skill/examples/03-diagnose-failed-result.md +68 -0
  44. package/payload/skill/references/classification.md +363 -0
  45. package/payload/skill/references/decision-tree.md +43 -0
  46. package/payload/skill/references/failure-tree.md +169 -0
  47. package/payload/skill/references/pattern-authoring.md +40 -0
  48. package/payload/skill/references/patterns.md +97 -0
  49. package/payload/skill/references/privacy.md +36 -0
  50. package/payload/skill/references/scripts.md +361 -0
  51. package/payload/skill/references/template-reference.md +606 -0
  52. package/payload/skill/references/workflow.md +163 -0
  53. package/payload/skill/scripts/_common.csx +250 -0
  54. package/payload/skill/scripts/classify.csx +53 -0
  55. package/payload/skill/scripts/dry-run.csx +85 -0
  56. package/payload/skill/scripts/evaluate-match.csx +72 -0
  57. package/payload/skill/scripts/execute.csx +89 -0
  58. package/payload/skill/scripts/inspect.csx +43 -0
  59. package/payload/skill/scripts/list-templates.csx +34 -0
  60. package/payload/skill/scripts/load-template.csx +54 -0
  61. package/payload/skill/scripts/save-template.csx +53 -0
  62. package/payload/skill/scripts/schema-info.csx +84 -0
  63. package/payload/skill/scripts/test-groups.csx +44 -0
  64. package/payload/skill/scripts/test-pattern.csx +61 -0
  65. package/payload/skill/scripts/validate-template.csx +54 -0
@@ -0,0 +1,163 @@
1
+ # Workflow
2
+
3
+ The pipeline has seven steps operating through a template store — a directory (or API endpoint) of JSON template files that define extraction and classification rules. Store-aware scripts (`classify`, `evaluate-match`, `list-templates`, `load-template`, `save-template`) locate the store via `--store-path <dir>` or `--store-url <url>`; the two options are mutually exclusive. The shared bootstrap in `_common.csx` falls back to `./templates` when neither is given, but always pass the store location explicitly.
4
+
5
+ **Step 1 (Classify) always runs first** and determines whether steps 2–4 are needed. Never skip step 1 — classification is cheap and avoids redundant exploration.
6
+
7
+ ```
8
+ 1 Classify ──► strong match (≥ 0.8) ──────────────────────► 5 Dry-run ──► 6 Execute ──► done
9
+ │
10
+ ├──► partial match (0.4–0.8) ──► 3 Test pattern ──► 4 Build template ──► 5 Dry-run ──► 6 Execute ──► 7 Store
11
+ │ (load existing template, fix broken fields only)
12
+ │
13
+ └──► no match / error ──► 2 Inspect ──► 3 Test pattern ──► 4 Build template ──► 5 Dry-run ──► 6 Execute ──► 7 Store
14
+ ```
15
+
16
+ ---
17
+
18
+ ## Step 1 — Classify
19
+
20
+ Determine whether an existing template already handles this PDF. If the templates directory location is unknown, confirm it before classifying — check for a directory containing `.json` template files near the workspace root, or ask the user.
21
+
22
+ - **Script:** `dotnet script scripts/classify.csx -- --pdf <pdf> --store-path <templates-dir>`
23
+ - **API:** `IDocuoriaEngine.ClassifyAsync` — evaluates every stored template's `rootMatchRule` and returns them ranked by `confidence` (`ruleConfidence × extractionProbeScore`).
24
+ - **Output:** `{ "matches": [ { "templateId": "...", "confidence": 0.92 }, ... ] }` — descending by confidence.
25
+
26
+ **Routing:** see [`classification.md` § Interpreting the gradient](classification.md#interpreting-the-gradient) for the canonical confidence-to-action table (≥ 0.8 strong, 0.4–0.8 partial, < 0.4 author new). On `error: no-store` (no templates found at the given path, or no store configured), skip to **Step 2** and author from scratch.
27
+
28
+ ---
29
+
30
+ ## Step 2 — Inspect
31
+
32
+ See what the engine actually reads from the PDF before writing any patterns. The engine's text output often differs from the visual layout.
33
+
34
+ - **Script:** `dotnet script scripts/inspect.csx -- --pdf <pdf>` (optionally `--page N` for a single page)
35
+ - **API:** `IDocuoriaEngine.InspectAsync`
36
+ - **Returns:** `PdfInspection` — `PageCount`, `Pages[*].FlattenedText`, `Pages[*].TextBlocks`, `Pages[*].Tables` snapshots.
37
+ - **Gate:** if `PageCount` is 0 or every page has empty `TextBlocks`, the PDF is scanned/image-only — **STOP**. OCR upstream first.
38
+ - **Next:** Step 3.
39
+
40
+ ---
41
+
42
+ ## Step 3 — Test pattern
43
+
44
+ Prove each regex matches the engine's flattened haystack — not the visible text.
45
+
46
+ - **Scripts:**
47
+ - `dotnet script scripts/test-pattern.csx -- --pdf <pdf> --pattern "<regex>"` — test a single pattern.
48
+ - `dotnet script scripts/test-groups.csx -- --pdf <pdf> --pattern "<regex>"` — test each capture group independently when a multi-group regex partially fails.
49
+ - **API:** `IDocuoriaEngine.TestPatternAsync` / `IDocuoriaEngine.TestGroupsAsync`
50
+ - **Returns:** `PatternTestResult` — `HasMatches`, `Matches`, `Gaps`. For groups: `PatternGroupTestResult.Groups[*].MatchesIndependently`.
51
+ - **Iterate:** use `patterns.md` and `pattern-authoring.md` as reference. Repeat until `HasMatches` is `true` and match count matches expectation.
52
+ - **Next:** Step 4.
53
+
54
+ ---
55
+
56
+ ## Step 4 — Build template
57
+
58
+ Assemble the template JSON, design its classification rules, and validate the schema. This step has three sub-tasks that must all pass before proceeding.
59
+
60
+ ### 4a — Author template JSON
61
+
62
+ - Combine confirmed patterns with the appropriate `ExtractionSource` subtype (consult `decision-tree.md`).
63
+ - Design a discriminating `rootMatchRule` that identifies this **document type**, not just the vendor (consult `classification.md`).
64
+ - Use `CompositeMatchRule` (And) with discriminator children weighted ≥ 2.0.
65
+ - Identify tokens unique to this document type — section headers, product identifiers, column names that siblings do not share.
66
+
67
+ ### 4b — Validate classification rules
68
+
69
+ - **Script:** `dotnet script scripts/evaluate-match.csx -- --pdf <pdf> --template <template.json>`
70
+ - **API:** `IDocuoriaEngine.EvaluateMatchAsync` — returns `confidence` (`ruleConfidence × extractionProbeScore`).
71
+ - **Positive test:** target PDF → confidence ≥ 0.8.
72
+ - **Negative test:** same-vendor PDFs that should NOT match → confidence near zero. If the score lands in the partial band (0.4–0.8), the rules lack discrimination — strengthen the discriminator.
73
+ - See `classification.md` for the full design guide.
74
+
75
+ ### 4c — Validate schema
76
+
77
+ - **Script:** `dotnet script scripts/validate-template.csx -- --template <template.json>`
78
+ - Fix every reported error before proceeding. A schema failure corresponds to `RejectionReason.MalformedTemplate` at runtime.
79
+
80
+ **Next:** Step 5.
81
+
82
+ ---
83
+
84
+ ## Step 5 — Dry-run
85
+
86
+ Extract and transform without generating output. Confirms the template produces correct data before committing to a full run.
87
+
88
+ - **Script:** `dotnet script scripts/dry-run.csx -- --pdf <pdf> --template <template.json>`
89
+ - **API:** `IDocuoriaEngine.DryRunAsync`
90
+ - **Returns:** `DryRunSucceeded` (extracted fields + diagnostics), `DryRunFailed` (`Step`, `FieldPath`, `SourceText`, `TargetTypeName`, `InnerDetail`), or `DryRunRejected` (`RejectionReason`).
91
+
92
+ **On success:** verify all collection fields have the expected element count. An empty `[]` for a `RepeatingFieldMapping` means the pattern didn't match — see `failure-tree.md` Branch D.
93
+
94
+ **On failure:** go to `failure-tree.md` indexed by `Step` (for `DryRunFailed`) or `RejectionReason` (for `DryRunRejected`).
95
+
96
+ **Multi-variant check:** repeat dry-run with every available PDF from the same vendor/template category. If any variant produces empty collections or null scalars where data should exist, the template may be over-fitted — see `failure-tree.md` Branch D (layout variant splitting).
97
+
98
+ **Next:** Step 6.
99
+
100
+ ---
101
+
102
+ ## Step 6 — Execute
103
+
104
+ Full pipeline run including the output generator.
105
+
106
+ - **Script:** `dotnet script scripts/execute.csx -- --pdf <pdf> --template <template.json> --format csv|json` (optionally `--output <path>`)
107
+ - **API:** `IDocuoriaEngine.ExecuteTemplateAsync<TGenerator, TOptions>`
108
+ - **Returns:** `SucceededResult`, `FailedResult`, or `RejectedResult` with `RejectionReason` in {`InvalidPdf`, `MalformedTemplate`, `UnknownOutputGenerator`, `GeneratorRejected`}.
109
+ - **On failure:** go to `failure-tree.md`.
110
+ - **Next:** if this was a strong classify match (step 1 routed here directly) → done. If this is a new or modified template → Step 7.
111
+
112
+ ---
113
+
114
+ ## Step 7 — Store
115
+
116
+ Persist the template and verify it ranks correctly in the store. This prevents the new template from stealing classifications from existing templates or ranking too low for its own target.
117
+
118
+ - **Save:** `dotnet script scripts/save-template.csx -- --template <template.json> --store-path <templates-dir>`
119
+ - **Verify ranking:**
120
+ - `dotnet script scripts/classify.csx -- --pdf <target.pdf> --store-path <templates-dir>` → new template must rank **#1**, confidence ≥ 0.8.
121
+ - `dotnet script scripts/classify.csx -- --pdf <sibling.pdf> --store-path <templates-dir>` → new template should score < 0.4; existing sibling templates should still rank #1 for their own PDFs.
122
+ - **API:** `IDocuoriaEngine.ClassifyAsync`, `IDocuoriaEngine.EvaluateMatchAsync`
123
+ - **Other store scripts:** `list-templates.csx --store-path <templates-dir>` (enumerate all), `load-template.csx --id <id> --store-path <templates-dir>` (fetch one).
124
+
125
+ **Done.** The template is stored and will be found by Step 1 on future PDFs of this type.
126
+
127
+ ---
128
+
129
+ ## Quick reference
130
+
131
+ | Step | Script(s) | Engine API | Result |
132
+ |---|---|---|---|
133
+ | 1 Classify | `classify.csx` | `ClassifyAsync` | Ranked matches with `confidence` |
134
+ | 2 Inspect | `inspect.csx` | `InspectAsync` | `PdfInspection` |
135
+ | 3 Test pattern | `test-pattern.csx`, `test-groups.csx` | `TestPatternAsync`, `TestGroupsAsync` | `PatternTestResult` |
136
+ | 4 Build | (editor), `evaluate-match.csx`, `validate-template.csx` | `EvaluateMatchAsync` | Template JSON + validation |
137
+ | 5 Dry-run | `dry-run.csx` | `DryRunAsync` | `DryRunResult` |
138
+ | 6 Execute | `execute.csx` | `ExecuteTemplateAsync` | `ProcessingResult` |
139
+ | 7 Store | `save-template.csx`, `classify.csx` | `ClassifyAsync` | Ranking verification |
140
+
141
+ ---
142
+
143
+ ## CSV output behaviour
144
+
145
+ When `--format csv` is used with `execute.csx`, the `CsvOutputGenerator` flattens the hierarchical `DataRecord` into tabular CSV.
146
+
147
+ | Template shape | Behaviour |
148
+ |---|---|
149
+ | Scalar fields only | One data row, one column per field |
150
+ | One `RepeatingFieldMapping` | Denormalised: scalars repeat on every row, collection elements get one row each. Column headers use dot notation (`lineItems.description`) |
151
+ | Two+ `RepeatingFieldMapping` | **Rejected** (`RejectionReason.GeneratorRejected`). Use JSON output or split into separate templates |
152
+ | Nested `RecordFieldDefinition` | Flattened with dot notation (`address.city`) |
153
+
154
+ ### CsvGeneratorOptions
155
+
156
+ | Property | Type | Default | Description |
157
+ |---|---|---|---|
158
+ | `Delimiter` | `char` | `,` | Field separator |
159
+ | `Encoding` | `Encoding` | UTF-8 (no BOM) | Output encoding |
160
+ | `NewlineReplacement` | `string?` | `"\n"` | Replace embedded newlines with literal escape text; `" "` collapses to spaces; `null` preserves raw newlines in RFC 4180 quoted cells |
161
+ | `IncludeHeaderRow` | `bool` | `true` | Whether to emit a header row |
162
+ | `DateFormat` | `string?` | `null` (ISO 8601) | .NET date format string for `Date` fields |
163
+ | `NumberFormat` | `string?` | `null` (general `G`) | .NET format string for `Number` fields |
@@ -0,0 +1,250 @@
1
+ #r "nuget: Microsoft.Extensions.Hosting, 10.0.0"
2
+ #r "nuget: Microsoft.Extensions.DependencyInjection, 10.0.0"
3
+ #r "nuget: Microsoft.Extensions.Http, 10.0.0"
4
+ #r "nuget: PdfPig, 0.1.14"
5
+ #r "nuget: Tabula, 1.0.1"
6
+ #r "nuget: CsvHelper, 33.1.0"
7
+ #r "nuget: pythonnet, 3.0.5"
8
+ #r "../assets/lib/Docuoria.dll"
9
+
10
+ #nullable enable
11
+
12
+ // Phase 29: Shared bootstrap for every script under `scripts/` (D-Area-2 in 29-CONTEXT.md).
13
+ // Every script `#load`s this file. It deduplicates DI wiring, arg parsing,
14
+ // and JSON I/O so individual scripts can focus on their semantics.
15
+ //
16
+ // The `dotnet-script` runtime is required to execute these scripts:
17
+ // dotnet tool install -g dotnet-script
18
+
19
+ using System;
20
+ using System.Diagnostics.CodeAnalysis;
21
+ using System.IO;
22
+ using System.Reflection;
23
+ using System.Text.Json;
24
+ using Microsoft.Extensions.DependencyInjection;
25
+ using Microsoft.Extensions.Hosting;
26
+ using Docuoria.Contracts;
27
+ using Docuoria.Registration;
28
+ using Docuoria.Serialization;
29
+ using Docuoria.Storage;
30
+
31
// Optional dev-time override: when DOCUORIA_SDK_DLL names an existing file, that assembly is
// loaded here in addition to the Docuoria.dll referenced by the `#r` directive above.
// The `#r` reference is resolved when the script compiles; this runtime LoadFrom is what makes
// the override assembly available to reflection-based DI lookups.
// An unset, blank, or non-existent path is skipped without any diagnostic.
if (Environment.GetEnvironmentVariable("DOCUORIA_SDK_DLL") is string __sdkPath)
{
    if (!string.IsNullOrWhiteSpace(__sdkPath) && File.Exists(__sdkPath))
    {
        try
        {
            Assembly.LoadFrom(__sdkPath);
        }
        catch (Exception __sdkLoadEx)
        {
            // WR-02: the override is an explicit opt-in, so a failed load must be visible.
            // Write a structured JSON error envelope to stderr and exit non-zero rather than
            // quietly continuing with the `#r` assembly.
            var envelope = new
            {
                error = new
                {
                    code = "sdk-load-failed",
                    message = $"DOCUORIA_SDK_DLL load failed: {__sdkLoadEx.Message}",
                    detail = __sdkPath
                }
            };
            Console.Error.WriteLine(JsonSerializer.Serialize(envelope, DocuoriaJsonOptions.Default));
            Environment.Exit(1);
        }
    }
}
54
+
55
/// <summary>
/// Factory for the Generic Host shared by the scripts: registers the Docuoria engine and,
/// when requested, a template store chosen from the command-line arguments.
/// </summary>
public static class ScriptHost
{
    /// <summary>
    /// Builds an <see cref="IHost"/> whose services include <see cref="IDocuoriaEngine"/> with the
    /// built-in match rules plus the CSV and JSON output generators.
    /// </summary>
    /// <param name="args">Script arguments. Forwarded to Host.CreateDefaultBuilder and, when
    /// <paramref name="includeStore"/> is true, scanned for the <c>--store-*</c> options.</param>
    /// <param name="includeStore">Pass false to leave ITemplateStoreProvider unregistered — for
    /// scripts that never touch the store (inspect, test-pattern, test-groups, validate-template,
    /// dry-run), so they are not forced to supply store configuration.</param>
    /// <returns>The built host.</returns>
    public static IHost CreateHost(string[] args, bool includeStore = true)
    {
        return Host.CreateDefaultBuilder(args)
            .ConfigureServices(services =>
            {
                services.AddDocuoriaEngine(engine =>
                {
                    engine.AddBuiltInMatchRules();
                    engine.AddCsvOutputGenerator();
                    engine.AddJsonOutputGenerator();
                    if (includeStore)
                    {
                        RegisterStore(engine, args);
                    }
                });
            })
            .Build();
    }

    /// <summary>Resolves the engine from <paramref name="host"/>; throws if it is not registered.</summary>
    public static IDocuoriaEngine GetEngine(IHost host)
    {
        return host.Services.GetRequiredService<IDocuoriaEngine>();
    }

    /// <summary>Resolves the template store from <paramref name="host"/>, or null when none is registered.</summary>
    public static ITemplateStoreProvider? GetStore(IHost host)
    {
        return host.Services.GetService<ITemplateStoreProvider>();
    }

    /// <summary>
    /// Registers exactly one template store on <paramref name="builder"/>: the API store when
    /// <c>--store-url</c> is given, otherwise a local file store at <c>--store-path</c>
    /// (or <c>./templates</c> when that option is absent or blank).
    /// </summary>
    /// <exception cref="InvalidOperationException">Both <c>--store-path</c> and
    /// <c>--store-url</c> were supplied.</exception>
    private static void RegisterStore(IDocuoriaEngineBuilder builder, string[] args)
    {
        string? localDir = Cli.Get(args, "store-path");
        string? apiUrl = Cli.Get(args, "store-url");
        string? apiKey = Cli.Get(args, "store-key");

        bool hasLocalDir = !string.IsNullOrWhiteSpace(localDir);
        bool hasApiUrl = !string.IsNullOrWhiteSpace(apiUrl);

        if (hasLocalDir && hasApiUrl)
        {
            throw new InvalidOperationException(
                "--store-path and --store-url are mutually exclusive. Use --store-path for a local file store or --store-url for an API store.");
        }

        if (hasApiUrl)
        {
            var credentials = new ApiTemplateStoreCredentials { FunctionKey = apiKey };
            builder.AddApiTemplateStore(new Uri(apiUrl!), credentials);
            return;
        }

        // IN-01: no directory is created here. LocalFileTemplateStoreProvider creates it lazily
        // on first write and ListAsync tolerates a missing root, so script startup does not
        // leave an empty ./templates directory in arbitrary working directories.
        builder.AddLocalTemplateStore(hasLocalDir ? localDir! : "./templates");
    }
}
117
+
118
/// <summary>Hand-rolled <c>--key value</c> / <c>--flag</c> argument parser (System.CommandLine deferred).</summary>
/// <remarks>Named <c>Cli</c> rather than <c>Args</c> because <c>dotnet-script</c> exposes a
/// top-level <c>Args</c> global (<see cref="System.Collections.Generic.IList{T}"/> of
/// <see cref="string"/>) that would shadow a static class of the same name.</remarks>
public static class Cli
{
    private static readonly List<(string Name, bool Required, string Description, bool IsFlag)> _registeredArgs = new();
    private static string? _scriptDescription;

    /// <summary>
    /// Records the script description and argument definitions, then checks for a help request.
    /// Call at the top of each script. When <c>--help</c>, <c>--h</c>, or <c>-h</c> is present,
    /// prints usage to stdout and exits with code 0.
    /// </summary>
    /// <param name="args">Raw script arguments; null is treated as "no arguments".</param>
    /// <param name="scriptName">File name shown in the usage line.</param>
    /// <param name="description">One-line description of the script.</param>
    /// <param name="argDefs">Name, required-ness, description, and flag marker for each argument.</param>
    public static void Help(IList<string> args, string scriptName, string description, params (string Name, bool Required, string Description, bool IsFlag)[] argDefs)
    {
        _scriptDescription = description;
        _registeredArgs.Clear();
        _registeredArgs.AddRange(argDefs);

        if (!IsHelpRequested(args))
        {
            return;
        }

        Console.WriteLine();
        Console.WriteLine($" {scriptName}");
        Console.WriteLine($" {description}");
        Console.WriteLine();
        Console.WriteLine(" Usage:");
        Console.WriteLine($" dotnet script scripts/{scriptName} -- [args]");
        Console.WriteLine();
        Console.WriteLine(" Arguments:");
        // The IsFlag element is not part of the printed line, so it is discarded here.
        foreach (var (name, required, desc, _) in argDefs)
        {
            var req = required ? "(required)" : "(optional)";
            Console.WriteLine($" --{name,-20} {req,-12} {desc}");
        }
        Console.WriteLine($" --{"help",-20} {"(optional)",-12} Show this help message");
        Console.WriteLine();
        Environment.Exit(0);
    }

    /// <summary>
    /// True when any argument is exactly <c>--help</c>, <c>--h</c> (kept for backward
    /// compatibility), or the conventional short form <c>-h</c>.
    /// </summary>
    private static bool IsHelpRequested(IList<string> args)
    {
        if (args is null)
        {
            return false;
        }

        foreach (var arg in args)
        {
            if (arg is "--help" or "--h" or "-h")
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Returns the argument that immediately follows the first <c>--key</c>, or null when
    /// <c>--key</c> is absent or is the last argument. The following argument is returned as-is,
    /// even if it is itself another <c>--option</c>.
    /// </summary>
    public static string? Get(string[] args, string key)
    {
        if (args is null) return null;
        var marker = "--" + key;
        for (int i = 0; i < args.Length; i++)
        {
            if (string.Equals(args[i], marker, StringComparison.Ordinal))
            {
                // Only the first occurrence is considered.
                return i + 1 < args.Length ? args[i + 1] : null;
            }
        }
        return null;
    }

    /// <summary>True when <c>--key</c> is present (regardless of any following value).</summary>
    public static bool Has(string[] args, string key)
    {
        if (args is null) return false;
        var marker = "--" + key;
        foreach (var a in args)
        {
            if (string.Equals(a, marker, StringComparison.Ordinal)) return true;
        }
        return false;
    }

    /// <summary><see cref="IList{T}"/> overload of <see cref="Require(string[], string)"/>; null is treated as empty.</summary>
    public static string Require(IList<string> args, string key)
        => Require(args is null ? Array.Empty<string>() : args.ToArray(), key);

    /// <summary><see cref="IList{T}"/> overload of <see cref="Get(string[], string)"/>; null yields null.</summary>
    public static string? Get(IList<string> args, string key)
        => args is null ? null : Get(args.ToArray(), key);

    /// <summary><see cref="IList{T}"/> overload of <see cref="Has(string[], string)"/>; null yields false.</summary>
    public static bool Has(IList<string> args, string key)
        => args is not null && Has(args.ToArray(), key);

    /// <summary>
    /// Required-arg lookup. When <c>--key</c> has no value, emits a <c>missing-arg</c> JSON error
    /// to stderr via <c>JsonOut.Error</c> and exits with code 2.
    /// </summary>
    public static string Require(string[] args, string key)
    {
        var value = Get(args, key);
        if (value is null)
        {
            JsonOut.Error("missing-arg", $"--{key} is required", null, 2);
        }
        return value!;
    }
}
204
+
205
/// <summary>Writers for JSON on stdout and stderr, serialized with DocuoriaJsonOptions.Default (D-Area-1).</summary>
public static class JsonOut
{
    /// <summary>
    /// Serializes <paramref name="value"/> using its runtime type (or <see cref="object"/> when
    /// it is null) and writes the result to stdout followed by a newline.
    /// </summary>
    public static void Write(object value)
    {
        Type runtimeType = value is null ? typeof(object) : value.GetType();
        string serialized = JsonSerializer.Serialize(value, runtimeType, DocuoriaJsonOptions.Default);
        Console.Out.WriteLine(serialized);
    }

    /// <summary>Writes an already-serialized JSON string to stdout unchanged (avoids double-serialization).</summary>
    public static void WriteRaw(string json) => Console.Out.WriteLine(json);

    /// <summary>
    /// Writes an <c>{ error: { code, message, detail } }</c> envelope to stderr — never stdout —
    /// and ends the process with <paramref name="exitCode"/> (default 1).
    /// </summary>
    /// <remarks>
    /// This method does not return: <see cref="Environment.Exit(int)"/> terminates the process.
    /// The final <c>throw</c> is not expected to run; it is there so the C# flow analyzer sees
    /// the no-return contract and callers do not have to null-forgive values guarded by a call
    /// to this method.
    /// </remarks>
    [DoesNotReturn]
    public static void Error(string code, string message, string? detail = null, int exitCode = 1)
    {
        var envelope = new { error = new { code, message, detail } };
        Console.Error.WriteLine(JsonSerializer.Serialize(envelope, DocuoriaJsonOptions.Default));
        Environment.Exit(exitCode);
        throw new InvalidOperationException("Environment.Exit returned unexpectedly.");
    }
}
238
+
239
/// <summary>
/// Opens <paramref name="path"/> as a read-only <see cref="FileStream"/> that allows other
/// readers. When the path is blank or no file exists there, emits a <c>pdf-not-found</c> JSON
/// error via <c>JsonOut.Error</c> (exit code 1) instead of returning.
/// </summary>
public static FileStream LoadPdf(string path)
{
    bool usable = !string.IsNullOrWhiteSpace(path) && File.Exists(path);
    if (!usable)
    {
        JsonOut.Error("pdf-not-found", $"PDF not found at '{path}'", null, 1);
    }

    return new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read);
}
@@ -0,0 +1,53 @@
1
+ #load "_common.csx"
2
+
3
+ #nullable enable
4
+
5
+ // CLS-02 — ranked classification: evaluates every stored template and returns top matches
6
+ // sorted by effective confidence (ruleConfidence × extractionProbeScore, descending).
7
+ // Uses ClassifyRankedAsync to open the PDF once and evaluate all templates without
8
+ // redundant PDF parsing per template.
9
+ // Args: --pdf <path> [--top N] [--store-path <dir> | --store-url <url> [--store-key <key>]]
10
+ // stdout: { matches: [ { templateId, confidence }, ... ] }
11
+
12
+ using Docuoria.Contracts;
13
+ using Docuoria.Storage;
14
+
15
// Script body: classify one PDF against the configured template store and print the ranked
// matches as JSON on stdout. Any exception is reported as an "unhandled" JSON error on stderr.
try
{
    Cli.Help(Args, "classify.csx", "Classify a PDF against stored templates (ranked by confidence)",
        ("pdf", true, "Path to the source PDF", false),
        ("top", false, "Maximum number of results to return (default: 5)", false),
        ("store-path", false, "Local template store directory (default: ./templates)", false),
        ("store-url", false, "API template store URL (mutually exclusive with --store-path)", false),
        ("store-key", false, "Function key for API store authentication", false));

    var sourcePdfPath = Cli.Require(Args, "pdf");

    // --top falls back to 5 when it is absent or does not parse as an integer.
    int limit = 5;
    if (int.TryParse(Cli.Get(Args, "top"), out var requested))
    {
        limit = requested;
    }

    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: true);
    var engine = ScriptHost.GetEngine(host);

    using var pdfStream = LoadPdf(sourcePdfPath);

    // OPT: one engine call covers every stored template, replacing the earlier per-template
    // loop that re-parsed the PDF for each template.
    var hits = await engine.ClassifyRankedAsync(pdfStream, limit);

    // Project each hit to the two fields the caller needs; confidence is the product of the
    // rule confidence and the extraction probe score, rounded to four decimals.
    var matches = hits
        .Select(hit => new
        {
            templateId = hit.TemplateIdentifier,
            confidence = Math.Round(hit.RuleConfidence * hit.ExtractionProbeScore, 4),
        })
        .ToArray();

    // Copy the shared options but never omit properties, whatever the default ignore condition is.
    var serializerOptions = new System.Text.Json.JsonSerializerOptions(Docuoria.Serialization.DocuoriaJsonOptions.Default)
    {
        DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.Never,
    };
    var responseJson = System.Text.Json.JsonSerializer.Serialize(new { matches }, serializerOptions);
    JsonOut.WriteRaw(responseJson);
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
@@ -0,0 +1,85 @@
1
+ #load "_common.csx"
2
+
3
+ #nullable enable
4
+
5
+ // SCR-05 — Wrapper over IDocuoriaEngine.DryRunAsync (extraction + transformation only).
6
+ // Args: --pdf <path> --template <file.json> [--preview-as csv|json]
7
+ // stdout: { kind, result } — discriminator + payload.
8
+ // With --preview-as: { kind, format, preview } on success — the formatted output as a string; rejected/failed runs emit a JSON error on stderr.
9
+
10
+ using System.Text;
11
+ using Docuoria.Configuration;
12
+ using Docuoria.Contracts;
13
+ using Docuoria.Models;
14
+ using Docuoria.Output.Csv;
15
+ using Docuoria.Output.Json;
16
+ using Docuoria.Results;
17
+
18
// Script body: run a template against a PDF without writing an output file.
// Without --preview-as the engine's dry-run result is printed; with --preview-as the full
// pipeline runs and the generated payload is printed as a UTF-8 string.
try
{
    Cli.Help(Args, "dry-run.csx", "Run extraction + transformation without output generation",
        ("pdf", true, "Path to the source PDF", false),
        ("template", true, "Path to the template JSON file", false),
        ("preview-as", false, "Preview formatted output: csv or json (no file written)", false));

    var pdfPath = Cli.Require(Args, "pdf");
    var templatePath = Cli.Require(Args, "template");

    // Normalised preview format; null means "no preview requested".
    var previewFormat = Cli.Get(Args, "preview-as")?.Trim().ToLowerInvariant();
    if (previewFormat is not (null or "csv" or "json"))
    {
        JsonOut.Error("bad-format", "--preview-as must be csv or json", null, 2);
    }

    if (!File.Exists(templatePath))
    {
        JsonOut.Error("template-not-found", $"Template not found at '{templatePath}'", null, 1);
    }

    var templateJson = File.ReadAllText(templatePath);
    var template = Template.FromJson(templateJson);

    // Dry-run never touches the template store, so the host is built without one.
    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: false);
    var engine = ScriptHost.GetEngine(host);

    using var pdf = LoadPdf(pdfPath);

    if (previewFormat is not null)
    {
        // Preview mode: full execute, but the output is echoed on stdout instead of saved to disk.
        ProcessingResult outcome;
        if (previewFormat == "csv")
        {
            outcome = await engine.ExecuteTemplateAsync<CsvOutputGenerator, CsvGeneratorOptions>(
                pdf, template, new CsvGeneratorOptions());
        }
        else if (previewFormat == "json")
        {
            outcome = await engine.ExecuteTemplateAsync<JsonOutputGenerator, JsonGeneratorOptions>(
                pdf, template, new JsonGeneratorOptions());
        }
        else
        {
            // The format was validated above, so this branch is not expected to run.
            throw new InvalidOperationException("unreachable");
        }

        if (outcome is SucceededResult ok)
        {
            var preview = Encoding.UTF8.GetString(ok.Output.Payload.Span);
            JsonOut.Write(new { kind = "SucceededResult", format = previewFormat, preview });
        }
        else if (outcome is RejectedResult rej)
        {
            var detailSuffix = rej.Detail is not null ? $": {rej.Detail}" : "";
            JsonOut.Error("rejected", $"Rejected ({rej.Reason}){detailSuffix}", null, 1);
        }
        else if (outcome is FailedResult fail)
        {
            JsonOut.Error("failed", fail.ErrorMessage, fail.InnerDetail, 1);
        }
        else
        {
            JsonOut.Error("unknown-result", outcome.GetType().Name, null, 1);
        }
    }
    else
    {
        // Standard dry-run: extraction + transformation only. `kind` carries the runtime type
        // name of the result so consumers can discriminate on it.
        var result = await engine.DryRunAsync(pdf, template);
        JsonOut.Write(new { kind = result.GetType().Name, result });
    }
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
@@ -0,0 +1,72 @@
1
+ #load "_common.csx"
2
+
3
+ #nullable enable
4
+
5
+ // SCR-07 + CLS-01 helper — wrapper over IDocuoriaEngine.EvaluateMatchAsync.
6
+ // Args: --pdf <path> --template <id-or-file> [--store-path <dir> | --store-url <url> [--store-key <key>]]
7
+ // Auto-detects template source per Area-3: contains path sep OR ends .json OR File.Exists -> file;
8
+ // otherwise loads from store via ITemplateStoreProvider.LoadAsync(value).
9
+ // stdout: { confidence, matchedRules } JSON.
10
+
11
+ using Docuoria.Contracts;
12
+ using Docuoria.Models;
13
+ using Docuoria.Storage;
14
+
15
// Script body: evaluate one template's match rule against a PDF and print
// { confidence, matchedRules } on stdout. The template comes from a JSON file when the
// --template value looks like a path, otherwise from the configured template store.
try
{
    Cli.Help(Args, "evaluate-match.csx", "Evaluate a template's match rule against a PDF",
        ("pdf", true, "Path to the source PDF", false),
        ("template", true, "Template ID or path to template JSON file", false),
        ("store-path", false, "Local template store directory (default: ./templates)", false),
        ("store-url", false, "API template store URL (mutually exclusive with --store-path)", false),
        ("store-key", false, "Function key for API store authentication", false));

    var pdfPath = Cli.Require(Args, "pdf");
    var templateArg = Cli.Require(Args, "template");

    // File heuristics, checked in order: contains a path separator, ends with ".json"
    // (case-insensitive), or names an existing file.
    var pathSeparators = new[] { '/', '\\' };
    bool hasSeparator = templateArg.IndexOfAny(pathSeparators) >= 0;
    bool treatAsFile =
        hasSeparator ||
        templateArg.EndsWith(".json", StringComparison.OrdinalIgnoreCase) ||
        File.Exists(templateArg);

    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: true);
    var engine = ScriptHost.GetEngine(host);

    Template? resolved;
    if (!treatAsFile)
    {
        // Treat the value as a template identifier and load it from the store.
        var store = ScriptHost.GetStore(host);
        if (store is null)
        {
            JsonOut.Error("no-store", "ITemplateStoreProvider is not registered.", null, 1);
        }

        resolved = await store!.LoadAsync(templateArg);
        if (resolved is null)
        {
            JsonOut.Error("not-found", $"template '{templateArg}' not found in store", null, 1);
        }
    }
    else
    {
        if (!File.Exists(templateArg))
        {
            JsonOut.Error("template-not-found", $"Template file not found at '{templateArg}'", null, 1);
        }

        resolved = Template.FromJson(File.ReadAllText(templateArg));
    }

    using var pdf = LoadPdf(pdfPath);
    var evaluation = await engine.EvaluateMatchAsync(resolved!, pdf);

    // LLM-facing response: a single aggregated confidence
    // (ruleConfidence × extractionProbeScore, rounded to four decimals) plus the matched
    // rules for diagnostics.
    var aggregated = Math.Round(evaluation.RuleConfidence * evaluation.ExtractionProbeScore, 4);
    JsonOut.Write(new
    {
        confidence = aggregated,
        matchedRules = evaluation.MatchedRules,
    });
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}