@sidub-inc/docuoria.cli 1.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1056 -0
- package/package.json +56 -0
- package/payload/.claude-plugin/plugin.json +21 -0
- package/payload/MANIFEST.json +322 -0
- package/payload/SKILL.md +88 -0
- package/payload/assets/lib/Docuoria.dll +0 -0
- package/payload/assets/schemas/template-schema.json +413 -0
- package/payload/commands/classify.md +11 -0
- package/payload/commands/diagnose.md +11 -0
- package/payload/commands/extract.md +11 -0
- package/payload/commands/inspect.md +11 -0
- package/payload/commands/validate-template.md +11 -0
- package/payload/examples/01-extract-to-csv.md +49 -0
- package/payload/examples/02-classify-unknown-pdf.md +102 -0
- package/payload/examples/03-diagnose-failed-result.md +68 -0
- package/payload/references/classification.md +363 -0
- package/payload/references/decision-tree.md +43 -0
- package/payload/references/failure-tree.md +169 -0
- package/payload/references/pattern-authoring.md +40 -0
- package/payload/references/patterns.md +97 -0
- package/payload/references/privacy.md +36 -0
- package/payload/references/scripts.md +361 -0
- package/payload/references/template-reference.md +606 -0
- package/payload/references/workflow.md +163 -0
- package/payload/scripts/_common.csx +250 -0
- package/payload/scripts/classify.csx +53 -0
- package/payload/scripts/dry-run.csx +85 -0
- package/payload/scripts/evaluate-match.csx +72 -0
- package/payload/scripts/execute.csx +89 -0
- package/payload/scripts/inspect.csx +43 -0
- package/payload/scripts/list-templates.csx +34 -0
- package/payload/scripts/load-template.csx +54 -0
- package/payload/scripts/save-template.csx +53 -0
- package/payload/scripts/schema-info.csx +84 -0
- package/payload/scripts/test-groups.csx +44 -0
- package/payload/scripts/test-pattern.csx +61 -0
- package/payload/scripts/validate-template.csx +54 -0
- package/payload/skill/SKILL.md +88 -0
- package/payload/skill/assets/lib/Docuoria.dll +0 -0
- package/payload/skill/assets/schemas/template-schema.json +413 -0
- package/payload/skill/examples/01-extract-to-csv.md +49 -0
- package/payload/skill/examples/02-classify-unknown-pdf.md +102 -0
- package/payload/skill/examples/03-diagnose-failed-result.md +68 -0
- package/payload/skill/references/classification.md +363 -0
- package/payload/skill/references/decision-tree.md +43 -0
- package/payload/skill/references/failure-tree.md +169 -0
- package/payload/skill/references/pattern-authoring.md +40 -0
- package/payload/skill/references/patterns.md +97 -0
- package/payload/skill/references/privacy.md +36 -0
- package/payload/skill/references/scripts.md +361 -0
- package/payload/skill/references/template-reference.md +606 -0
- package/payload/skill/references/workflow.md +163 -0
- package/payload/skill/scripts/_common.csx +250 -0
- package/payload/skill/scripts/classify.csx +53 -0
- package/payload/skill/scripts/dry-run.csx +85 -0
- package/payload/skill/scripts/evaluate-match.csx +72 -0
- package/payload/skill/scripts/execute.csx +89 -0
- package/payload/skill/scripts/inspect.csx +43 -0
- package/payload/skill/scripts/list-templates.csx +34 -0
- package/payload/skill/scripts/load-template.csx +54 -0
- package/payload/skill/scripts/save-template.csx +53 -0
- package/payload/skill/scripts/schema-info.csx +84 -0
- package/payload/skill/scripts/test-groups.csx +44 -0
- package/payload/skill/scripts/test-pattern.csx +61 -0
- package/payload/skill/scripts/validate-template.csx +54 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# Workflow
|
|
2
|
+
|
|
3
|
+
The pipeline has seven steps operating through a template store — a directory (or API endpoint) of JSON template files that define extraction and classification rules. Store-aware scripts (`classify`, `evaluate-match`, `list-templates`, `load-template`, `save-template`) locate the store from `--store-path <dir>` (local directory) or `--store-url <url>` (API endpoint); the two options are mutually exclusive. If neither is given, the shared script bootstrap falls back to `./templates` relative to the current working directory, so always pass the store location explicitly.
|
|
4
|
+
|
|
5
|
+
**Step 1 (Classify) always runs first** and determines whether steps 2–4 are needed. Never skip step 1 — classification is cheap and avoids redundant exploration.
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
1 Classify ──► strong match (≥ 0.8) ──────────────────────► 5 Dry-run ──► 6 Execute ──► done
|
|
9
|
+
│
|
|
10
|
+
├──► partial match (0.4–0.8) ──► 3 Test pattern ──► 4 Build template ──► 5 Dry-run ──► 6 Execute ──► 7 Store
|
|
11
|
+
│ (load existing template, fix broken fields only)
|
|
12
|
+
│
|
|
13
|
+
└──► no match / error ──► 2 Inspect ──► 3 Test pattern ──► 4 Build template ──► 5 Dry-run ──► 6 Execute ──► 7 Store
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## Step 1 — Classify
|
|
19
|
+
|
|
20
|
+
Determine whether an existing template already handles this PDF. If the templates directory location is unknown, confirm it before classifying — check for a directory containing `.json` template files near the workspace root, or ask the user.
|
|
21
|
+
|
|
22
|
+
- **Script:** `dotnet script scripts/classify.csx -- --pdf <pdf> --store-path <templates-dir>`
|
|
23
|
+
- **API:** `IDocuoriaEngine.ClassifyAsync` — evaluates every stored template's `rootMatchRule` and returns them ranked by `confidence` (`ruleConfidence × extractionProbeScore`).
|
|
24
|
+
- **Output:** `{ "matches": [ { "templateId": "...", "confidence": 0.92 }, ... ] }` — descending by confidence.
|
|
25
|
+
|
|
26
|
+
**Routing:** see [`classification.md` § Interpreting the gradient](classification.md#interpreting-the-gradient) for the canonical confidence-to-action table (≥ 0.8 strong, 0.4–0.8 partial, < 0.4 author new). On `error: no-store` (no templates found at the given path, or no store configured), skip to **Step 2** and author from scratch.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Step 2 — Inspect
|
|
31
|
+
|
|
32
|
+
See what the engine actually reads from the PDF before writing any patterns. The engine's text output often differs from the visual layout.
|
|
33
|
+
|
|
34
|
+
- **Script:** `dotnet script scripts/inspect.csx -- --pdf <pdf>` (optionally `--page N` for a single page)
|
|
35
|
+
- **API:** `IDocuoriaEngine.InspectAsync`
|
|
36
|
+
- **Returns:** `PdfInspection` — `PageCount`, `Pages[*].FlattenedText`, `Pages[*].TextBlocks`, `Pages[*].Tables` snapshots.
|
|
37
|
+
- **Gate:** if `PageCount` is 0 or every page has empty `TextBlocks`, the PDF is scanned/image-only — **STOP**. OCR upstream first.
|
|
38
|
+
- **Next:** Step 3.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Step 3 — Test pattern
|
|
43
|
+
|
|
44
|
+
Prove each regex matches the engine's flattened haystack — not the visible text.
|
|
45
|
+
|
|
46
|
+
- **Scripts:**
|
|
47
|
+
- `dotnet script scripts/test-pattern.csx -- --pdf <pdf> --pattern "<regex>"` — test a single pattern.
|
|
48
|
+
- `dotnet script scripts/test-groups.csx -- --pdf <pdf> --pattern "<regex>"` — test each capture group independently when a multi-group regex partially fails.
|
|
49
|
+
- **API:** `IDocuoriaEngine.TestPatternAsync` / `IDocuoriaEngine.TestGroupsAsync`
|
|
50
|
+
- **Returns:** `PatternTestResult` — `HasMatches`, `Matches`, `Gaps`. For groups: `PatternGroupTestResult.Groups[*].MatchesIndependently`.
|
|
51
|
+
- **Iterate:** use `patterns.md` and `pattern-authoring.md` as reference. Repeat until `HasMatches` is `true` and match count matches expectation.
|
|
52
|
+
- **Next:** Step 4.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Step 4 — Build template
|
|
57
|
+
|
|
58
|
+
Assemble the template JSON, design its classification rules, and validate the schema. This step has three sub-tasks that must all pass before proceeding.
|
|
59
|
+
|
|
60
|
+
### 4a — Author template JSON
|
|
61
|
+
|
|
62
|
+
- Combine confirmed patterns with the appropriate `ExtractionSource` subtype (consult `decision-tree.md`).
|
|
63
|
+
- Design a discriminating `rootMatchRule` that identifies this **document type**, not just the vendor (consult `classification.md`).
|
|
64
|
+
- Use `CompositeMatchRule` (And) with discriminator children weighted ≥ 2.0.
|
|
65
|
+
- Identify tokens unique to this document type — section headers, product identifiers, column names that siblings do not share.
|
|
66
|
+
|
|
67
|
+
### 4b — Validate classification rules
|
|
68
|
+
|
|
69
|
+
- **Script:** `dotnet script scripts/evaluate-match.csx -- --pdf <pdf> --template <template.json>`
|
|
70
|
+
- **API:** `IDocuoriaEngine.EvaluateMatchAsync` — returns `confidence` (`ruleConfidence × extractionProbeScore`).
|
|
71
|
+
- **Positive test:** target PDF → confidence ≥ 0.8.
|
|
72
|
+
- **Negative test:** same-vendor PDFs that should NOT match → confidence near zero. If a sibling scores 0.4 or higher (the partial-match band or above), the rules lack discrimination — strengthen the discriminator.
|
|
73
|
+
- See `classification.md` for the full design guide.
|
|
74
|
+
|
|
75
|
+
### 4c — Validate schema
|
|
76
|
+
|
|
77
|
+
- **Script:** `dotnet script scripts/validate-template.csx -- --template <template.json>`
|
|
78
|
+
- Fix every reported error before proceeding. A schema failure corresponds to `RejectionReason.MalformedTemplate` at runtime.
|
|
79
|
+
|
|
80
|
+
**Next:** Step 5.
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Step 5 — Dry-run
|
|
85
|
+
|
|
86
|
+
Extract and transform without generating output. Confirms the template produces correct data before committing to a full run.
|
|
87
|
+
|
|
88
|
+
- **Script:** `dotnet script scripts/dry-run.csx -- --pdf <pdf> --template <template.json>`
|
|
89
|
+
- **API:** `IDocuoriaEngine.DryRunAsync`
|
|
90
|
+
- **Returns:** `DryRunSucceeded` (extracted fields + diagnostics), `DryRunFailed` (`Step`, `FieldPath`, `SourceText`, `TargetTypeName`, `InnerDetail`), or `DryRunRejected` (`RejectionReason`).
|
|
91
|
+
|
|
92
|
+
**On success:** verify all collection fields have the expected element count. An empty `[]` for a `RepeatingFieldMapping` means the pattern didn't match — see `failure-tree.md` Branch D.
|
|
93
|
+
|
|
94
|
+
**On failure:** go to `failure-tree.md` indexed by `Step` (for `DryRunFailed`) or `RejectionReason` (for `DryRunRejected`).
|
|
95
|
+
|
|
96
|
+
**Multi-variant check:** repeat dry-run with every available PDF from the same vendor/template category. If any variant produces empty collections or null scalars where data should exist, the template may be over-fitted — see `failure-tree.md` Branch D (layout variant splitting).
|
|
97
|
+
|
|
98
|
+
**Next:** Step 6.
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Step 6 — Execute
|
|
103
|
+
|
|
104
|
+
Full pipeline run including the output generator.
|
|
105
|
+
|
|
106
|
+
- **Script:** `dotnet script scripts/execute.csx -- --pdf <pdf> --template <template.json> --format csv|json` (optionally `--output <path>`)
|
|
107
|
+
- **API:** `IDocuoriaEngine.ExecuteTemplateAsync<TGenerator, TOptions>`
|
|
108
|
+
- **Returns:** `SucceededResult`, `FailedResult`, or `RejectedResult` with `RejectionReason` in {`InvalidPdf`, `MalformedTemplate`, `UnknownOutputGenerator`, `GeneratorRejected`}.
|
|
109
|
+
- **On failure:** go to `failure-tree.md`.
|
|
110
|
+
- **Next:** if this was a strong classify match (step 1 routed here directly) → done. If this is a new or modified template → Step 7.
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Step 7 — Store
|
|
115
|
+
|
|
116
|
+
Persist the template and verify it ranks correctly in the store. This prevents the new template from stealing classifications from existing templates or ranking too low for its own target.
|
|
117
|
+
|
|
118
|
+
- **Save:** `dotnet script scripts/save-template.csx -- --template <template.json> --store-path <templates-dir>`
|
|
119
|
+
- **Verify ranking:**
|
|
120
|
+
- `dotnet script scripts/classify.csx -- --pdf <target.pdf> --store-path <templates-dir>` → new template must rank **#1**, confidence ≥ 0.8.
|
|
121
|
+
- `dotnet script scripts/classify.csx -- --pdf <sibling.pdf> --store-path <templates-dir>` → new template should score < 0.4; existing sibling templates should still rank #1 for their own PDFs.
|
|
122
|
+
- **API:** `IDocuoriaEngine.ClassifyAsync`, `IDocuoriaEngine.EvaluateMatchAsync`
|
|
123
|
+
- **Other store scripts:** `list-templates.csx --store-path <templates-dir>` (enumerate all), `load-template.csx --id <id> --store-path <templates-dir>` (fetch one).
|
|
124
|
+
|
|
125
|
+
**Done.** The template is stored and will be found by Step 1 on future PDFs of this type.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## Quick reference
|
|
130
|
+
|
|
131
|
+
| Step | Script(s) | Engine API | Result |
|
|
132
|
+
|---|---|---|---|
|
|
133
|
+
| 1 Classify | `classify.csx` | `ClassifyAsync` | Ranked matches with `confidence` |
|
|
134
|
+
| 2 Inspect | `inspect.csx` | `InspectAsync` | `PdfInspection` |
|
|
135
|
+
| 3 Test pattern | `test-pattern.csx`, `test-groups.csx` | `TestPatternAsync`, `TestGroupsAsync` | `PatternTestResult` |
|
|
136
|
+
| 4 Build | (editor), `evaluate-match.csx`, `validate-template.csx` | `EvaluateMatchAsync` | Template JSON + validation |
|
|
137
|
+
| 5 Dry-run | `dry-run.csx` | `DryRunAsync` | `DryRunResult` |
|
|
138
|
+
| 6 Execute | `execute.csx` | `ExecuteTemplateAsync` | `ProcessingResult` |
|
|
139
|
+
| 7 Store | `save-template.csx`, `classify.csx` | `ClassifyAsync` | Ranking verification |
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## CSV output behaviour
|
|
144
|
+
|
|
145
|
+
When `--format csv` is used with `execute.csx`, the `CsvOutputGenerator` flattens the hierarchical `DataRecord` into tabular CSV.
|
|
146
|
+
|
|
147
|
+
| Template shape | Behaviour |
|
|
148
|
+
|---|---|
|
|
149
|
+
| Scalar fields only | One data row, one column per field |
|
|
150
|
+
| One `RepeatingFieldMapping` | Denormalised: scalars repeat on every row, collection elements get one row each. Column headers use dot notation (`lineItems.description`) |
|
|
151
|
+
| Two+ `RepeatingFieldMapping` | **Rejected** (`RejectionReason.GeneratorRejected`). Use JSON output or split into separate templates |
|
|
152
|
+
| Nested `RecordFieldDefinition` | Flattened with dot notation (`address.city`) |
|
|
153
|
+
|
|
154
|
+
### CsvGeneratorOptions
|
|
155
|
+
|
|
156
|
+
| Property | Type | Default | Description |
|
|
157
|
+
|---|---|---|---|
|
|
158
|
+
| `Delimiter` | `char` | `,` | Field separator |
|
|
159
|
+
| `Encoding` | `Encoding` | UTF-8 (no BOM) | Output encoding |
|
|
160
|
+
| `NewlineReplacement` | `string?` | `"\n"` | Replace embedded newlines with literal escape text; `" "` collapses to spaces; `null` preserves raw newlines in RFC 4180 quoted cells |
|
|
161
|
+
| `IncludeHeaderRow` | `bool` | `true` | Whether to emit a header row |
|
|
162
|
+
| `DateFormat` | `string?` | `null` (ISO 8601) | .NET date format string for `Date` fields |
|
|
163
|
+
| `NumberFormat` | `string?` | `null` (general `G`) | .NET format string for `Number` fields |
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
#r "nuget: Microsoft.Extensions.Hosting, 10.0.0"
|
|
2
|
+
#r "nuget: Microsoft.Extensions.DependencyInjection, 10.0.0"
|
|
3
|
+
#r "nuget: Microsoft.Extensions.Http, 10.0.0"
|
|
4
|
+
#r "nuget: PdfPig, 0.1.14"
|
|
5
|
+
#r "nuget: Tabula, 1.0.1"
|
|
6
|
+
#r "nuget: CsvHelper, 33.1.0"
|
|
7
|
+
#r "nuget: pythonnet, 3.0.5"
|
|
8
|
+
#r "../assets/lib/Docuoria.dll"
|
|
9
|
+
|
|
10
|
+
#nullable enable
|
|
11
|
+
|
|
12
|
+
// Phase 29: Shared bootstrap for every script under `scripts/` (D-Area-2 in 29-CONTEXT.md).
|
|
13
|
+
// Every script `#load`s this file. It deduplicates DI wiring, arg parsing,
|
|
14
|
+
// and JSON I/O so individual scripts can focus on their semantics.
|
|
15
|
+
//
|
|
16
|
+
// The `dotnet-script` runtime is required to execute these scripts:
|
|
17
|
+
// dotnet tool install -g dotnet-script
|
|
18
|
+
|
|
19
|
+
using System;
|
|
20
|
+
using System.Diagnostics.CodeAnalysis;
|
|
21
|
+
using System.IO;
|
|
22
|
+
using System.Reflection;
|
|
23
|
+
using System.Text.Json;
|
|
24
|
+
using Microsoft.Extensions.DependencyInjection;
|
|
25
|
+
using Microsoft.Extensions.Hosting;
|
|
26
|
+
using Docuoria.Contracts;
|
|
27
|
+
using Docuoria.Registration;
|
|
28
|
+
using Docuoria.Serialization;
|
|
29
|
+
using Docuoria.Storage;
|
|
30
|
+
|
|
31
|
+
// Dev-time override: DOCUORIA_SDK_DLL can point at an alternate Docuoria.dll.
// The `#r` literal above resolves at script-compile time; the LoadFrom call below runs
// afterwards, when the script starts, so the override assembly is available for
// reflection-based DI lookups even when the literal succeeded.
if (Environment.GetEnvironmentVariable("DOCUORIA_SDK_DLL") is string __sdkPath
    && !string.IsNullOrWhiteSpace(__sdkPath))
{
    try
    {
        // Deliberately no File.Exists pre-check: an override that points at a missing file is
        // an override failure too. Assembly.LoadFrom throws for it, which routes a bad path
        // through the same visible error envelope instead of silently using the `#r` literal.
        Assembly.LoadFrom(__sdkPath);
    }
    catch (Exception __sdkLoadEx)
    {
        // WR-02: DOCUORIA_SDK_DLL is an explicit opt-in dev override. If the user set it
        // but loading failed, do NOT silently fall back to the #r literal — emit a structured
        // JSON error envelope to stderr and exit non-zero so the override failure is visible.
        var payload = new { error = new { code = "sdk-load-failed", message = $"DOCUORIA_SDK_DLL load failed: {__sdkLoadEx.Message}", detail = __sdkPath } };
        var json = JsonSerializer.Serialize(payload, DocuoriaJsonOptions.Default);
        Console.Error.WriteLine(json);
        Environment.Exit(1);
    }
}
|
|
54
|
+
|
|
55
|
+
/// <summary>
/// Factory and accessors for the Generic Host that carries the SDK engine and, when requested,
/// the template store selected on the command line.
/// </summary>
public static class ScriptHost
{
    /// <summary>
    /// Build an IHost that exposes IDocuoriaEngine and (optionally) ITemplateStoreProvider.
    /// </summary>
    /// <param name="args">Passed to Host.CreateDefaultBuilder for configuration binding, and
    /// scanned for the <c>--store-*</c> options when <paramref name="includeStore"/> is true.</param>
    /// <param name="includeStore">When false, no template store is registered. Scripts that
    /// never touch the store (inspect, test-pattern, test-groups, validate-template, dry-run)
    /// pass false so they do not force store configuration on irrelevant invocations.</param>
    /// <returns>The built host.</returns>
    public static IHost CreateHost(string[] args, bool includeStore = true)
    {
        return Host.CreateDefaultBuilder(args)
            .ConfigureServices(services =>
            {
                services.AddDocuoriaEngine(engineBuilder =>
                {
                    engineBuilder.AddBuiltInMatchRules();
                    engineBuilder.AddCsvOutputGenerator();
                    engineBuilder.AddJsonOutputGenerator();

                    if (includeStore)
                    {
                        RegisterStore(engineBuilder, args);
                    }
                });
            })
            .Build();
    }

    /// <summary>Resolve the engine from the host; throws if it was not registered.</summary>
    public static IDocuoriaEngine GetEngine(IHost host)
    {
        return host.Services.GetRequiredService<IDocuoriaEngine>();
    }

    /// <summary>Resolve the template store from the host, or null when none was registered.</summary>
    public static ITemplateStoreProvider? GetStore(IHost host)
    {
        return host.Services.GetService<ITemplateStoreProvider>();
    }

    /// <summary>
    /// Register either the API store (<c>--store-url</c>, with optional <c>--store-key</c>) or the
    /// local file store (<c>--store-path</c>, defaulting to <c>./templates</c>).
    /// </summary>
    /// <exception cref="InvalidOperationException">Both --store-path and --store-url were given.</exception>
    private static void RegisterStore(IDocuoriaEngineBuilder builder, string[] args)
    {
        var localDir = Cli.Get(args, "store-path");
        var apiUrl = Cli.Get(args, "store-url");
        var apiKey = Cli.Get(args, "store-key");

        var hasLocalDir = !string.IsNullOrWhiteSpace(localDir);
        var hasApiUrl = !string.IsNullOrWhiteSpace(apiUrl);

        if (hasLocalDir && hasApiUrl)
        {
            throw new InvalidOperationException(
                "--store-path and --store-url are mutually exclusive. Use --store-path for a local file store or --store-url for an API store.");
        }

        if (hasApiUrl)
        {
            var credentials = new ApiTemplateStoreCredentials { FunctionKey = apiKey };
            builder.AddApiTemplateStore(new Uri(apiUrl!), credentials);
            return;
        }

        // IN-01: directory is created lazily by LocalFileTemplateStoreProvider on first
        // write; ListAsync tolerates a missing root. Avoid littering arbitrary cwds with
        // an empty ./templates dir at script startup.
        builder.AddLocalTemplateStore(hasLocalDir ? localDir! : "./templates");
    }
}
|
|
117
|
+
|
|
118
|
+
/// <summary>Hand-rolled `--key value` / `--flag` arg parser (System.CommandLine deferred).</summary>
/// <remarks>Named <c>Cli</c> rather than <c>Args</c> because <c>dotnet-script</c> exposes a
/// top-level <c>Args</c> global (<see cref="System.Collections.Generic.IList{T}"/> of
/// <see cref="string"/>) that would shadow a static class of the same name.</remarks>
public static class Cli
{
    // Argument definitions and description captured by the most recent Help call.
    private static readonly List<(string Name, bool Required, string Description, bool IsFlag)> _registeredArgs = new();
    private static string? _scriptDescription;

    /// <summary>
    /// Register the script description and check for --help. Call at the top of each script.
    /// If <c>--help</c> (or <c>--h</c>) is present, prints usage to stdout and exits with code 0.
    /// </summary>
    /// <param name="args">The script's command-line arguments.</param>
    /// <param name="scriptName">File name shown in the usage line.</param>
    /// <param name="description">One-line description of the script.</param>
    /// <param name="argDefs">Supported arguments: name, whether required, description, and
    /// whether the argument is a value-less flag.</param>
    public static void Help(IList<string> args, string scriptName, string description, params (string Name, bool Required, string Description, bool IsFlag)[] argDefs)
    {
        _scriptDescription = description;
        _registeredArgs.Clear();
        _registeredArgs.AddRange(argDefs);

        if (!Has(args, "help") && !Has(args, "h"))
        {
            return;
        }

        Console.WriteLine();
        Console.WriteLine($" {scriptName}");
        Console.WriteLine($" {description}");
        Console.WriteLine();
        Console.WriteLine(" Usage:");
        Console.WriteLine($" dotnet script scripts/{scriptName} -- [args]");
        Console.WriteLine();
        Console.WriteLine(" Arguments:");
        foreach (var (name, required, desc, _) in argDefs)
        {
            var req = required ? "(required)" : "(optional)";
            Console.WriteLine($" --{name,-20} {req,-12} {desc}");
        }
        Console.WriteLine($" --{"help",-20} {"(optional)",-12} Show this help message");
        Console.WriteLine();
        Environment.Exit(0);
    }

    /// <summary>Returns the value following <c>--key</c>, or null when absent.</summary>
    /// <remarks>Only the first occurrence of <c>--key</c> is considered; if it is the last
    /// argument there is no value and null is returned.</remarks>
    public static string? Get(string[] args, string key) => ValueAfter(args, key);

    /// <inheritdoc cref="Get(string[], string)"/>
    public static string? Get(IList<string> args, string key) => ValueAfter(args, key);

    /// <summary>True when <c>--key</c> is present (regardless of any following value).</summary>
    public static bool Has(string[] args, string key) => IndexOfKey(args, key) >= 0;

    /// <inheritdoc cref="Has(string[], string)"/>
    public static bool Has(IList<string> args, string key) => IndexOfKey(args, key) >= 0;

    /// <summary>
    /// Required-arg lookup. On missing key emits a JSON error to stderr and exits with code 2.
    /// </summary>
    public static string Require(string[] args, string key) => RequireValue(args, key);

    /// <inheritdoc cref="Require(string[], string)"/>
    public static string Require(IList<string> args, string key) => RequireValue(args, key);

    // Index of the first argument equal (ordinal) to "--" + key, or -1 when absent or args is null.
    // Works directly on IList<string> (which string[] implements) so no overload copies the list.
    private static int IndexOfKey(IList<string>? args, string key)
    {
        if (args is null)
        {
            return -1;
        }

        var marker = "--" + key;
        for (var i = 0; i < args.Count; i++)
        {
            if (string.Equals(args[i], marker, StringComparison.Ordinal))
            {
                return i;
            }
        }
        return -1;
    }

    // Shared implementation of Get: the token after the first "--key", if there is one.
    private static string? ValueAfter(IList<string>? args, string key)
    {
        var index = IndexOfKey(args, key);
        if (index < 0)
        {
            return null;
        }
        return index + 1 < args!.Count ? args[index + 1] : null;
    }

    // Shared implementation of Require. JsonOut.Error is [DoesNotReturn], so the value is known
    // to be non-null on the return path without a null-forgiving operator.
    private static string RequireValue(IList<string>? args, string key)
    {
        var value = ValueAfter(args, key);
        if (value is null)
        {
            JsonOut.Error("missing-arg", $"--{key} is required", null, 2);
        }
        return value;
    }
}
|
|
204
|
+
|
|
205
|
+
/// <summary>
/// Helpers that emit JSON on stdout (results) and stderr (errors), serialized with
/// DocuoriaJsonOptions.Default (D-Area-1).
/// </summary>
public static class JsonOut
{
    /// <summary>
    /// Serialize <paramref name="value"/> using its runtime type (or <see cref="object"/> when it
    /// is null) and print the JSON as one stdout line.
    /// </summary>
    public static void Write(object value)
    {
        var runtimeType = value is null ? typeof(object) : value.GetType();
        Console.Out.WriteLine(JsonSerializer.Serialize(value, runtimeType, DocuoriaJsonOptions.Default));
    }

    /// <summary>Print an already-serialized JSON string to stdout as-is (no second serialization pass).</summary>
    public static void WriteRaw(string json) => Console.Out.WriteLine(json);

    /// <summary>
    /// Write an <c>{ error: { code, message, detail } }</c> envelope to stderr, then end the
    /// process with <paramref name="exitCode"/> (default 1). Nothing is written to stdout.
    /// The method never returns: <see cref="Environment.Exit(int)"/> terminates the process, and
    /// the final <c>throw</c> — not expected to execute — expresses the no-return contract to the
    /// C# flow analyzer so callers need no null-forgiving operator afterwards.
    /// </summary>
    [DoesNotReturn]
    public static void Error(string code, string message, string? detail = null, int exitCode = 1)
    {
        var envelope = new { error = new { code, message, detail } };
        Console.Error.WriteLine(JsonSerializer.Serialize(envelope, DocuoriaJsonOptions.Default));
        Environment.Exit(exitCode);
        throw new InvalidOperationException("Environment.Exit returned unexpectedly.");
    }
}
|
|
238
|
+
|
|
239
|
+
/// <summary>
/// Open the PDF at <paramref name="path"/> as a read-only FileStream that allows concurrent
/// readers. When the path is blank or no file exists there, a <c>pdf-not-found</c> JSON error is
/// written to stderr and the script exits with code 1.
/// </summary>
public static FileStream LoadPdf(string path)
{
    var missing = string.IsNullOrWhiteSpace(path) || !File.Exists(path);
    if (missing)
    {
        JsonOut.Error("pdf-not-found", $"PDF not found at '{path}'", null, 1);
    }

    return File.Open(path, FileMode.Open, FileAccess.Read, FileShare.Read);
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#load "_common.csx"
|
|
2
|
+
|
|
3
|
+
#nullable enable
|
|
4
|
+
|
|
5
|
+
// CLS-02 — ranked classification: evaluates every stored template and returns top matches
|
|
6
|
+
// sorted by effective confidence (ruleConfidence × extractionProbeScore, descending).
|
|
7
|
+
// Uses ClassifyRankedAsync to open the PDF once and evaluate all templates without
|
|
8
|
+
// redundant PDF parsing per template.
|
|
9
|
+
// Args: --pdf <path> [--top N] [--store-path <dir> | --store-url <url> [--store-key <key>]]
|
|
10
|
+
// stdout: { matches: [ { templateId, confidence }, ... ] }
|
|
11
|
+
|
|
12
|
+
using Docuoria.Contracts;
|
|
13
|
+
using Docuoria.Storage;
|
|
14
|
+
|
|
15
|
+
try
{
    Cli.Help(Args, "classify.csx", "Classify a PDF against stored templates (ranked by confidence)",
        ("pdf", true, "Path to the source PDF", false),
        ("top", false, "Maximum number of results to return (default: 5)", false),
        ("store-path", false, "Local template store directory (default: ./templates)", false),
        ("store-url", false, "API template store URL (mutually exclusive with --store-path)", false),
        ("store-key", false, "Function key for API store authentication", false));

    var pdfPath = Cli.Require(Args, "pdf");

    // --top is optional and defaults to 5. A value that was supplied but is not an integer is
    // reported as a usage error (exit code 2, like dry-run's --preview-as validation) instead
    // of being silently replaced by the default. Parsing is culture-invariant so the same
    // command line behaves identically on every machine.
    var topN = 5;
    if (Cli.Get(Args, "top") is string topArg
        && !int.TryParse(topArg, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out topN))
    {
        JsonOut.Error("bad-arg", $"--top must be an integer (got '{topArg}')", null, 2);
    }

    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: true);
    var engine = ScriptHost.GetEngine(host);

    using var pdf = LoadPdf(pdfPath);

    // OPT: Single engine call opens the PDF once and evaluates all templates internally,
    // replacing the previous per-template loop that re-parsed the PDF for each template.
    var classifications = await engine.ClassifyRankedAsync(pdf, topN);

    // Project each classification to the public output shape; the reported confidence is
    // ruleConfidence × extractionProbeScore, rounded to four decimal places.
    var ranked = classifications
        .Select(c => new
        {
            templateId = c.TemplateIdentifier,
            confidence = Math.Round(c.RuleConfidence * c.ExtractionProbeScore, 4),
        })
        .ToArray();

    // Copy the shared options but never omit properties, so every match carries both fields.
    var opts = new System.Text.Json.JsonSerializerOptions(Docuoria.Serialization.DocuoriaJsonOptions.Default)
    {
        DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.Never,
    };
    JsonOut.WriteRaw(System.Text.Json.JsonSerializer.Serialize(new { matches = ranked }, opts));
}
catch (Exception ex)
{
    // Any failure not handled above is reported as a structured error on stderr (exit code 1).
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
#load "_common.csx"
|
|
2
|
+
|
|
3
|
+
#nullable enable
|
|
4
|
+
|
|
5
|
+
// SCR-05 — Wrapper over IDocuoriaEngine.DryRunAsync (extraction + transformation only).
|
|
6
|
+
// Args: --pdf <path> --template <file.json> [--preview-as csv|json]
|
|
7
|
+
// stdout: { kind, result } — discriminator + payload.
|
|
8
|
+
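// With --preview-as: { kind, format, preview } on success — runs the full pipeline and returns the formatted output as a string; rejected/failed results go to stderr as an error envelope.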
|
|
9
|
+
|
|
10
|
+
using System.Text;
|
|
11
|
+
using Docuoria.Configuration;
|
|
12
|
+
using Docuoria.Contracts;
|
|
13
|
+
using Docuoria.Models;
|
|
14
|
+
using Docuoria.Output.Csv;
|
|
15
|
+
using Docuoria.Output.Json;
|
|
16
|
+
using Docuoria.Results;
|
|
17
|
+
|
|
18
|
+
try
{
    Cli.Help(Args, "dry-run.csx", "Run extraction + transformation without output generation",
        ("pdf", true, "Path to the source PDF", false),
        ("template", true, "Path to the template JSON file", false),
        ("preview-as", false, "Preview formatted output: csv or json (no file written)", false));

    var pdfPath = Cli.Require(Args, "pdf");
    var templatePath = Cli.Require(Args, "template");
    var previewAs = Cli.Get(Args, "preview-as")?.Trim().ToLowerInvariant();

    // Only an absent value, "csv", or "json" is accepted; anything else is a usage error.
    if (previewAs is not (null or "csv" or "json"))
    {
        JsonOut.Error("bad-format", "--preview-as must be csv or json", null, 2);
    }

    if (!File.Exists(templatePath))
    {
        JsonOut.Error("template-not-found", $"Template not found at '{templatePath}'", null, 1);
    }

    var template = Template.FromJson(File.ReadAllText(templatePath));

    // This script never touches the template store, so none is registered.
    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: false);
    var engine = ScriptHost.GetEngine(host);

    using var pdf = LoadPdf(pdfPath);

    if (previewAs is null)
    {
        // Standard dry-run: extraction + transformation only.
        var dryRun = await engine.DryRunAsync(pdf, template);
        JsonOut.Write(new { kind = dryRun.GetType().Name, result = dryRun });
    }
    else
    {
        // Preview mode: full execute, but output goes to stdout preview instead of disk.
        ProcessingResult outcome;
        if (previewAs == "csv")
        {
            outcome = await engine.ExecuteTemplateAsync<CsvOutputGenerator, CsvGeneratorOptions>(
                pdf, template, new CsvGeneratorOptions());
        }
        else if (previewAs == "json")
        {
            outcome = await engine.ExecuteTemplateAsync<JsonOutputGenerator, JsonGeneratorOptions>(
                pdf, template, new JsonGeneratorOptions());
        }
        else
        {
            throw new InvalidOperationException("unreachable");
        }

        if (outcome is SucceededResult ok)
        {
            // The generator payload is decoded as UTF-8 text and embedded in the JSON envelope.
            var rendered = Encoding.UTF8.GetString(ok.Output.Payload.Span);
            JsonOut.Write(new { kind = "SucceededResult", format = previewAs, preview = rendered });
        }
        else if (outcome is RejectedResult rej)
        {
            var detailSuffix = rej.Detail is not null ? $": {rej.Detail}" : "";
            JsonOut.Error("rejected", $"Rejected ({rej.Reason}){detailSuffix}", null, 1);
        }
        else if (outcome is FailedResult fail)
        {
            JsonOut.Error("failed", fail.ErrorMessage, fail.InnerDetail, 1);
        }
        else
        {
            JsonOut.Error("unknown-result", outcome.GetType().Name, null, 1);
        }
    }
}
catch (Exception ex)
{
    // Any failure not handled above is reported as a structured error on stderr (exit code 1).
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
#load "_common.csx"
|
|
2
|
+
|
|
3
|
+
#nullable enable
|
|
4
|
+
|
|
5
|
+
// SCR-07 + CLS-01 helper — wrapper over IDocuoriaEngine.EvaluateMatchAsync.
|
|
6
|
+
// Args: --pdf <path> --template <id-or-file>
|
|
7
|
+
// Auto-detects template source per Area-3: contains path sep OR ends .json OR File.Exists -> file;
|
|
8
|
+
// otherwise loads from store via ITemplateStoreProvider.LoadAsync(value).
|
|
9
|
+
// stdout: { confidence, matchedRules } JSON.
|
|
10
|
+
|
|
11
|
+
using Docuoria.Contracts;
|
|
12
|
+
using Docuoria.Models;
|
|
13
|
+
using Docuoria.Storage;
|
|
14
|
+
|
|
15
|
+
// evaluate-match.csx script body.
// Resolves --template either from a JSON file or from the template store, evaluates its match
// rule against the PDF via engine.EvaluateMatchAsync, and writes { confidence, matchedRules }.
// Cli, JsonOut, ScriptHost and LoadPdf are not defined in this script (presumably they come from
// _common.csx). NOTE(review): the null-forgiving operators below only make sense if
// JsonOut.Error does not return — confirm against its definition.
try
{
    Cli.Help(
        Args,
        "evaluate-match.csx",
        "Evaluate a template's match rule against a PDF",
        ("pdf", true, "Path to the source PDF", false),
        ("template", true, "Template ID or path to template JSON file", false),
        ("store-path", false, "Local template store directory (default: ./templates)", false),
        ("store-url", false, "API template store URL (mutually exclusive with --store-path)", false),
        ("store-key", false, "Function key for API store authentication", false));

    var sourcePdfPath = Cli.Require(Args, "pdf");
    var templateArg = Cli.Require(Args, "template");

    // A value is treated as a file path when it contains a path separator, ends in ".json"
    // (case-insensitive), or names an existing file; anything else is a store identifier.
    bool isFileReference =
        templateArg.IndexOf('/') >= 0 ||
        templateArg.IndexOf('\\') >= 0 ||
        templateArg.EndsWith(".json", StringComparison.OrdinalIgnoreCase) ||
        File.Exists(templateArg);

    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: true);
    var engine = ScriptHost.GetEngine(host);

    Template? resolved;
    if (!isFileReference)
    {
        // Store lookup: requires a registered provider and an existing template id.
        var provider = ScriptHost.GetStore(host);
        if (provider is null)
        {
            JsonOut.Error("no-store", "ITemplateStoreProvider is not registered.", null, 1);
        }

        resolved = await provider!.LoadAsync(templateArg);
        if (resolved is null)
        {
            JsonOut.Error("not-found", $"template '{templateArg}' not found in store", null, 1);
        }
    }
    else
    {
        // File lookup: the path must exist before it is read and parsed.
        if (!File.Exists(templateArg))
        {
            JsonOut.Error("template-not-found", $"Template file not found at '{templateArg}'", null, 1);
        }

        resolved = Template.FromJson(File.ReadAllText(templateArg));
    }

    using var pdf = LoadPdf(sourcePdfPath);
    var evaluation = await engine.EvaluateMatchAsync(resolved!, pdf);

    // The response carries one aggregated confidence (rule confidence multiplied by the
    // extraction probe score, rounded to four decimals) plus the matched rules.
    var confidence = Math.Round(evaluation.RuleConfidence * evaluation.ExtractionProbeScore, 4);
    var matchedRules = evaluation.MatchedRules;
    JsonOut.Write(new { confidence, matchedRules });
}
catch (Exception ex)
{
    // Anything that escapes the steps above is reported in the same JSON error shape.
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
|