@sidub-inc/docuoria.cli 1.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1056 -0
- package/package.json +56 -0
- package/payload/.claude-plugin/plugin.json +21 -0
- package/payload/MANIFEST.json +322 -0
- package/payload/SKILL.md +88 -0
- package/payload/assets/lib/Docuoria.dll +0 -0
- package/payload/assets/schemas/template-schema.json +413 -0
- package/payload/commands/classify.md +11 -0
- package/payload/commands/diagnose.md +11 -0
- package/payload/commands/extract.md +11 -0
- package/payload/commands/inspect.md +11 -0
- package/payload/commands/validate-template.md +11 -0
- package/payload/examples/01-extract-to-csv.md +49 -0
- package/payload/examples/02-classify-unknown-pdf.md +102 -0
- package/payload/examples/03-diagnose-failed-result.md +68 -0
- package/payload/references/classification.md +363 -0
- package/payload/references/decision-tree.md +43 -0
- package/payload/references/failure-tree.md +169 -0
- package/payload/references/pattern-authoring.md +40 -0
- package/payload/references/patterns.md +97 -0
- package/payload/references/privacy.md +36 -0
- package/payload/references/scripts.md +361 -0
- package/payload/references/template-reference.md +606 -0
- package/payload/references/workflow.md +163 -0
- package/payload/scripts/_common.csx +250 -0
- package/payload/scripts/classify.csx +53 -0
- package/payload/scripts/dry-run.csx +85 -0
- package/payload/scripts/evaluate-match.csx +72 -0
- package/payload/scripts/execute.csx +89 -0
- package/payload/scripts/inspect.csx +43 -0
- package/payload/scripts/list-templates.csx +34 -0
- package/payload/scripts/load-template.csx +54 -0
- package/payload/scripts/save-template.csx +53 -0
- package/payload/scripts/schema-info.csx +84 -0
- package/payload/scripts/test-groups.csx +44 -0
- package/payload/scripts/test-pattern.csx +61 -0
- package/payload/scripts/validate-template.csx +54 -0
- package/payload/skill/SKILL.md +88 -0
- package/payload/skill/assets/lib/Docuoria.dll +0 -0
- package/payload/skill/assets/schemas/template-schema.json +413 -0
- package/payload/skill/examples/01-extract-to-csv.md +49 -0
- package/payload/skill/examples/02-classify-unknown-pdf.md +102 -0
- package/payload/skill/examples/03-diagnose-failed-result.md +68 -0
- package/payload/skill/references/classification.md +363 -0
- package/payload/skill/references/decision-tree.md +43 -0
- package/payload/skill/references/failure-tree.md +169 -0
- package/payload/skill/references/pattern-authoring.md +40 -0
- package/payload/skill/references/patterns.md +97 -0
- package/payload/skill/references/privacy.md +36 -0
- package/payload/skill/references/scripts.md +361 -0
- package/payload/skill/references/template-reference.md +606 -0
- package/payload/skill/references/workflow.md +163 -0
- package/payload/skill/scripts/_common.csx +250 -0
- package/payload/skill/scripts/classify.csx +53 -0
- package/payload/skill/scripts/dry-run.csx +85 -0
- package/payload/skill/scripts/evaluate-match.csx +72 -0
- package/payload/skill/scripts/execute.csx +89 -0
- package/payload/skill/scripts/inspect.csx +43 -0
- package/payload/skill/scripts/list-templates.csx +34 -0
- package/payload/skill/scripts/load-template.csx +54 -0
- package/payload/skill/scripts/save-template.csx +53 -0
- package/payload/skill/scripts/schema-info.csx +84 -0
- package/payload/skill/scripts/test-groups.csx +44 -0
- package/payload/skill/scripts/test-pattern.csx +61 -0
- package/payload/skill/scripts/validate-template.csx +54 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
#load "_common.csx"
|
|
2
|
+
|
|
3
|
+
#nullable enable
|
|
4
|
+
|
|
5
|
+
// SCR-06 — Wrapper over IDocuoriaEngine.ExecuteTemplateAsync<TGenerator, TOptions>.
|
|
6
|
+
// Args: --pdf <path> --template <file.json> --format csv|json [--output <path>]
|
|
7
|
+
// Success: SucceededResult → write payload bytes (to --output or wrap as string in stdout JSON).
|
|
8
|
+
// Rejected/Failed: emit { status, result } and exit 1.
|
|
9
|
+
|
|
10
|
+
using System.Text;
|
|
11
|
+
using Docuoria.Configuration;
|
|
12
|
+
using Docuoria.Contracts;
|
|
13
|
+
using Docuoria.Models;
|
|
14
|
+
using Docuoria.Output.Csv;
|
|
15
|
+
using Docuoria.Output.Json;
|
|
16
|
+
using Docuoria.Results;
|
|
17
|
+
|
|
18
|
+
// Entry point for execute.csx.
// Reads the flags, loads the template JSON from disk, runs the engine with the
// generator that matches --format, and reports the outcome through JsonOut.
// Anything thrown along the way is reported with the "unhandled" error code.
try
{
    Cli.Help(
        Args,
        "execute.csx",
        "Full pipeline run with output generation (CSV or JSON)",
        ("pdf", true, "Path to the source PDF", false),
        ("template", true, "Path to the template JSON file", false),
        ("format", true, "Output format: csv or json", false),
        ("output", false, "Write output to this file path (default: stdout)", false));

    // Required flags are read in declaration order; --output is optional.
    var sourcePdfPath = Cli.Require(Args, "pdf");
    var templateFile = Cli.Require(Args, "template");
    var format = Cli.Require(Args, "format").Trim().ToLowerInvariant();
    var destination = Cli.Get(Args, "output");

    // Only the two formats dispatched below are accepted.
    if (format is not ("csv" or "json"))
    {
        JsonOut.Error("bad-format", "expected csv|json", null, 2);
    }

    var templateExists = File.Exists(templateFile);
    if (!templateExists)
    {
        JsonOut.Error("template-not-found", $"Template not found at '{templateFile}'", null, 1);
    }

    var templateJson = File.ReadAllText(templateFile);
    var template = Template.FromJson(templateJson);

    // The store is not needed here: the template comes straight from the file.
    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: false);
    var engine = ScriptHost.GetEngine(host);

    using var pdf = LoadPdf(sourcePdfPath);

    // Pick the generator pair that corresponds to the requested format.
    ProcessingResult result;
    if (format == "csv")
    {
        result = await engine.ExecuteTemplateAsync<CsvOutputGenerator, CsvGeneratorOptions>(
            pdf, template, new CsvGeneratorOptions());
    }
    else if (format == "json")
    {
        result = await engine.ExecuteTemplateAsync<JsonOutputGenerator, JsonGeneratorOptions>(
            pdf, template, new JsonGeneratorOptions());
    }
    else
    {
        throw new InvalidOperationException("unreachable");
    }

    if (result is SucceededResult ok)
    {
        var payload = ok.Output.Payload;
        if (string.IsNullOrEmpty(destination))
        {
            // No --output: embed the payload, decoded as UTF-8, in the stdout JSON.
            var text = Encoding.UTF8.GetString(payload.Span);
            JsonOut.Write(new { status = "ok", format, output = text });
        }
        else
        {
            // --output given: make sure the parent directory exists, then write the raw bytes.
            var fullPath = Path.GetFullPath(destination!);
            Directory.CreateDirectory(Path.GetDirectoryName(fullPath)!);
            await File.WriteAllBytesAsync(destination!, payload.ToArray());
            JsonOut.Write(new { status = "ok", path = destination });
        }
    }
    else if (result is RejectedResult rej)
    {
        // The detail is appended only when the result carries one.
        var detailSuffix = rej.Detail is not null ? $": {rej.Detail}" : "";
        JsonOut.Error("rejected", $"Rejected ({rej.Reason}){detailSuffix}", null, 1);
    }
    else if (result is FailedResult fail)
    {
        JsonOut.Error("failed", fail.ErrorMessage, fail.InnerDetail, 1);
    }
    else
    {
        // Any other result type is reported by its runtime type name.
        JsonOut.Error("unknown-result", result.GetType().Name, null, 1);
    }
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
#load "_common.csx"
|
|
2
|
+
|
|
3
|
+
#nullable enable
|
|
4
|
+
|
|
5
|
+
// SCR-01 — Agent-facing wrapper over IDocuoriaEngine.InspectAsync (read-only PDF probe).
|
|
6
|
+
// Args: --pdf <path> [--page <n>]
|
|
7
|
+
// stdout: PdfInspection JSON
|
|
8
|
+
// Exit codes: 0 success, 1 handled error, 2 bad args.
|
|
9
|
+
|
|
10
|
+
using Docuoria.Configuration;
|
|
11
|
+
using Docuoria.Contracts;
|
|
12
|
+
using Docuoria.Models;
|
|
13
|
+
|
|
14
|
+
// Entry point for inspect.csx.
// Reads --pdf and the optional --page, validates --page, then asks the engine
// to inspect the PDF and writes the inspection result as JSON.
// Anything thrown along the way is reported with the "unhandled" error code.
try
{
    Cli.Help(Args, "inspect.csx", "Inspect PDF structure (page count, text blocks, tables)",
        ("pdf", true, "Path to the source PDF", false),
        ("page", false, "1-based page index (default: all pages)", false));

    var pdfPath = Cli.Require(Args, "pdf");
    var pageStr = Cli.Get(Args, "page");

    // Validate --page BEFORE building the host so that a bad argument is
    // reported as "bad-arg" without first constructing (and then abandoning)
    // the host. The parse is culture-invariant because the value is a
    // machine-readable CLI argument, not user-locale text.
    PageFilter? filter = null;
    if (!string.IsNullOrWhiteSpace(pageStr))
    {
        if (!int.TryParse(
                pageStr,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out var page)
            || page < 1)
        {
            JsonOut.Error("bad-arg", "--page must be a positive integer", null, 2);
            // Defensive: stop here even if JsonOut.Error were to return,
            // so an invalid page number never reaches PageFilter.SinglePage.
            return;
        }
        filter = PageFilter.SinglePage(page);
    }

    // The store is not needed for inspection.
    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: false);
    var engine = ScriptHost.GetEngine(host);

    using var pdf = LoadPdf(pdfPath);
    // A null filter is passed through when --page was not supplied.
    var result = await engine.InspectAsync(pdf, filter);
    JsonOut.Write(result);
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#load "_common.csx"
|
|
2
|
+
|
|
3
|
+
#nullable enable
|
|
4
|
+
|
|
5
|
+
// SCR-08 — enumerate ITemplateStoreProvider.ListAsync.
|
|
6
|
+
// stdout: { templates: [..ids..] }
|
|
7
|
+
|
|
8
|
+
using Docuoria.Storage;
|
|
9
|
+
|
|
10
|
+
// Entry point for list-templates.csx.
// Resolves the template store from the host, collects every identifier the
// store enumerates, and writes them as { templates: [...] }.
// Anything thrown along the way is reported with the "unhandled" error code.
try
{
    Cli.Help(
        Args,
        "list-templates.csx",
        "List all template IDs in the configured store",
        ("store-path", false, "Local template store directory (default: ./templates)", false),
        ("store-url", false, "API template store URL (mutually exclusive with --store-path)", false),
        ("store-key", false, "Function key for API store authentication", false));

    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: true);
    var templateStore = ScriptHost.GetStore(host);
    if (templateStore is null)
    {
        JsonOut.Error("no-store", "ITemplateStoreProvider is not registered. Pass --store-path <dir> to specify a local template store directory.", null, 1);
    }

    // NOTE(review): the null-forgiving operator below relies on JsonOut.Error
    // not returning — confirm against _common.csx.
    List<string> templateIds = new();
    await foreach (var templateId in templateStore!.ListAsync())
    {
        templateIds.Add(templateId);
    }

    JsonOut.Write(new { templates = templateIds });
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#load "_common.csx"
|
|
2
|
+
|
|
3
|
+
#nullable enable
|
|
4
|
+
|
|
5
|
+
// SCR-09 — fetch a template by ID and emit its JSON.
|
|
6
|
+
// Args: --id <identifier> [--output <path>]
|
|
7
|
+
// stdout: parsed template JSON or { status, path }.
|
|
8
|
+
|
|
9
|
+
using System.Text.Json;
|
|
10
|
+
using Docuoria.Serialization;
|
|
11
|
+
using Docuoria.Storage;
|
|
12
|
+
|
|
13
|
+
// Entry point for load-template.csx.
// Loads the template with the given --id from the store and either writes its
// JSON to --output or emits the parsed JSON on stdout.
// Anything thrown along the way is reported with the "unhandled" error code.
try
{
    Cli.Help(
        Args,
        "load-template.csx",
        "Fetch a template by ID and emit its JSON",
        ("id", true, "Template identifier", false),
        ("output", false, "Write template JSON to this file path", false),
        ("store-path", false, "Local template store directory (default: ./templates)", false),
        ("store-url", false, "API template store URL (mutually exclusive with --store-path)", false),
        ("store-key", false, "Function key for API store authentication", false));

    var templateId = Cli.Require(Args, "id");
    var destination = Cli.Get(Args, "output");

    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: true);
    var templateStore = ScriptHost.GetStore(host);
    if (templateStore is null)
    {
        JsonOut.Error("no-store", "ITemplateStoreProvider is not registered. Pass --store-path <dir> to specify a local template store directory.", null, 1);
    }

    // NOTE(review): the null-forgiving operators below rely on JsonOut.Error
    // not returning — confirm against _common.csx.
    var loaded = await templateStore!.LoadAsync(templateId);
    if (loaded is null)
    {
        JsonOut.Error("not-found", $"Template '{templateId}' not found. Run list-templates.csx to see available template IDs.", null, 1);
    }

    var json = loaded!.ToJson();
    if (string.IsNullOrEmpty(destination))
    {
        // No --output: re-parse the JSON text so it is written as a JSON value
        // rather than as a quoted string.
        var element = JsonSerializer.Deserialize<JsonElement>(json, DocuoriaJsonOptions.Default);
        JsonOut.Write(element);
    }
    else
    {
        // --output given: make sure the parent directory exists, then write the text.
        var fullPath = Path.GetFullPath(destination!);
        Directory.CreateDirectory(Path.GetDirectoryName(fullPath)!);
        await File.WriteAllTextAsync(destination!, json);
        JsonOut.Write(new { status = "ok", path = destination });
    }
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#load "_common.csx"
|
|
2
|
+
|
|
3
|
+
#nullable enable
|
|
4
|
+
|
|
5
|
+
// SCR-10 — persist a template JSON file to the configured store.
|
|
6
|
+
// Args: --template <path> [--overwrite] [--store-path <dir>] [--store-url <url>] [--store-key <key>]
|
|
7
|
+
// stdout: { status, identifier }.
|
|
8
|
+
|
|
9
|
+
using Docuoria.Models;
|
|
10
|
+
using Docuoria.Storage;
|
|
11
|
+
using Docuoria.Storage.Exceptions;
|
|
12
|
+
|
|
13
|
+
// Entry point for save-template.csx.
// Parses the template JSON file named by --template and saves it to the
// configured store, honouring --overwrite.
// Anything thrown along the way is reported with the "unhandled" error code.
try
{
    Cli.Help(
        Args,
        "save-template.csx",
        "Persist a template JSON file to the configured store",
        ("template", true, "Path to the template JSON file", false),
        ("overwrite", false, "Overwrite if template already exists", true),
        ("store-path", false, "Local template store directory (default: ./templates)", false),
        ("store-url", false, "API template store URL (mutually exclusive with --store-path)", false),
        ("store-key", false, "Function key for API store authentication", false));

    var templateFile = Cli.Require(Args, "template");
    var allowOverwrite = Cli.Has(Args, "overwrite");

    // The file is checked before the host is built.
    var templateExists = File.Exists(templateFile);
    if (!templateExists)
    {
        JsonOut.Error("template-not-found", $"Template file not found at '{templateFile}'", null, 1);
    }

    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: true);
    var templateStore = ScriptHost.GetStore(host);
    if (templateStore is null)
    {
        JsonOut.Error("no-store", "ITemplateStoreProvider is not registered. Pass --store-path <dir> to specify a local template store directory.", null, 1);
    }

    var templateJson = File.ReadAllText(templateFile);
    var parsed = Template.FromJson(templateJson);

    // An "already exists" failure gets its own error code and a hint about --overwrite.
    // NOTE(review): the null-forgiving operator relies on JsonOut.Error not
    // returning — confirm against _common.csx.
    try
    {
        await templateStore!.SaveAsync(parsed, allowOverwrite);
    }
    catch (TemplateAlreadyExistsException tae)
    {
        JsonOut.Error("already-exists", $"{tae.Message} Use --overwrite to replace the existing template.", null, 1);
    }

    JsonOut.Write(new { status = "ok", identifier = parsed.Identifier });
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
#load "_common.csx"
|
|
2
|
+
|
|
3
|
+
#nullable enable
|
|
4
|
+
|
|
5
|
+
// SCR-12 — Dump all type/enum/mode information for the SDK.
|
|
6
|
+
// Args: (none)
|
|
7
|
+
// stdout: JSON object with fieldTypes, extractionSources, modes, matchRules, subFieldMappings, metadataFields, notes.
|
|
8
|
+
|
|
9
|
+
using System.Reflection;
|
|
10
|
+
using Docuoria.Models;
|
|
11
|
+
|
|
12
|
+
// Entry point for schema-info.csx.
// Emits one JSON object describing the FieldType enum values plus fixed lists
// of extraction-source, match-rule, sub-field-mapping and metadata-field names,
// the mode/operator values per type, and two authoring notes.
// Anything thrown along the way is reported with the "unhandled" error code.
try
{
    Cli.Help(Args, "schema-info.csx", "List all SDK types, enums, modes, and valid values");

    // Numeric value -> member name for each value returned by Enum.GetValues<FieldType>().
    Dictionary<int, string> fieldTypes = new();
    foreach (FieldType member in Enum.GetValues<FieldType>())
    {
        fieldTypes[(int)member] = member.ToString();
    }

    string[] extractionSources =
    {
        "TextPatternExtractionSource",
        "TextAnchorExtractionSource",
        "TableCellExtractionSource",
        "TableRowsExtractionSource",
        "MetadataFieldExtractionSource",
        "FallbackExtractionSource",
    };

    // Values are anonymous objects; the property names become the JSON keys.
    var modes = new Dictionary<string, object>
    {
        {
            "TextPatternExtractionSource",
            new
            {
                mode = new[] { "Token", "Pattern", "AllMatches" },
                notes = new
                {
                    Token = "requires 'literalToken'",
                    Pattern = "requires 'regexPattern'",
                    AllMatches = "requires 'regexPattern'",
                },
            }
        },
        { "TableRowsExtractionSource", new { mode = new[] { "ByHeader", "Ordinal" } } },
        { "TextPatternMatchRule", new { mode = new[] { "AnyToken", "AllTokens" } } },
        { "FileNameMatchRule", new { mode = new[] { "Glob", "Regex" } } },
        { "CompositeMatchRule", new { @operator = new[] { "And", "Or", "Not" } } },
    };

    string[] matchRules =
    {
        "TextPatternMatchRule",
        "FileNameMatchRule",
        "TextAnchorMatchRule",
        "MetadataMatchRule",
        "PageGeometryMatchRule",
        "TableMatchRule",
        "CompositeMatchRule",
    };

    string[] subFieldMappings =
    {
        "NamedGroupSubFieldMapping",
        "RegexGroupSubFieldMapping",
        "HeaderSubFieldMapping",
        "OrdinalSubFieldMapping",
    };

    string[] metadataFields =
    {
        "Title", "Author", "Subject", "Keywords",
        "Creator", "Producer", "CreationDate", "ModifiedDate",
    };

    var notes = new
    {
        fieldType = "fieldType serializes as INTEGER (0-5), not string. Use the integer value in template JSON.",
        kind = "All polymorphic types use '$kind' discriminator with the CLR class name.",
    };

    // Property order here is the order of the keys in the emitted object.
    JsonOut.Write(new
    {
        fieldTypes,
        extractionSources,
        modes,
        matchRules,
        subFieldMappings,
        metadataFields,
        notes,
    });
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
#load "_common.csx"
|
|
2
|
+
|
|
3
|
+
#nullable enable
|
|
4
|
+
|
|
5
|
+
// SCR-03 — Wrapper over IDocuoriaEngine.TestGroupsAsync.
|
|
6
|
+
// Args: --pattern <regex> --pdf <path> [--page <n>]
|
|
7
|
+
// stdout: PatternGroupTestResult JSON.
|
|
8
|
+
|
|
9
|
+
using Docuoria.Configuration;
|
|
10
|
+
using Docuoria.Contracts;
|
|
11
|
+
using Docuoria.Models;
|
|
12
|
+
|
|
13
|
+
// Entry point for test-groups.csx.
// Reads --pattern, --pdf and the optional --page, validates --page, then asks
// the engine to test the pattern's groups against the PDF and writes the result as JSON.
// Anything thrown along the way is reported with the "unhandled" error code.
try
{
    Cli.Help(Args, "test-groups.csx", "Test each capture group of a regex independently",
        ("pdf", true, "Path to the source PDF", false),
        ("pattern", true, "Multi-group regex pattern", false),
        ("page", false, "1-based page index (default: all pages)", false));

    var pattern = Cli.Require(Args, "pattern");
    var pdfPath = Cli.Require(Args, "pdf");
    var pageStr = Cli.Get(Args, "page");

    // Validate --page BEFORE building the host so that a bad argument is
    // reported as "bad-arg" without first constructing (and then abandoning)
    // the host. The parse is culture-invariant because the value is a
    // machine-readable CLI argument, not user-locale text.
    PatternTestOptions? options = null;
    if (!string.IsNullOrWhiteSpace(pageStr))
    {
        if (!int.TryParse(
                pageStr,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out var page)
            || page < 1)
        {
            JsonOut.Error("bad-arg", "--page must be a positive integer", null, 2);
            // Defensive: stop here even if JsonOut.Error were to return,
            // so an invalid page number never reaches PageFilter.SinglePage.
            return;
        }
        options = new PatternTestOptions { PageFilter = PageFilter.SinglePage(page) };
    }

    // The store is not needed for pattern testing.
    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: false);
    var engine = ScriptHost.GetEngine(host);

    using var pdf = LoadPdf(pdfPath);
    // Null options are passed through when --page was not supplied.
    var result = await engine.TestGroupsAsync(pdf, pattern, options);
    JsonOut.Write(result);
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
#load "_common.csx"
|
|
2
|
+
|
|
3
|
+
#nullable enable
|
|
4
|
+
|
|
5
|
+
// SCR-02 — Wrapper over IDocuoriaEngine.TestPatternAsync.
|
|
6
|
+
// Args: --pattern <regex> --pdf <path> [--page <n>] [--block-separator <str>]
|
|
7
|
+
// stdout: PatternTestResult JSON (Error round-trips per Phase 24 — do NOT swallow).
|
|
8
|
+
|
|
9
|
+
using Docuoria.Configuration;
|
|
10
|
+
using Docuoria.Contracts;
|
|
11
|
+
using Docuoria.Models;
|
|
12
|
+
|
|
13
|
+
// Entry point for test-pattern.csx.
// Reads --pattern, --pdf and the optional --page / --block-separator flags,
// validates --page, then asks the engine to test the pattern against the PDF
// and writes the result as JSON.
// Anything thrown along the way is reported with the "unhandled" error code.
try
{
    Cli.Help(Args, "test-pattern.csx", "Test a regex pattern against PDF text",
        ("pdf", true, "Path to the source PDF", false),
        ("pattern", true, "Regex pattern to test", false),
        ("page", false, "1-based page index (default: all pages)", false),
        ("block-separator", false, "Override block separator for text flattening", false));

    var pattern = Cli.Require(Args, "pattern");
    var pdfPath = Cli.Require(Args, "pdf");
    var blockSep = Cli.Get(Args, "block-separator");
    var pageStr = Cli.Get(Args, "page");

    // Validate --page BEFORE building the host so that a bad argument is
    // reported as "bad-arg" without first constructing (and then abandoning)
    // the host. The parse is culture-invariant because the value is a
    // machine-readable CLI argument, not user-locale text.
    PageFilter? pageFilter = null;
    if (!string.IsNullOrWhiteSpace(pageStr))
    {
        if (!int.TryParse(
                pageStr,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out var page)
            || page < 1)
        {
            JsonOut.Error("bad-arg", "--page must be a positive integer", null, 2);
            // Defensive: stop here even if JsonOut.Error were to return,
            // so an invalid page number never reaches PageFilter.SinglePage.
            return;
        }
        pageFilter = PageFilter.SinglePage(page);
    }

    // Options are only built when at least one optional flag was supplied;
    // otherwise null is passed to the engine.
    // NOTE(review): an explicitly empty --block-separator is treated as "not
    // supplied" by the condition but is still passed through as "" when --page
    // is also given — confirm which behavior is intended.
    PatternTestOptions? options = null;
    if (!string.IsNullOrEmpty(blockSep) || pageFilter is not null)
    {
        options = new PatternTestOptions
        {
            BlockSeparator = blockSep ?? "\n",
            PageFilter = pageFilter
        };
    }

    // The store is not needed for pattern testing.
    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: false);
    var engine = ScriptHost.GetEngine(host);

    using var pdf = LoadPdf(pdfPath);

    var result = await engine.TestPatternAsync(pdf, pattern, options);
    JsonOut.Write(result);
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#load "_common.csx"
|
|
2
|
+
|
|
3
|
+
#nullable enable
|
|
4
|
+
|
|
5
|
+
// SCR-04 — Template.FromJson + Validate() static check (no engine or script host is created).
|
|
6
|
+
// Args: --template <file.json>
|
|
7
|
+
// stdout: { valid, errors }
|
|
8
|
+
|
|
9
|
+
using System.Text.Json;
|
|
10
|
+
using Docuoria.Models;
|
|
11
|
+
|
|
12
|
+
// Entry point for validate-template.csx.
// Reads the template JSON file named by --template, parses it with
// Template.FromJson, runs Validate(), and writes { valid, errors }.
// A JsonException during parsing is reported as "parse-error" (with a hint for
// likely fieldType mistakes); anything else thrown is reported as "unhandled".
try
{
    Cli.Help(Args, "validate-template.csx", "Validate a template JSON file against the schema",
        ("template", true, "Path to the template JSON file", false));

    var templatePath = Cli.Require(Args, "template");
    if (!File.Exists(templatePath))
    {
        JsonOut.Error("template-not-found", $"Template not found at '{templatePath}'", null, 1);
        // Stop explicitly, matching the parse-error path below, so the missing
        // file is never read even if JsonOut.Error were to return.
        return;
    }

    var json = File.ReadAllText(templatePath);

    Template tpl;
    try
    {
        tpl = Template.FromJson(json);
    }
    catch (JsonException jex)
    {
        var message = jex.Message;
        string? hint = null;

        // Detect fieldType string-vs-integer errors and add a helpful hint.
        // A single case-insensitive check covers both "FieldType" and "fieldType".
        // NOTE(review): the second clause also matches any "could not be
        // converted" message that mentions "String" — confirm that this
        // breadth is intended.
        var mentionsFieldType = message.Contains("fieldType", StringComparison.OrdinalIgnoreCase);
        var looksLikeStringConversion =
            message.Contains("could not be converted", StringComparison.Ordinal)
            && message.Contains("String", StringComparison.Ordinal);
        if (mentionsFieldType || looksLikeStringConversion)
        {
            hint = "fieldType must be an integer (0=String, 1=Number, 2=Integer, 3=Boolean, 4=Date, 5=Timestamp). "
                + "Do NOT use string values like \"String\" or \"Number\".";
        }

        JsonOut.Error("parse-error", hint is not null ? $"{message} — Hint: {hint}" : message, null, 1);
        return;
    }

    // The template is reported valid exactly when Validate() returns no errors.
    var errors = tpl.Validate();
    JsonOut.Write(new { valid = errors.Count == 0, errors });
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: docuoria
|
|
3
|
+
description: Use this skill when working with Docuoria to extract structured data from PDFs, author or validate a template, design match rules for classification, diagnose a FailedResult or RejectedResult, select an ExtractionSource type, write or debug a regex pattern, or verify that PDF processing is local and private. Apply even when the user does not say "Docuoria" — any task involving the Docuoria CLI scripts, template JSON, or the IDocuoriaEngine API qualifies.
|
|
4
|
+
license: MIT
|
|
5
|
+
compatibility: Requires .NET 10 SDK and the `dotnet-script` global tool. SDK assembly (`Docuoria.dll`) is bundled under `assets/lib/`; transitive NuGet dependencies (PdfPig, Tabula, CsvHelper, pythonnet, Microsoft.Extensions.*) are resolved by `dotnet-script` at first run.
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Docuoria Skill
|
|
9
|
+
|
|
10
|
+
## Installing this skill
|
|
11
|
+
|
|
12
|
+
This skill directory was scaffolded by the Docuoria CLI. To install or update:
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
# npm (Node.js ≥ 20)
|
|
16
|
+
npm install -g @sidub/docuoria
|
|
17
|
+
docuoria init
|
|
18
|
+
|
|
19
|
+
# .NET global tool
|
|
20
|
+
dotnet tool install -g Docuoria.Cli
|
|
21
|
+
docuoria init
|
|
22
|
+
|
|
23
|
+
# Update an existing installation
|
|
24
|
+
docuoria update
|
|
25
|
+
|
|
26
|
+
# Check status / drift
|
|
27
|
+
docuoria list-tools
|
|
28
|
+
docuoria doctor
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
See `docs/cli.md` in the Docuoria repository for the full command reference.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Invocation
|
|
36
|
+
|
|
37
|
+
All scripts follow `dotnet script scripts/<name>.csx -- --<flag> <value>`, run from the skill root. The `--` separator is mandatory — without it, dotnet-script consumes the flags as its own. Positional arguments are rejected; pass `--help` to any script for its full flag list.
|
|
38
|
+
|
|
39
|
+
Scripts divide into two groups:
|
|
40
|
+
|
|
41
|
+
| Group | Scripts | Store flag |
|
|
42
|
+
| --- | --- | --- |
|
|
43
|
+
| **Store-aware** — read from or write to a template store | `classify`, `evaluate-match`, `list-templates`, `load-template`, `save-template` | `--store-path <dir>` or `--store-url <url>` |
|
|
44
|
+
| **Standalone** — operate on individual PDF and/or template files | `inspect`, `test-pattern`, `test-groups`, `dry-run`, `execute`, `validate-template` | — |
|
|
45
|
+
|
|
46
|
+
Store-aware scripts accept `--store-path <dir>` (local directory) or `--store-url <url>` (API endpoint) to locate templates; these flags are mutually exclusive. When omitted, `--store-path` defaults to `./templates` relative to the process working directory — since the CWD varies by environment, always pass the store location explicitly.
|
|
47
|
+
|
|
48
|
+
## Workflow
|
|
49
|
+
|
|
50
|
+
The pipeline runs in order; classification determines the entry point. Load `references/workflow.md` for the full step-by-step guide.
|
|
51
|
+
|
|
52
|
+
1. **Classify** — match the PDF against all stored templates
|
|
53
|
+
2. **Inspect** — read the engine's text extraction (when no template matches)
|
|
54
|
+
3. **Test** — prove regex patterns against the engine's haystack
|
|
55
|
+
4. **Build** — author the template JSON, validate classification rules and schema
|
|
56
|
+
5. **Dry-run** — end-to-end extraction without output generation
|
|
57
|
+
6. **Execute** — full pipeline producing CSV or JSON output
|
|
58
|
+
7. **Store** — persist the template and verify it ranks correctly
|
|
59
|
+
|
|
60
|
+
## Routing
|
|
61
|
+
|
|
62
|
+
Consult the canonical reference before relying on memory. Each concern has a single owner.
|
|
63
|
+
|
|
64
|
+
| If the agent needs to… | Load |
|
|
65
|
+
| --- | --- |
|
|
66
|
+
| Follow the full pipeline step-by-step | `references/workflow.md` |
|
|
67
|
+
| Pick an `ExtractionSource` subtype for a field (`TextPattern`, `TableRows`, `TextAnchor`, `MetadataField`, `Fallback`) | `references/decision-tree.md` |
|
|
68
|
+
| Design a discriminating `rootMatchRule` (token selection, composite architecture, structural rules, weights, thresholds) | `references/classification.md` |
|
|
69
|
+
| Diagnose a `RejectedResult`, `FailedResult`, classification failure, or empty/incomplete `DryRunSucceeded` | `references/failure-tree.md` |
|
|
70
|
+
| Map a stderr `error.code` to a remediation branch | `references/failure-tree.md` § Stderr error.code → Branch routing |
|
|
71
|
+
| Copy a regex pattern from the library or adapt one to a specific PDF | `references/patterns.md` then `references/pattern-authoring.md` |
|
|
72
|
+
| Look up a CLI script's flags, output envelope, or error codes | `references/scripts.md` |
|
|
73
|
+
| Look up a template JSON property, `$kind` discriminator, enum value, or shape | `references/template-reference.md` |
|
|
74
|
+
| Answer whether PDF processing is local/private | `references/privacy.md` |
|
|
75
|
+
|
|
76
|
+
## Skill layout
|
|
77
|
+
|
|
78
|
+
- `SKILL.md` — this router; loaded at skill activation.
|
|
79
|
+
- `references/` — deep guides loaded on demand (see Routing table).
|
|
80
|
+
- `scripts/` — `dotnet-script` CLI surface (`_common.csx` plus 12 verb scripts).
|
|
81
|
+
- `assets/lib/Docuoria.dll` — bundled SDK assembly.
|
|
82
|
+
- `assets/schemas/template-schema.json` — JSON Schema for template authoring and validation.
|
|
83
|
+
- `examples/` — three worked end-to-end walkthroughs.
|
|
84
|
+
|
|
85
|
+
## Gotchas
|
|
86
|
+
|
|
87
|
+
- **`fieldType` in template JSON must be an integer (0–5), never a string.** The engine rejects string values with `RejectionReason.MalformedTemplate`. Enum: 0 String, 1 Number, 2 Integer, 3 Boolean, 4 Date, 5 Timestamp. Run `validate-template.csx` to catch this before dry-run.
|
|
88
|
+
- **Adapt every regex to the actual PDF.** The engine's flattened text differs from the visual layout — whitespace, line breaks, and character encoding may not match what you see. Validate with `test-pattern.csx` and `inspect.csx` rather than pasting library patterns verbatim.
|
|
Binary file
|