@sidub-inc/docuoria.cli 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/dist/index.js +1056 -0
  2. package/package.json +56 -0
  3. package/payload/.claude-plugin/plugin.json +21 -0
  4. package/payload/MANIFEST.json +322 -0
  5. package/payload/SKILL.md +88 -0
  6. package/payload/assets/lib/Docuoria.dll +0 -0
  7. package/payload/assets/schemas/template-schema.json +413 -0
  8. package/payload/commands/classify.md +11 -0
  9. package/payload/commands/diagnose.md +11 -0
  10. package/payload/commands/extract.md +11 -0
  11. package/payload/commands/inspect.md +11 -0
  12. package/payload/commands/validate-template.md +11 -0
  13. package/payload/examples/01-extract-to-csv.md +49 -0
  14. package/payload/examples/02-classify-unknown-pdf.md +102 -0
  15. package/payload/examples/03-diagnose-failed-result.md +68 -0
  16. package/payload/references/classification.md +363 -0
  17. package/payload/references/decision-tree.md +43 -0
  18. package/payload/references/failure-tree.md +169 -0
  19. package/payload/references/pattern-authoring.md +40 -0
  20. package/payload/references/patterns.md +97 -0
  21. package/payload/references/privacy.md +36 -0
  22. package/payload/references/scripts.md +361 -0
  23. package/payload/references/template-reference.md +606 -0
  24. package/payload/references/workflow.md +163 -0
  25. package/payload/scripts/_common.csx +250 -0
  26. package/payload/scripts/classify.csx +53 -0
  27. package/payload/scripts/dry-run.csx +85 -0
  28. package/payload/scripts/evaluate-match.csx +72 -0
  29. package/payload/scripts/execute.csx +89 -0
  30. package/payload/scripts/inspect.csx +43 -0
  31. package/payload/scripts/list-templates.csx +34 -0
  32. package/payload/scripts/load-template.csx +54 -0
  33. package/payload/scripts/save-template.csx +53 -0
  34. package/payload/scripts/schema-info.csx +84 -0
  35. package/payload/scripts/test-groups.csx +44 -0
  36. package/payload/scripts/test-pattern.csx +61 -0
  37. package/payload/scripts/validate-template.csx +54 -0
  38. package/payload/skill/SKILL.md +88 -0
  39. package/payload/skill/assets/lib/Docuoria.dll +0 -0
  40. package/payload/skill/assets/schemas/template-schema.json +413 -0
  41. package/payload/skill/examples/01-extract-to-csv.md +49 -0
  42. package/payload/skill/examples/02-classify-unknown-pdf.md +102 -0
  43. package/payload/skill/examples/03-diagnose-failed-result.md +68 -0
  44. package/payload/skill/references/classification.md +363 -0
  45. package/payload/skill/references/decision-tree.md +43 -0
  46. package/payload/skill/references/failure-tree.md +169 -0
  47. package/payload/skill/references/pattern-authoring.md +40 -0
  48. package/payload/skill/references/patterns.md +97 -0
  49. package/payload/skill/references/privacy.md +36 -0
  50. package/payload/skill/references/scripts.md +361 -0
  51. package/payload/skill/references/template-reference.md +606 -0
  52. package/payload/skill/references/workflow.md +163 -0
  53. package/payload/skill/scripts/_common.csx +250 -0
  54. package/payload/skill/scripts/classify.csx +53 -0
  55. package/payload/skill/scripts/dry-run.csx +85 -0
  56. package/payload/skill/scripts/evaluate-match.csx +72 -0
  57. package/payload/skill/scripts/execute.csx +89 -0
  58. package/payload/skill/scripts/inspect.csx +43 -0
  59. package/payload/skill/scripts/list-templates.csx +34 -0
  60. package/payload/skill/scripts/load-template.csx +54 -0
  61. package/payload/skill/scripts/save-template.csx +53 -0
  62. package/payload/skill/scripts/schema-info.csx +84 -0
  63. package/payload/skill/scripts/test-groups.csx +44 -0
  64. package/payload/skill/scripts/test-pattern.csx +61 -0
  65. package/payload/skill/scripts/validate-template.csx +54 -0
@@ -0,0 +1,89 @@
#load "_common.csx"

#nullable enable

// SCR-06 — Full pipeline run via IDocuoriaEngine.ExecuteTemplateAsync<TGenerator, TOptions>.
// Args: --pdf <path> --template <file.json> --format csv|json [--output <path>]
// Success (SucceededResult): payload bytes go to --output when given; otherwise they are
//   decoded as UTF-8 and embedded as a string in the stdout JSON.
// Rejected / Failed / unrecognised result: reported through JsonOut.Error with code 1.
// Unsupported --format: reported through JsonOut.Error with code 2.
// NOTE(review): JsonOut.Error lives in _common.csx; presumably it terminates the
//   process with the given code — confirm there.

using System.Text;
using Docuoria.Configuration;
using Docuoria.Contracts;
using Docuoria.Models;
using Docuoria.Output.Csv;
using Docuoria.Output.Json;
using Docuoria.Results;

try
{
    Cli.Help(Args, "execute.csx", "Full pipeline run with output generation (CSV or JSON)",
        ("pdf", true, "Path to the source PDF", false),
        ("template", true, "Path to the template JSON file", false),
        ("format", true, "Output format: csv or json", false),
        ("output", false, "Write output to this file path (default: stdout)", false));

    var sourcePdf = Cli.Require(Args, "pdf");
    var templateFile = Cli.Require(Args, "template");
    var requestedFormat = Cli.Require(Args, "format").Trim().ToLowerInvariant();
    var destination = Cli.Get(Args, "output");

    // Only the two generators wired up below are supported.
    if (requestedFormat is not ("csv" or "json"))
    {
        JsonOut.Error("bad-format", "expected csv|json", null, 2);
    }

    var templateExists = File.Exists(templateFile);
    if (templateExists is false)
    {
        JsonOut.Error("template-not-found", $"Template not found at '{templateFile}'", null, 1);
    }

    var templateJson = File.ReadAllText(templateFile);
    var template = Template.FromJson(templateJson);

    // This script never touches the template store, so the host is built without one.
    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: false);
    var engine = ScriptHost.GetEngine(host);

    using var pdf = LoadPdf(sourcePdf);

    // Pick the output generator that matches the requested format.
    ProcessingResult outcome;
    if (requestedFormat == "csv")
    {
        outcome = await engine.ExecuteTemplateAsync<CsvOutputGenerator, CsvGeneratorOptions>(
            pdf, template, new CsvGeneratorOptions());
    }
    else if (requestedFormat == "json")
    {
        outcome = await engine.ExecuteTemplateAsync<JsonOutputGenerator, JsonGeneratorOptions>(
            pdf, template, new JsonGeneratorOptions());
    }
    else
    {
        throw new InvalidOperationException("unreachable");
    }

    switch (outcome)
    {
        case SucceededResult succeeded:
        {
            var bytes = succeeded.Output.Payload;
            if (string.IsNullOrEmpty(destination))
            {
                // No --output: inline the payload as UTF-8 text in the stdout envelope.
                var decoded = Encoding.UTF8.GetString(bytes.Span);
                JsonOut.Write(new { status = "ok", format = requestedFormat, output = decoded });
            }
            else
            {
                // Make sure the target directory exists before writing the payload.
                var absolutePath = Path.GetFullPath(destination!);
                var parentDirectory = Path.GetDirectoryName(absolutePath)!;
                Directory.CreateDirectory(parentDirectory);
                await File.WriteAllBytesAsync(destination!, bytes.ToArray());
                JsonOut.Write(new { status = "ok", path = destination });
            }
            break;
        }

        case RejectedResult rejected:
        {
            // Append the detail only when the engine supplied one.
            var detailSuffix = rejected.Detail is not null ? $": {rejected.Detail}" : "";
            JsonOut.Error("rejected", $"Rejected ({rejected.Reason}){detailSuffix}", null, 1);
            break;
        }

        case FailedResult failed:
        {
            JsonOut.Error("failed", failed.ErrorMessage, failed.InnerDetail, 1);
            break;
        }

        default:
        {
            JsonOut.Error("unknown-result", outcome.GetType().Name, null, 1);
            break;
        }
    }
}
catch (Exception error)
{
    JsonOut.Error("unhandled", error.Message, error.ToString(), 1);
}
@@ -0,0 +1,43 @@
#load "_common.csx"

#nullable enable

// SCR-01 — Agent-facing wrapper over IDocuoriaEngine.InspectAsync (read-only PDF probe).
// Args: --pdf <path> [--page <n>]
// stdout: the InspectAsync result serialised as JSON (PdfInspection).
// Exit codes: 0 success, 1 handled error, 2 bad args.

using Docuoria.Configuration;
using Docuoria.Contracts;
using Docuoria.Models;

try
{
    Cli.Help(Args, "inspect.csx", "Inspect PDF structure (page count, text blocks, tables)",
        ("pdf", true, "Path to the source PDF", false),
        ("page", false, "1-based page index (default: all pages)", false));

    var sourcePdf = Cli.Require(Args, "pdf");
    var requestedPage = Cli.Get(Args, "page");

    // Inspection does not need the template store.
    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: false);
    var engine = ScriptHost.GetEngine(host);

    // A null filter is passed when --page is absent or blank.
    PageFilter? singlePage = null;
    if (!string.IsNullOrWhiteSpace(requestedPage))
    {
        var parsed = int.TryParse(requestedPage, out var pageNumber);
        if (!(parsed && pageNumber >= 1))
        {
            JsonOut.Error("bad-arg", "--page must be a positive integer", null, 2);
        }
        singlePage = PageFilter.SinglePage(pageNumber);
    }

    using var pdf = LoadPdf(sourcePdf);
    var inspection = await engine.InspectAsync(pdf, singlePage);
    JsonOut.Write(inspection);
}
catch (Exception error)
{
    JsonOut.Error("unhandled", error.Message, error.ToString(), 1);
}
@@ -0,0 +1,34 @@
#load "_common.csx"

#nullable enable

// SCR-08 — Enumerates ITemplateStoreProvider.ListAsync and prints every identifier.
// Args: [--store-path <dir>] [--store-url <url>] [--store-key <key>]
// stdout: { templates: [..ids..] }

using Docuoria.Storage;

try
{
    Cli.Help(Args, "list-templates.csx", "List all template IDs in the configured store",
        ("store-path", false, "Local template store directory (default: ./templates)", false),
        ("store-url", false, "API template store URL (mutually exclusive with --store-path)", false),
        ("store-key", false, "Function key for API store authentication", false));

    // The store is required here, so the host is built with it included.
    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: true);
    var templateStore = ScriptHost.GetStore(host);
    if (templateStore is null)
    {
        JsonOut.Error("no-store", "ITemplateStoreProvider is not registered. Pass --store-path <dir> to specify a local template store directory.", null, 1);
    }

    // Drain the async stream into a list so it can be emitted as one JSON array.
    List<string> identifiers = new();
    await foreach (var identifier in templateStore!.ListAsync())
    {
        identifiers.Add(identifier);
    }

    JsonOut.Write(new { templates = identifiers });
}
catch (Exception error)
{
    JsonOut.Error("unhandled", error.Message, error.ToString(), 1);
}
@@ -0,0 +1,54 @@
#load "_common.csx"

#nullable enable

// SCR-09 — Fetches a template by ID from the configured store and emits its JSON.
// Args: --id <identifier> [--output <path>] [--store-path <dir>] [--store-url <url>] [--store-key <key>]
// stdout: the parsed template JSON, or { status, path } when --output is given.

using System.Text.Json;
using Docuoria.Serialization;
using Docuoria.Storage;

try
{
    Cli.Help(Args, "load-template.csx", "Fetch a template by ID and emit its JSON",
        ("id", true, "Template identifier", false),
        ("output", false, "Write template JSON to this file path", false),
        ("store-path", false, "Local template store directory (default: ./templates)", false),
        ("store-url", false, "API template store URL (mutually exclusive with --store-path)", false),
        ("store-key", false, "Function key for API store authentication", false));

    var templateId = Cli.Require(Args, "id");
    var destination = Cli.Get(Args, "output");

    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: true);
    var templateStore = ScriptHost.GetStore(host);
    if (templateStore is null)
    {
        JsonOut.Error("no-store", "ITemplateStoreProvider is not registered. Pass --store-path <dir> to specify a local template store directory.", null, 1);
    }

    var loaded = await templateStore!.LoadAsync(templateId);
    if (loaded is null)
    {
        JsonOut.Error("not-found", $"Template '{templateId}' not found. Run list-templates.csx to see available template IDs.", null, 1);
    }

    var templateJson = loaded!.ToJson();
    if (string.IsNullOrEmpty(destination))
    {
        // No --output: re-parse into a JsonElement so the template is written
        // as a JSON value rather than as an escaped string.
        var parsed = JsonSerializer.Deserialize<JsonElement>(templateJson, DocuoriaJsonOptions.Default);
        JsonOut.Write(parsed);
    }
    else
    {
        // Create the parent directory first so the write cannot fail on a missing folder.
        var absolutePath = Path.GetFullPath(destination!);
        var parentDirectory = Path.GetDirectoryName(absolutePath)!;
        Directory.CreateDirectory(parentDirectory);
        await File.WriteAllTextAsync(destination!, templateJson);
        JsonOut.Write(new { status = "ok", path = destination });
    }
}
catch (Exception error)
{
    JsonOut.Error("unhandled", error.Message, error.ToString(), 1);
}
@@ -0,0 +1,53 @@
#load "_common.csx"

#nullable enable

// SCR-10 — Persists a template JSON file to the configured store.
// Args: --template <path> [--overwrite] [--store-path <dir>] [--store-url <url>] [--store-key <key>]
// stdout: { status, identifier }.
// A TemplateAlreadyExistsException from the store is reported as "already-exists"
// with a reminder about --overwrite.

using Docuoria.Models;
using Docuoria.Storage;
using Docuoria.Storage.Exceptions;

try
{
    Cli.Help(Args, "save-template.csx", "Persist a template JSON file to the configured store",
        ("template", true, "Path to the template JSON file", false),
        ("overwrite", false, "Overwrite if template already exists", true),
        ("store-path", false, "Local template store directory (default: ./templates)", false),
        ("store-url", false, "API template store URL (mutually exclusive with --store-path)", false),
        ("store-key", false, "Function key for API store authentication", false));

    var templateFile = Cli.Require(Args, "template");
    var replaceExisting = Cli.Has(Args, "overwrite");

    var templateExists = File.Exists(templateFile);
    if (templateExists is false)
    {
        JsonOut.Error("template-not-found", $"Template file not found at '{templateFile}'", null, 1);
    }

    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: true);
    var templateStore = ScriptHost.GetStore(host);
    if (templateStore is null)
    {
        JsonOut.Error("no-store", "ITemplateStoreProvider is not registered. Pass --store-path <dir> to specify a local template store directory.", null, 1);
    }

    var templateJson = File.ReadAllText(templateFile);
    var template = Template.FromJson(templateJson);

    try
    {
        await templateStore!.SaveAsync(template, replaceExisting);
    }
    catch (TemplateAlreadyExistsException duplicate)
    {
        // Surface the store's own message and point at the flag that resolves it.
        JsonOut.Error("already-exists", $"{duplicate.Message} Use --overwrite to replace the existing template.", null, 1);
    }

    JsonOut.Write(new { status = "ok", identifier = template.Identifier });
}
catch (Exception error)
{
    JsonOut.Error("unhandled", error.Message, error.ToString(), 1);
}
@@ -0,0 +1,84 @@
#load "_common.csx"

#nullable enable

// SCR-12 — Dumps the type/enum/mode vocabulary used when authoring templates.
// Args: (none)
// stdout: JSON object with fieldTypes, extractionSources, modes, matchRules,
//         subFieldMappings, metadataFields and notes.
// Only fieldTypes is read from the SDK (the FieldType enum); every other list
// below is a literal maintained in this script.

using System.Reflection;
using Docuoria.Models;

try
{
    Cli.Help(Args, "schema-info.csx", "List all SDK types, enums, modes, and valid values");

    // Map each FieldType member's integer value to its name.
    Dictionary<int, string> fieldTypeNames = new();
    foreach (var member in Enum.GetValues<FieldType>())
    {
        var numericValue = (int)member;
        fieldTypeNames[numericValue] = member.ToString();
    }

    string[] extractionSourceKinds =
    {
        "TextPatternExtractionSource",
        "TextAnchorExtractionSource",
        "TableCellExtractionSource",
        "TableRowsExtractionSource",
        "MetadataFieldExtractionSource",
        "FallbackExtractionSource"
    };

    // Valid mode/operator values, keyed by the type they apply to.
    Dictionary<string, object> modesByType = new()
    {
        ["TextPatternExtractionSource"] = new
        {
            mode = new[] { "Token", "Pattern", "AllMatches" },
            notes = new
            {
                Token = "requires 'literalToken'",
                Pattern = "requires 'regexPattern'",
                AllMatches = "requires 'regexPattern'"
            }
        },
        ["TableRowsExtractionSource"] = new { mode = new[] { "ByHeader", "Ordinal" } },
        ["TextPatternMatchRule"] = new { mode = new[] { "AnyToken", "AllTokens" } },
        ["FileNameMatchRule"] = new { mode = new[] { "Glob", "Regex" } },
        ["CompositeMatchRule"] = new { @operator = new[] { "And", "Or", "Not" } }
    };

    string[] matchRuleKinds =
    {
        "TextPatternMatchRule",
        "FileNameMatchRule",
        "TextAnchorMatchRule",
        "MetadataMatchRule",
        "PageGeometryMatchRule",
        "TableMatchRule",
        "CompositeMatchRule"
    };

    string[] subFieldMappingKinds =
    {
        "NamedGroupSubFieldMapping",
        "RegexGroupSubFieldMapping",
        "HeaderSubFieldMapping",
        "OrdinalSubFieldMapping"
    };

    string[] metadataFieldNames =
    {
        "Title", "Author", "Subject", "Keywords",
        "Creator", "Producer", "CreationDate", "ModifiedDate"
    };

    // Property names and their order define the emitted JSON shape.
    JsonOut.Write(new
    {
        fieldTypes = fieldTypeNames,
        extractionSources = extractionSourceKinds,
        modes = modesByType,
        matchRules = matchRuleKinds,
        subFieldMappings = subFieldMappingKinds,
        metadataFields = metadataFieldNames,
        notes = new
        {
            fieldType = "fieldType serializes as INTEGER (0-5), not string. Use the integer value in template JSON.",
            kind = "All polymorphic types use '$kind' discriminator with the CLR class name."
        }
    });
}
catch (Exception error)
{
    JsonOut.Error("unhandled", error.Message, error.ToString(), 1);
}
@@ -0,0 +1,44 @@
#load "_common.csx"

#nullable enable

// SCR-03 — Wrapper over IDocuoriaEngine.TestGroupsAsync.
// Args: --pattern <regex> --pdf <path> [--page <n>]
// stdout: the TestGroupsAsync result serialised as JSON (PatternGroupTestResult).

using Docuoria.Configuration;
using Docuoria.Contracts;
using Docuoria.Models;

try
{
    Cli.Help(Args, "test-groups.csx", "Test each capture group of a regex independently",
        ("pdf", true, "Path to the source PDF", false),
        ("pattern", true, "Multi-group regex pattern", false),
        ("page", false, "1-based page index (default: all pages)", false));

    var regexPattern = Cli.Require(Args, "pattern");
    var sourcePdf = Cli.Require(Args, "pdf");
    var requestedPage = Cli.Get(Args, "page");

    // Pattern testing does not need the template store.
    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: false);
    var engine = ScriptHost.GetEngine(host);

    // Options stay null unless --page narrows the run to a single page.
    PatternTestOptions? testOptions = null;
    if (!string.IsNullOrWhiteSpace(requestedPage))
    {
        var parsed = int.TryParse(requestedPage, out var pageNumber);
        if (!(parsed && pageNumber >= 1))
        {
            JsonOut.Error("bad-arg", "--page must be a positive integer", null, 2);
        }
        testOptions = new PatternTestOptions { PageFilter = PageFilter.SinglePage(pageNumber) };
    }

    using var pdf = LoadPdf(sourcePdf);
    var groupResults = await engine.TestGroupsAsync(pdf, regexPattern, testOptions);
    JsonOut.Write(groupResults);
}
catch (Exception error)
{
    JsonOut.Error("unhandled", error.Message, error.ToString(), 1);
}
@@ -0,0 +1,61 @@
#load "_common.csx"

#nullable enable

// SCR-02 — Wrapper over IDocuoriaEngine.TestPatternAsync.
// Args: --pattern <regex> --pdf <path> [--page <n>] [--block-separator <str>]
// stdout: PatternTestResult JSON (Error round-trips per Phase 24 — do NOT swallow).

using Docuoria.Configuration;
using Docuoria.Contracts;
using Docuoria.Models;

try
{
    Cli.Help(Args, "test-pattern.csx", "Test a regex pattern against PDF text",
        ("pdf", true, "Path to the source PDF", false),
        ("pattern", true, "Regex pattern to test", false),
        ("page", false, "1-based page index (default: all pages)", false),
        ("block-separator", false, "Override block separator for text flattening", false));

    var regexPattern = Cli.Require(Args, "pattern");
    var sourcePdf = Cli.Require(Args, "pdf");
    var separator = Cli.Get(Args, "block-separator");
    var requestedPage = Cli.Get(Args, "page");

    // Pattern testing does not need the template store.
    using var host = ScriptHost.CreateHost(Args.ToArray(), includeStore: false);
    var engine = ScriptHost.GetEngine(host);

    // Translate --page into a single-page filter when it was supplied.
    PageFilter? singlePage = null;
    if (!string.IsNullOrWhiteSpace(requestedPage))
    {
        var parsed = int.TryParse(requestedPage, out var pageNumber);
        if (!(parsed && pageNumber >= 1))
        {
            JsonOut.Error("bad-arg", "--page must be a positive integer", null, 2);
        }
        singlePage = PageFilter.SinglePage(pageNumber);
    }

    // Options are only built when at least one optional flag was given; otherwise
    // null is passed to the engine.
    // NOTE(review): when only --page is given, BlockSeparator is forced to "\n".
    // test-groups.csx leaves BlockSeparator unset in the same situation — confirm
    // "\n" matches the PatternTestOptions default.
    var hasSeparator = !string.IsNullOrEmpty(separator);
    PatternTestOptions? testOptions = hasSeparator || singlePage is not null
        ? new PatternTestOptions
        {
            BlockSeparator = separator ?? "\n",
            PageFilter = singlePage
        }
        : null;

    using var pdf = LoadPdf(sourcePdf);

    var matchResult = await engine.TestPatternAsync(pdf, regexPattern, testOptions);
    JsonOut.Write(matchResult);
}
catch (Exception error)
{
    JsonOut.Error("unhandled", error.Message, error.ToString(), 1);
}
@@ -0,0 +1,54 @@
#load "_common.csx"

#nullable enable

// SCR-04 — Template.FromJson + Validate() static check. No engine or host is created.
// Args: --template <file.json>
// stdout: { valid, errors }
// A JsonException while parsing is reported as "parse-error"; when the message looks
// like a fieldType string-vs-integer mistake, a remediation hint is appended.

using System.Text.Json;
using Docuoria.Models;

try
{
    Cli.Help(Args, "validate-template.csx", "Validate a template JSON file against the schema",
        ("template", true, "Path to the template JSON file", false));

    var templatePath = Cli.Require(Args, "template");
    if (!File.Exists(templatePath))
    {
        JsonOut.Error("template-not-found", $"Template not found at '{templatePath}'", null, 1);
    }

    var json = File.ReadAllText(templatePath);

    Template tpl;
    try
    {
        tpl = Template.FromJson(json);
    }
    catch (JsonException jex)
    {
        var message = jex.Message;
        string? hint = null;

        // Detect fieldType string-vs-integer errors and add a helpful hint.
        // One case-insensitive test covers both "FieldType" and "fieldType".
        var mentionsFieldType = message.Contains("FieldType", StringComparison.OrdinalIgnoreCase);
        var stringConversionFailed =
            message.Contains("could not be converted", StringComparison.Ordinal)
            && message.Contains("String", StringComparison.Ordinal);

        if (mentionsFieldType || stringConversionFailed)
        {
            hint = "fieldType must be an integer (0=String, 1=Number, 2=Integer, 3=Boolean, 4=Date, 5=Timestamp). "
                + "Do NOT use string values like \"String\" or \"Number\".";
        }

        JsonOut.Error("parse-error", hint is not null ? $"{message} — Hint: {hint}" : message, null, 1);

        // Stop here regardless of what JsonOut.Error does; without this return the
        // compiler would also treat tpl as unassigned below.
        return;
    }

    var errors = tpl.Validate();
    JsonOut.Write(new { valid = errors.Count == 0, errors });
}
catch (Exception ex)
{
    JsonOut.Error("unhandled", ex.Message, ex.ToString(), 1);
}
@@ -0,0 +1,88 @@
1
+ ---
2
+ name: docuoria
3
+ description: Use this skill when working with Docuoria to extract structured data from PDFs, author or validate a template, design match rules for classification, diagnose a FailedResult or RejectedResult, select an ExtractionSource type, write or debug a regex pattern, or verify that PDF processing is local and private. Apply even when the user does not say "Docuoria" — any task involving the Docuoria CLI scripts, template JSON, or the IDocuoriaEngine API qualifies.
4
+ license: MIT
5
+ compatibility: Requires .NET 10 SDK and the `dotnet-script` global tool. SDK assembly (`Docuoria.dll`) is bundled under `assets/lib/`; transitive NuGet dependencies (PdfPig, Tabula, CsvHelper, pythonnet, Microsoft.Extensions.*) are resolved by `dotnet-script` at first run.
6
+ ---
7
+
8
+ # Docuoria Skill
9
+
10
+ ## Installing this skill
11
+
12
+ This skill directory was scaffolded by the Docuoria CLI. To install or update:
13
+
14
+ ```bash
15
+ # npm (Node.js ≥ 20)
16
+ npm install -g @sidub/docuoria
17
+ docuoria init
18
+
19
+ # .NET global tool
20
+ dotnet tool install -g Docuoria.Cli
21
+ docuoria init
22
+
23
+ # Update an existing installation
24
+ docuoria update
25
+
26
+ # Check status / drift
27
+ docuoria list-tools
28
+ docuoria doctor
29
+ ```
30
+
31
+ See `docs/cli.md` in the Docuoria repository for the full command reference.
32
+
33
+ ---
34
+
35
+ ## Invocation
36
+
37
+ All scripts follow `dotnet script scripts/<name>.csx -- --<flag> <value>`, run from the skill root. The `--` separator is mandatory — without it, dotnet-script consumes the flags as its own. Positional arguments are rejected; pass `--help` to any script for its full flag list.
38
+
39
+ Scripts divide into two groups:
40
+
41
+ | Group | Scripts | Store flag |
42
+ | --- | --- | --- |
43
+ | **Store-aware** — read from or write to a template store | `classify`, `evaluate-match`, `list-templates`, `load-template`, `save-template` | `--store-path <dir>` or `--store-url <url>` |
44
+ | **Standalone** — operate on individual PDF and/or template files | `inspect`, `test-pattern`, `test-groups`, `dry-run`, `execute`, `validate-template` | — |
45
+
46
+ Store-aware scripts accept `--store-path <dir>` (local directory) or `--store-url <url>` (API endpoint) to locate templates; these flags are mutually exclusive. When omitted, `--store-path` defaults to `./templates` relative to the process working directory — since the CWD varies by environment, always pass the store location explicitly.
47
+
48
+ ## Workflow
49
+
50
+ The pipeline runs in order; classification determines the entry point. Load `references/workflow.md` for the full step-by-step guide.
51
+
52
+ 1. **Classify** — match the PDF against all stored templates
53
+ 2. **Inspect** — read the engine's text extraction (when no template matches)
54
+ 3. **Test** — prove regex patterns against the engine's haystack
55
+ 4. **Build** — author the template JSON, validate classification rules and schema
56
+ 5. **Dry-run** — end-to-end extraction without output generation
57
+ 6. **Execute** — full pipeline producing CSV or JSON output
58
+ 7. **Store** — persist the template and verify it ranks correctly
59
+
60
+ ## Routing
61
+
62
+ Consult the canonical reference before relying on memory. Each concern has a single owner.
63
+
64
+ | If the agent needs to… | Load |
65
+ | --- | --- |
66
+ | Follow the full pipeline step-by-step | `references/workflow.md` |
67
+ | Pick an `ExtractionSource` subtype for a field (`TextPattern`, `TableRows`, `TextAnchor`, `MetadataField`, `Fallback`) | `references/decision-tree.md` |
68
+ | Design a discriminating `rootMatchRule` (token selection, composite architecture, structural rules, weights, thresholds) | `references/classification.md` |
69
+ | Diagnose a `RejectedResult`, `FailedResult`, classification failure, or empty/incomplete `DryRunSucceeded` | `references/failure-tree.md` |
70
+ | Map a stderr `error.code` to a remediation branch | `references/failure-tree.md` § Stderr error.code → Branch routing |
71
+ | Copy a regex pattern from the library or adapt one to a specific PDF | `references/patterns.md` then `references/pattern-authoring.md` |
72
+ | Look up a CLI script's flags, output envelope, or error codes | `references/scripts.md` |
73
+ | Look up a template JSON property, `$kind` discriminator, enum value, or shape | `references/template-reference.md` |
74
+ | Answer whether PDF processing is local/private | `references/privacy.md` |
75
+
76
+ ## Skill layout
77
+
78
+ - `SKILL.md` — this router; loaded at skill activation.
79
+ - `references/` — deep guides loaded on demand (see Routing table).
80
+ - `scripts/` — `dotnet-script` CLI surface (`_common.csx` plus 12 verb scripts).
81
+ - `assets/lib/Docuoria.dll` — bundled SDK assembly.
82
+ - `assets/schemas/template-schema.json` — JSON Schema for template authoring and validation.
83
+ - `examples/` — three worked end-to-end walkthroughs.
84
+
85
+ ## Gotchas
86
+
87
+ - **`fieldType` in template JSON must be an integer (0–5), never a string.** The engine rejects string values with `RejectionReason.MalformedTemplate`. Enum: 0 String, 1 Number, 2 Integer, 3 Boolean, 4 Date, 5 Timestamp. Run `validate-template.csx` to catch this before dry-run.
88
+ - **Adapt every regex to the actual PDF.** The engine's flattened text differs from the visual layout — whitespace, line breaks, and character encoding may not match what you see. Validate with `test-pattern.csx` and `inspect.csx` rather than pasting library patterns verbatim.