extract-cli 0.1.6__tar.gz → 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. extract_cli-0.1.9/AGENTS.md +87 -0
  2. {extract_cli-0.1.6 → extract_cli-0.1.9}/CHANGELOG.md +76 -0
  3. {extract_cli-0.1.6 → extract_cli-0.1.9}/PKG-INFO +32 -3
  4. {extract_cli-0.1.6 → extract_cli-0.1.9}/README.md +30 -1
  5. {extract_cli-0.1.6 → extract_cli-0.1.9}/docs/INTEROP.md +7 -4
  6. {extract_cli-0.1.6 → extract_cli-0.1.9}/docs/spec/extract-output.schema.json +58 -0
  7. {extract_cli-0.1.6 → extract_cli-0.1.9}/extract_cli.py +344 -23
  8. extract_cli-0.1.9/llms.txt +79 -0
  9. {extract_cli-0.1.6 → extract_cli-0.1.9}/pyproject.toml +7 -2
  10. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/_fixtures_build.py +36 -2
  11. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/_make_goldens.py +2 -2
  12. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/conftest.py +1 -0
  13. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/employment_docx.docx.expected.json +14 -1
  14. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/heading_docx.docx.expected.json +8 -1
  15. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/lease_allcaps.txt.expected.json +14 -1
  16. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/license_pdf.pdf.expected.json +14 -1
  17. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/nda_h2.md.expected.json +8 -1
  18. extract_cli-0.1.9/tests/fixtures/numbered_docx.docx +0 -0
  19. extract_cli-0.1.9/tests/fixtures/numbered_docx.docx.expected.json +149 -0
  20. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/scanned.pdf.expected.json +8 -1
  21. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/services_bold.txt.expected.json +14 -1
  22. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/services_html.html.expected.json +17 -4
  23. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_clause_map.py +38 -0
  24. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_cli.py +63 -1
  25. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_deterministic.py +25 -0
  26. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_misc.py +76 -0
  27. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_property.py +18 -0
  28. {extract_cli-0.1.6 → extract_cli-0.1.9}/.gitignore +0 -0
  29. {extract_cli-0.1.6 → extract_cli-0.1.9}/ARCHITECTURE.md +0 -0
  30. {extract_cli-0.1.6 → extract_cli-0.1.9}/CONTRIBUTING.md +0 -0
  31. {extract_cli-0.1.6 → extract_cli-0.1.9}/LICENSE +0 -0
  32. {extract_cli-0.1.6 → extract_cli-0.1.9}/Makefile +0 -0
  33. {extract_cli-0.1.6 → extract_cli-0.1.9}/config/llm.json.example +0 -0
  34. {extract_cli-0.1.6 → extract_cli-0.1.9}/scripts/release.py +0 -0
  35. {extract_cli-0.1.6 → extract_cli-0.1.9}/scripts/validate_against_spec.py +0 -0
  36. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/_schema_validator.py +0 -0
  37. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/employment_docx.docx +0 -0
  38. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/heading_docx.docx +0 -0
  39. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/lease_allcaps.txt +0 -0
  40. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/license_pdf.pdf +0 -0
  41. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/nda_h2.md +0 -0
  42. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/scanned.pdf +0 -0
  43. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/services_bold.txt +0 -0
  44. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/services_html.html +0 -0
  45. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_llm.py +0 -0
  46. {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_schema_conformance.py +0 -0
@@ -0,0 +1,87 @@
1
+ # Agents
2
+
3
+ Drive `extract-cli` from an LLM agent or non-interactive client. Same agent
4
+ contract as the rest of the contract-ops suite: a stable machine-readable
5
+ catalog, JSON on stdout, humans on stderr, and a small documented exit-code set.
6
+
7
+ `extract-cli` is the suite's **open-loop front door**: hand it any contract
8
+ (`.md` / `.txt` / `.html` / `.docx` / `.pdf`, yours or a counterparty's) and it
9
+ returns structured JSON the rest of the pipeline can consume. Every field
10
+ carries a `confidence` and a `source` — **verify, don't trust**.
11
+
12
+ ## Output contract
13
+
14
+ - **Success**: a single JSON object to **stdout**, exit `0`. This is the machine
15
+ payload; it's the default (no `--json` needed, though `--json` forces it).
16
+ - Every extracted scalar is the envelope `{value, confidence, source}`;
17
+ "not found" is the canonical `{value: null, confidence: 0.0, source: "none"}`.
18
+ Lists (`parties`, `clauses`, `defined_terms`) carry per-item
19
+ `confidence`/`source`. `source ∈ {deterministic, llm, none}`.
20
+ - `_meta` records `extractor_version`, `tiers_used`, and `llm_used`.
21
+ - The output shape is locked by a JSON Schema —
22
+ [`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json),
23
+ also printed by `extract schema`. Validate against it instead of trusting
24
+ field shapes by convention. (Note: the `--no-confidence` projection is a
25
+ reduced convenience view, **not** governed by the schema.)
26
+ - **stderr** is for humans only: `--why` rationale, warnings, and errors.
27
+ stdout stays clean JSON even under `--why`.
28
+ - **Failure**: a one-line `error: <message>` on **stderr**, non-zero exit.
29
+ The error shape is a flat string (the suite is not uniform on error-object
30
+ shape) — **branch on the exit code, never on the human-readable message.**
31
+
32
+ ## Exit codes
33
+
34
+ | Code | Meaning |
35
+ |------|---------|
36
+ | `0` | Success. |
37
+ | `1` | Low-signal document — no high-signal fields (parties/clauses/dates) could be extracted; e.g. a scanned/image-only or empty file. A **finding**, not a crash: valid JSON is still emitted on stdout. |
38
+ | `2` | Bad usage / user-actionable error (unreadable path, bad flag value, unsupported completion shell). |
39
+
40
+ ## Discovery
41
+
42
+ Never hardcode command or flag names — call the catalog at startup:
43
+
44
+ ```bash
45
+ extract --catalog json # {name, bin, version, description, commands[], exitCodes}
46
+ ```
47
+
48
+ `--catalog json` is the suite-wide discovery contract (parallel to
49
+ `nda-review-cli --catalog json`, `docx2pdf --catalog json`,
50
+ `sign --catalog json`). It is **complete, accurate, and stable across minor
51
+ versions** — a test asserts it never drifts from the real parser.
52
+
53
+ Tool-specific discovery extras:
54
+
55
+ ```bash
56
+ extract schema # the output JSON Schema (the cross-CLI data contract)
57
+ extract fields # extractable fields and the tier that produces each
58
+ extract fields --json # ...as JSON
59
+ extract demo # run on a bundled fixture (zero-config first run)
60
+ extract --version
61
+ ```
62
+
63
+ ## Failure → recovery
64
+
65
+ | Symptom | Diagnose | Recover |
66
+ |---|---|---|
67
+ | Exit `1`, warning "no high-signal fields" | The document is likely scanned/image-only or has no recognizable structure. JSON is still emitted. | OCR the source first, or feed a text/`.docx`/`.md` version. The empty-but-valid JSON is safe to pass downstream. |
68
+ | Exit `2`, `error: ...` | `extract --catalog json` (or `extract <cmd> --help`) for the real surface. | Fix the path/flag and retry. |
69
+ | `clauses: []` on a real contract | The `.docx` likely auto-numbers via Word's numbering with no heading style (its numbers live only in `numbering.xml`), so the deterministic cascade sees no headings. | Re-run with `--llm` (opt-in): when no clauses are detected, the LLM is asked for section headings, normalized through the same canonical vocabulary and emitted with `tier: "llm"`, `source: "llm"`, and a modest confidence. Requires `~/.config/contract-ops/llm.json`. |
70
+ | Low-fidelity `.docx`/`.pdf` text | The stdlib best-effort reader ran (no extras installed). | `pip install "extract-cli[docx]"` and/or `"extract-cli[pdf]"` for higher fidelity. The core always works without them. |
71
+ | `--llm` only printed a warning | No LLM config found. | Copy [`config/llm.json.example`](config/llm.json.example) to `~/.config/contract-ops/llm.json`. Without it, deterministic output is still returned in full. |
72
+
73
+ ## Recommended usage
74
+
75
+ ```bash
76
+ # Inspect any contract's structure, one tool for five formats.
77
+ extract counterparty.docx | jq '{parties: [.parties[].name],
78
+ governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
79
+
80
+ # Gate a workflow on extraction confidence (non-zero exit if any clause is shaky).
81
+ extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)' && echo ok
82
+ ```
83
+
84
+ The integration contract is the **output schema** + the **shared canonical
85
+ clause vocabulary** (`canonical_title` values match what `template-vault-cli`
86
+ detects and `nda-review-cli` keys policy on) — not per-tool flags. See
87
+ [`docs/INTEROP.md`](docs/INTEROP.md).
@@ -6,6 +6,79 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.9] - 2026-05-22
10
+
11
+ ### Security / robustness
12
+ - **Resource bounds for untrusted input.** A hard on-disk file cap
13
+ (`MAX_INPUT_BYTES`, 100 MB) and a decompressed-size cap
14
+ (`MAX_DECOMPRESSED_BYTES`, 200 MB) so a zip-bomb `.docx` or zlib-bomb `.pdf`
15
+ can't exhaust memory: the DOCX reader checks `word/document.xml`'s
16
+ uncompressed size before reading, and the PDF reader decompresses streams
17
+ with a bounded budget. Both degrade gracefully (warning, empty text), never
18
+ crash. (Verified fast/bounded on a 2 MB doc: ~0.6 s, ~10 MB peak.)
19
+
20
+ ### Added (output schema — minor, backward-compatible additions)
21
+ - **`jurisdiction`** — governing law normalized to a stable code
22
+ (`State of Delaware` → `US-DE`, `Province of Ontario` → `CA-ON`, …).
23
+ - **`amounts[]`** — every distinct monetary amount (`value` remains the headline one).
24
+ - **`signatories[]`** — `{name, title}` from signature blocks (`By:` / `Name:` /
25
+ `Title:`); empty on unsigned templates.
26
+
27
+ ### Changed
28
+ - **Clause vocabulary round 2** (from the corpus survey): canonical
29
+ `Suspension`, `Support`, `Service Levels`; new aliases `invoicing` → Payment,
30
+ customer-data/protection-by-* → Data Protection. Noise filter now also drops
31
+ recitals/preamble/signature sections, definition fragments (a title starting
32
+ with a quote), and unfilled placeholders (`[ # ]%`). Mapped clause coverage
33
+ across the 58-document corpus rose from 57% to **64%**, with no over-matching.
34
+ - Test coverage raised to **92%** (94% with the `[docx]`/`[pdf]` extras).
35
+
36
+ ## [0.1.8] - 2026-05-22
37
+
38
+ Clause-detection breadth, driven by a 58-document real-corpus survey.
39
+
40
+ ### Added
41
+ - **Auto-numbered DOCX clauses.** The DOCX reader now treats `w:numPr` list
42
+ paragraphs (no heading style; number generated from `numbering.xml`) as
43
+ clause-heading candidates, run through the same run-in/heading-likeness filter
44
+ as heading styles. Real agreements that number clauses this way (data
45
+ processing / design-partner agreements) get a clause map where they previously
46
+ got none; deep numbered body sentences are still excluded. New `numbered_docx`
47
+ fixture + tests.
48
+ - **Two-line `ARTICLE N` headings.** A bare `ARTICLE N` / `SECTION N` line whose
49
+ title sits on the next line (common in formal agreements) is detected as a
50
+ pair — recovering, e.g., a real SEC services agreement's clause map (0 → 8).
51
+ Fires only with >= 2 well-formed pairs; reported under the `numbered` tier (no
52
+ schema change).
53
+ - **Expanded canonical clause vocabulary** from the corpus survey: new canonical
54
+ clauses `Exclusions`, `Remedies`, `Restrictions`, `Taxes`,
55
+ `Reservation of Rights`, `Third-Party Beneficiaries`, `Feedback`,
56
+ `Miscellaneous`, plus aliases for `Compliance with Laws` (anti-bribery, export
57
+ controls) and `Data Protection` (customer data/content). ~155 more clauses map
58
+ across the corpus, with no observed over-matching.
59
+ - **`CLAUDE.md`** — codebase development notes (complements AGENTS.md).
60
+
61
+ No output-schema change.
62
+
63
+ ## [0.1.7] - 2026-05-22
64
+
65
+ ### Added
66
+ - **`extract --catalog json` — the suite's shared discovery contract.** Emits
67
+ `{name, bin, version, description, commands[], exitCodes}` (mirroring
68
+ `nda-review-cli --catalog json` / `docx2pdf --catalog json` /
69
+ `sign --catalog json`) so agents can learn every command and flag at startup
70
+ instead of hardcoding them. A test asserts the catalog never drifts from the
71
+ real argparse parser. Also added to the bash/zsh completion flag lists.
72
+ - **`AGENTS.md`** — the agent contract in the suite's canonical section order
73
+ (output contract / exit codes / discovery / failure → recovery).
74
+ - **`llms.txt`** — machine-readable tool summary at the repo root.
75
+
76
+ ### Changed
77
+ - Packaging: added the suite-standard keywords (`contract-ops`, `agent-first`,
78
+ `legal-tech`); README now opens with `## Run this` / `## Where to go next`;
79
+ `--catalog json` documented in the README and `docs/INTEROP.md`. No schema or
80
+ extraction-logic change (`extractor_version` unchanged).
81
+
9
82
  ## [0.1.6] - 2026-05-21
10
83
 
11
84
  ### Docs
@@ -198,6 +271,9 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
198
271
  intentionally *not* governed by the output schema (the schema describes the
199
272
  full default output).
200
273
 
274
+ [0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
275
+ [0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
276
+ [0.1.7]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.7
201
277
  [0.1.6]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.6
202
278
  [0.1.5]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.5
203
279
  [0.1.4]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.4
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.6
3
+ Version: 0.1.9
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -8,7 +8,7 @@ Project-URL: Suite interop, https://github.com/DrBaher/extract-cli/blob/main/doc
8
8
  Author-email: DrBaher <Drbaher@gmail.com>
9
9
  License: MIT
10
10
  License-File: LICENSE
11
- Keywords: clause,cli,contract,extraction,json,legal,nda
11
+ Keywords: agent-first,clause,cli,contract,contract-ops,extraction,json,legal,legal-tech,nda
12
12
  Classifier: Development Status :: 4 - Beta
13
13
  Classifier: Environment :: Console
14
14
  Classifier: Intended Audience :: Developers
@@ -61,6 +61,30 @@ ingest (extract) → review → diff → convert → sign
61
61
  ^you are here
62
62
  ```
63
63
 
64
+ ## Run this
65
+
66
+ ```bash
67
+ pipx run extract-cli demo # zero-config: extract a bundled NDA → structured JSON
68
+ # or, installed: pip install extract-cli && extract demo
69
+ ```
70
+
71
+ That prints the full output contract — parties, dates, term, governing law, and
72
+ a clause map normalized onto the suite's canonical vocabulary — for a bundled
73
+ fixture, with no setup and no network. Point it at your own file with
74
+ `extract path/to/contract.docx`.
75
+
76
+ ## Where to go next
77
+
78
+ - **New here?** Keep reading — [What it does](#what-it-does) and
79
+ [The two extraction tiers](#the-two-extraction-tiers).
80
+ - **Driving it from an agent?** See [`AGENTS.md`](AGENTS.md) and call
81
+ `extract --catalog json` at startup to discover commands/flags. The output
82
+ shape is locked by [`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json).
83
+ - **Wiring it into the pipeline?** See [`docs/INTEROP.md`](docs/INTEROP.md) — the
84
+ contract is the output schema + the shared clause vocabulary.
85
+ - **Contributing / building a sibling CLI?** [`CONTRIBUTING.md`](CONTRIBUTING.md)
86
+ and [`ARCHITECTURE.md`](ARCHITECTURE.md).
87
+
64
88
  ## What it does
65
89
 
66
90
  Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
@@ -115,6 +139,7 @@ for them.
115
139
 
116
140
  ```bash
117
141
  extract <path> # parse a document → structured JSON on stdout (default)
142
+ extract --catalog json # machine-readable catalog of commands/flags (agents call at startup)
118
143
  extract schema # print the output JSON Schema (the cross-CLI contract)
119
144
  extract fields # list extractable fields and their tier
120
145
  extract demo # run on a bundled fixture and show the narrative
@@ -125,6 +150,7 @@ extract completion bash # emit a shell-completion script (bash|zsh)
125
150
 
126
151
  | Flag | Meaning |
127
152
  |---|---|
153
+ | `--catalog json` | Print the machine-readable command/flag catalog and exit (the suite discovery contract; agents call this at startup) |
128
154
  | `--llm` | Opt-in LLM enrichment of fuzzy fields (off by default) |
129
155
  | `--fields a,b,c` | Emit only a subset of top-level fields (e.g. `parties,clauses`) |
130
156
  | `--format json\|table` | Output format (default `json`) |
@@ -148,10 +174,13 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
148
174
  "dates": { "effective": { "value": "2024-03-01", "confidence": 0.85, "source": "deterministic" }, "expiration": { "value": null, "confidence": 0.0, "source": "none" } },
149
175
  "term": { "length": { "value": "3 years", ... }, "auto_renew": { "value": true, ... }, "notice_period_days": { "value": 60, ... } },
150
176
  "governing_law": { "value": "State of Delaware", "confidence": 0.85, "source": "deterministic" },
177
+ "jurisdiction": { "value": "US-DE", "confidence": 0.8, "source": "deterministic" },
151
178
  "clauses": [ { "canonical_title": "Confidentiality", "detected_title": "## Confidentiality Obligations", "tier": "h2", "span": {"start": 0, "end": 120}, "confidence": 0.95, "source": "deterministic", "mapped": true } ],
152
179
  "defined_terms": [ { "term": "Confidential Information", "confidence": 0.6, "source": "deterministic" } ],
153
180
  "value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
154
- "_meta": { "extractor_version": "0.1.0", "tiers_used": ["deterministic"], "llm_used": false }
181
+ "amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
182
+ "signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
183
+ "_meta": { "extractor_version": "0.1.9", "tiers_used": ["deterministic"], "llm_used": false }
155
184
  }
156
185
  ```
157
186
 
@@ -23,6 +23,30 @@ ingest (extract) → review → diff → convert → sign
23
23
  ^you are here
24
24
  ```
25
25
 
26
+ ## Run this
27
+
28
+ ```bash
29
+ pipx run extract-cli demo # zero-config: extract a bundled NDA → structured JSON
30
+ # or, installed: pip install extract-cli && extract demo
31
+ ```
32
+
33
+ That prints the full output contract — parties, dates, term, governing law, and
34
+ a clause map normalized onto the suite's canonical vocabulary — for a bundled
35
+ fixture, with no setup and no network. Point it at your own file with
36
+ `extract path/to/contract.docx`.
37
+
38
+ ## Where to go next
39
+
40
+ - **New here?** Keep reading — [What it does](#what-it-does) and
41
+ [The two extraction tiers](#the-two-extraction-tiers).
42
+ - **Driving it from an agent?** See [`AGENTS.md`](AGENTS.md) and call
43
+ `extract --catalog json` at startup to discover commands/flags. The output
44
+ shape is locked by [`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json).
45
+ - **Wiring it into the pipeline?** See [`docs/INTEROP.md`](docs/INTEROP.md) — the
46
+ contract is the output schema + the shared clause vocabulary.
47
+ - **Contributing / building a sibling CLI?** [`CONTRIBUTING.md`](CONTRIBUTING.md)
48
+ and [`ARCHITECTURE.md`](ARCHITECTURE.md).
49
+
26
50
  ## What it does
27
51
 
28
52
  Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
@@ -77,6 +101,7 @@ for them.
77
101
 
78
102
  ```bash
79
103
  extract <path> # parse a document → structured JSON on stdout (default)
104
+ extract --catalog json # machine-readable catalog of commands/flags (agents call at startup)
80
105
  extract schema # print the output JSON Schema (the cross-CLI contract)
81
106
  extract fields # list extractable fields and their tier
82
107
  extract demo # run on a bundled fixture and show the narrative
@@ -87,6 +112,7 @@ extract completion bash # emit a shell-completion script (bash|zsh)
87
112
 
88
113
  | Flag | Meaning |
89
114
  |---|---|
115
+ | `--catalog json` | Print the machine-readable command/flag catalog and exit (the suite discovery contract; agents call this at startup) |
90
116
  | `--llm` | Opt-in LLM enrichment of fuzzy fields (off by default) |
91
117
  | `--fields a,b,c` | Emit only a subset of top-level fields (e.g. `parties,clauses`) |
92
118
  | `--format json\|table` | Output format (default `json`) |
@@ -110,10 +136,13 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
110
136
  "dates": { "effective": { "value": "2024-03-01", "confidence": 0.85, "source": "deterministic" }, "expiration": { "value": null, "confidence": 0.0, "source": "none" } },
111
137
  "term": { "length": { "value": "3 years", ... }, "auto_renew": { "value": true, ... }, "notice_period_days": { "value": 60, ... } },
112
138
  "governing_law": { "value": "State of Delaware", "confidence": 0.85, "source": "deterministic" },
139
+ "jurisdiction": { "value": "US-DE", "confidence": 0.8, "source": "deterministic" },
113
140
  "clauses": [ { "canonical_title": "Confidentiality", "detected_title": "## Confidentiality Obligations", "tier": "h2", "span": {"start": 0, "end": 120}, "confidence": 0.95, "source": "deterministic", "mapped": true } ],
114
141
  "defined_terms": [ { "term": "Confidential Information", "confidence": 0.6, "source": "deterministic" } ],
115
142
  "value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
116
- "_meta": { "extractor_version": "0.1.0", "tiers_used": ["deterministic"], "llm_used": false }
143
+ "amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
144
+ "signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
145
+ "_meta": { "extractor_version": "0.1.9", "tiers_used": ["deterministic"], "llm_used": false }
117
146
  }
118
147
  ```
119
148
 
@@ -52,10 +52,12 @@ is a self-contained reference validator.
52
52
 
53
53
  Top-level keys: `document` {title, format, sha256, source_path}, `parties[]`,
54
54
  `dates` {effective, expiration}, `term` {length, auto_renew,
55
- notice_period_days, *renewal_mechanics?*}, `governing_law`, `clauses[]`
56
- {canonical_title, detected_title, tier, span, confidence, source, mapped},
57
- `defined_terms[]`, `value`, *`obligations[]?`*, and `_meta` {extractor_version,
58
- tiers_used, llm_used}. **Every extracted field carries a `confidence` (0–1) and
55
+ notice_period_days, *renewal_mechanics?*}, `governing_law`, `jurisdiction`
56
+ (normalized code, e.g. `US-DE`), `clauses[]` {canonical_title, detected_title,
57
+ tier, span, confidence, source, mapped}, `defined_terms[]`, `value`,
58
+ `amounts[]` (all monetary amounts), `signatories[]` {name, title}, *`obligations[]?`*,
59
+ and `_meta` {extractor_version, tiers_used, llm_used}. Formats: markdown, text,
60
+ html, docx, pdf. **Every extracted field carries a `confidence` (0–1) and
59
61
  a `source` ∈ {deterministic, llm, none}.** Scalar fields use the envelope
60
62
  `{value, confidence, source}`; "not found" is `{value: null, confidence: 0.0,
61
63
  source: "none"}`. Italic fields are added only under `--llm`.
@@ -118,6 +120,7 @@ only stdlib `urllib`, so there is no runtime dependency.
118
120
  | Concern | Convention |
119
121
  |---|---|
120
122
  | Primary result | **stdout** (JSON payload, default) |
123
+ | Discovery | `extract --catalog json` (commands/flags, the suite contract) + `extract schema` / `extract fields --json` |
121
124
  | `--why`, warnings, errors | **stderr** |
122
125
  | `--why` envelope | plain-text `[why] <header>` block (as in template-vault-cli / draft-cli) |
123
126
  | Quiet | `-q` / `--silent` / `--quiet` aliases |
@@ -10,9 +10,12 @@
10
10
  "dates",
11
11
  "term",
12
12
  "governing_law",
13
+ "jurisdiction",
13
14
  "clauses",
14
15
  "defined_terms",
15
16
  "value",
17
+ "amounts",
18
+ "signatories",
16
19
  "_meta"
17
20
  ],
18
21
  "additionalProperties": false,
@@ -157,6 +160,9 @@
157
160
  "governing_law": {
158
161
  "$ref": "#/$defs/field"
159
162
  },
163
+ "jurisdiction": {
164
+ "$ref": "#/$defs/field"
165
+ },
160
166
  "clauses": {
161
167
  "type": "array",
162
168
  "items": {
@@ -247,6 +253,58 @@
247
253
  "value": {
248
254
  "$ref": "#/$defs/field"
249
255
  },
256
+ "amounts": {
257
+ "type": "array",
258
+ "items": {
259
+ "type": "object",
260
+ "required": [
261
+ "value",
262
+ "confidence",
263
+ "source"
264
+ ],
265
+ "properties": {
266
+ "value": {
267
+ "type": "string"
268
+ },
269
+ "confidence": {
270
+ "$ref": "#/$defs/confidence"
271
+ },
272
+ "source": {
273
+ "$ref": "#/$defs/source"
274
+ }
275
+ },
276
+ "additionalProperties": false
277
+ }
278
+ },
279
+ "signatories": {
280
+ "type": "array",
281
+ "items": {
282
+ "type": "object",
283
+ "required": [
284
+ "name",
285
+ "confidence",
286
+ "source"
287
+ ],
288
+ "properties": {
289
+ "name": {
290
+ "type": "string"
291
+ },
292
+ "title": {
293
+ "type": [
294
+ "string",
295
+ "null"
296
+ ]
297
+ },
298
+ "confidence": {
299
+ "$ref": "#/$defs/confidence"
300
+ },
301
+ "source": {
302
+ "$ref": "#/$defs/source"
303
+ }
304
+ },
305
+ "additionalProperties": false
306
+ }
307
+ },
250
308
  "obligations": {
251
309
  "type": "array",
252
310
  "items": {