extract-cli 0.1.6__tar.gz → 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_cli-0.1.9/AGENTS.md +87 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/CHANGELOG.md +76 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/PKG-INFO +32 -3
- {extract_cli-0.1.6 → extract_cli-0.1.9}/README.md +30 -1
- {extract_cli-0.1.6 → extract_cli-0.1.9}/docs/INTEROP.md +7 -4
- {extract_cli-0.1.6 → extract_cli-0.1.9}/docs/spec/extract-output.schema.json +58 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/extract_cli.py +344 -23
- extract_cli-0.1.9/llms.txt +79 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/pyproject.toml +7 -2
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/_fixtures_build.py +36 -2
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/_make_goldens.py +2 -2
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/conftest.py +1 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/employment_docx.docx.expected.json +14 -1
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/heading_docx.docx.expected.json +8 -1
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/lease_allcaps.txt.expected.json +14 -1
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/license_pdf.pdf.expected.json +14 -1
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/nda_h2.md.expected.json +8 -1
- extract_cli-0.1.9/tests/fixtures/numbered_docx.docx +0 -0
- extract_cli-0.1.9/tests/fixtures/numbered_docx.docx.expected.json +149 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/scanned.pdf.expected.json +8 -1
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/services_bold.txt.expected.json +14 -1
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/services_html.html.expected.json +17 -4
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_clause_map.py +38 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_cli.py +63 -1
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_deterministic.py +25 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_misc.py +76 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_property.py +18 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/.gitignore +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/ARCHITECTURE.md +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/LICENSE +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/Makefile +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/config/llm.json.example +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/scripts/release.py +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/heading_docx.docx +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_llm.py +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.9}/tests/test_schema_conformance.py +0 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Agents
|
|
2
|
+
|
|
3
|
+
Drive `extract-cli` from an LLM agent or non-interactive client. Same agent
|
|
4
|
+
contract as the rest of the contract-ops suite: a stable machine-readable
|
|
5
|
+
catalog, JSON on stdout, human-readable messages on stderr, and a small documented exit-code set.
|
|
6
|
+
|
|
7
|
+
`extract-cli` is the suite's **open-loop front door**: hand it any contract
|
|
8
|
+
(`.md` / `.txt` / `.html` / `.docx` / `.pdf`, yours or a counterparty's) and it
|
|
9
|
+
returns structured JSON the rest of the pipeline can consume. Every field
|
|
10
|
+
carries a `confidence` and a `source` — **verify, don't trust**.
|
|
11
|
+
|
|
12
|
+
## Output contract
|
|
13
|
+
|
|
14
|
+
- **Success**: a single JSON object to **stdout**, exit `0`. This is the machine
|
|
15
|
+
payload; it's the default (no `--json` needed, though `--json` forces it).
|
|
16
|
+
- Every extracted scalar is the envelope `{value, confidence, source}`;
|
|
17
|
+
"not found" is the canonical `{value: null, confidence: 0.0, source: "none"}`.
|
|
18
|
+
Lists (`parties`, `clauses`, `defined_terms`) carry per-item
|
|
19
|
+
`confidence`/`source`. `source ∈ {deterministic, llm, none}`.
|
|
20
|
+
- `_meta` records `extractor_version`, `tiers_used`, and `llm_used`.
|
|
21
|
+
- The output shape is locked by a JSON Schema —
|
|
22
|
+
[`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json),
|
|
23
|
+
also printed by `extract schema`. Validate against it instead of trusting
|
|
24
|
+
field shapes by convention. (Note: the `--no-confidence` projection is a
|
|
25
|
+
reduced convenience view, **not** governed by the schema.)
|
|
26
|
+
- **stderr** is for humans only: `--why` rationale, warnings, and errors.
|
|
27
|
+
stdout stays clean JSON even under `--why`.
|
|
28
|
+
- **Failure**: a one-line `error: <message>` on **stderr**, non-zero exit.
|
|
29
|
+
The error shape is a flat string (the suite is not uniform on error-object
|
|
30
|
+
shape) — **branch on the exit code, never on the human-readable message.**
|
|
31
|
+
|
|
32
|
+
## Exit codes
|
|
33
|
+
|
|
34
|
+
| Code | Meaning |
|
|
35
|
+
|------|---------|
|
|
36
|
+
| `0` | Success. |
|
|
37
|
+
| `1` | Low-signal document — no high-signal fields (parties/clauses/dates) could be extracted; e.g. a scanned/image-only or empty file. A **finding**, not a crash: valid JSON is still emitted on stdout. |
|
|
38
|
+
| `2` | Bad usage / user-actionable error (unreadable path, bad flag value, unsupported completion shell). |
|
|
39
|
+
|
|
40
|
+
## Discovery
|
|
41
|
+
|
|
42
|
+
Never hardcode command or flag names — call the catalog at startup:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
extract --catalog json # {name, bin, version, description, commands[], exitCodes}
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
`--catalog json` is the suite-wide discovery contract (parallel to
|
|
49
|
+
`nda-review-cli --catalog json`, `docx2pdf --catalog json`,
|
|
50
|
+
`sign --catalog json`). It is **complete, accurate, and stable across minor
|
|
51
|
+
versions** — a test asserts it never drifts from the real parser.
|
|
52
|
+
|
|
53
|
+
Tool-specific discovery extras:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
extract schema # the output JSON Schema (the cross-CLI data contract)
|
|
57
|
+
extract fields # extractable fields and the tier that produces each
|
|
58
|
+
extract fields --json # ...as JSON
|
|
59
|
+
extract demo # run on a bundled fixture (zero-config first run)
|
|
60
|
+
extract --version
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Failure → recovery
|
|
64
|
+
|
|
65
|
+
| Symptom | Diagnose | Recover |
|
|
66
|
+
|---|---|---|
|
|
67
|
+
| Exit `1`, warning "no high-signal fields" | The document is likely scanned/image-only or has no recognizable structure. JSON is still emitted. | OCR the source first, or feed a text/`.docx`/`.md` version. The empty-but-valid JSON is safe to pass downstream. |
|
|
68
|
+
| Exit `2`, `error: ...` | `extract --catalog json` (or `extract <cmd> --help`) for the real surface. | Fix the path/flag and retry. |
|
|
69
|
+
| `clauses: []` on a real contract | The `.docx` likely auto-numbers via Word's numbering with no heading style (its numbers live only in `numbering.xml`), so the deterministic cascade sees no headings. | Re-run with `--llm` (opt-in): when no clauses are detected, the LLM is asked for section headings, normalized through the same canonical vocabulary and emitted with `tier: "llm"`, `source: "llm"`, and a modest confidence. Requires `~/.config/contract-ops/llm.json`. |
|
|
70
|
+
| Low-fidelity `.docx`/`.pdf` text | The stdlib best-effort reader ran (no extras installed). | `pip install "extract-cli[docx]"` and/or `"extract-cli[pdf]"` for higher fidelity. The core always works without them. |
|
|
71
|
+
| `--llm` only printed a warning | No LLM config found. | Copy [`config/llm.json.example`](config/llm.json.example) to `~/.config/contract-ops/llm.json`. Without it, deterministic output is still returned in full. |
|
|
72
|
+
|
|
73
|
+
## Recommended usage
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Inspect any contract's structure, one tool for five formats.
|
|
77
|
+
extract counterparty.docx | jq '{parties: [.parties[].name],
|
|
78
|
+
governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
|
|
79
|
+
|
|
80
|
+
# Gate a workflow on extraction confidence (non-zero exit if any clause is shaky).
|
|
81
|
+
extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)' && echo ok
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
The integration contract is the **output schema** + the **shared canonical
|
|
85
|
+
clause vocabulary** (`canonical_title` values match what `template-vault-cli`
|
|
86
|
+
detects and `nda-review-cli` keys policy on) — not per-tool flags. See
|
|
87
|
+
[`docs/INTEROP.md`](docs/INTEROP.md).
|
|
@@ -6,6 +6,79 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.9] - 2026-05-22
|
|
10
|
+
|
|
11
|
+
### Security / robustness
|
|
12
|
+
- **Resource bounds for untrusted input.** A hard on-disk file cap
|
|
13
|
+
(`MAX_INPUT_BYTES`, 100 MB) and a decompressed-size cap
|
|
14
|
+
(`MAX_DECOMPRESSED_BYTES`, 200 MB) so a zip-bomb `.docx` or zlib-bomb `.pdf`
|
|
15
|
+
can't exhaust memory: the DOCX reader checks `word/document.xml`'s
|
|
16
|
+
uncompressed size before reading, and the PDF reader decompresses streams
|
|
17
|
+
with a bounded budget. Both degrade gracefully (warning, empty text), never
|
|
18
|
+
crash. (Verified fast/bounded on a 2 MB doc: ~0.6 s, ~10 MB peak.)
|
|
19
|
+
|
|
20
|
+
### Added (output schema — minor, backward-compatible additions)
|
|
21
|
+
- **`jurisdiction`** — governing law normalized to a stable code
|
|
22
|
+
(`State of Delaware` → `US-DE`, `Province of Ontario` → `CA-ON`, …).
|
|
23
|
+
- **`amounts[]`** — every distinct monetary amount (`value` remains the headline one).
|
|
24
|
+
- **`signatories[]`** — `{name, title}` from signature blocks (`By:` / `Name:` /
|
|
25
|
+
`Title:`); empty on unsigned templates.
|
|
26
|
+
|
|
27
|
+
### Changed
|
|
28
|
+
- **Clause vocabulary round 2** (from the corpus survey): canonical
|
|
29
|
+
`Suspension`, `Support`, `Service Levels` + `invoicing`→Payment,
|
|
30
|
+
customer-data/protection-by-* → Data Protection. Noise filter now also drops
|
|
31
|
+
recitals/preamble/signature sections, definition fragments (a title starting
|
|
32
|
+
with a quote), and unfilled placeholders (`[ # ]%`). Mapped clause coverage
|
|
33
|
+
across the 58-document corpus rose from 57% to **64%**, with no over-matching.
|
|
34
|
+
- Test coverage raised to **92%** (94% with the `[docx]`/`[pdf]` extras).
|
|
35
|
+
|
|
36
|
+
## [0.1.8] - 2026-05-22
|
|
37
|
+
|
|
38
|
+
Clause-detection breadth, driven by a 58-document real-corpus survey.
|
|
39
|
+
|
|
40
|
+
### Added
|
|
41
|
+
- **Auto-numbered DOCX clauses.** The DOCX reader now treats `w:numPr` list
|
|
42
|
+
paragraphs (no heading style; number generated from `numbering.xml`) as
|
|
43
|
+
clause-heading candidates, run through the same run-in/heading-likeness filter
|
|
44
|
+
as heading styles. Real agreements that number clauses this way (data
|
|
45
|
+
processing / design-partner agreements) get a clause map where they previously
|
|
46
|
+
got none; deep numbered body sentences are still excluded. New `numbered_docx`
|
|
47
|
+
fixture + tests.
|
|
48
|
+
- **Two-line `ARTICLE N` headings.** A bare `ARTICLE N` / `SECTION N` line whose
|
|
49
|
+
title sits on the next line (common in formal agreements) is detected as a
|
|
50
|
+
pair — recovering, e.g., a real SEC services agreement's clause map (0 → 8).
|
|
51
|
+
Fires only with >= 2 well-formed pairs; reported under the `numbered` tier (no
|
|
52
|
+
schema change).
|
|
53
|
+
- **Expanded canonical clause vocabulary** from the corpus survey: new canonical
|
|
54
|
+
clauses `Exclusions`, `Remedies`, `Restrictions`, `Taxes`,
|
|
55
|
+
`Reservation of Rights`, `Third-Party Beneficiaries`, `Feedback`,
|
|
56
|
+
`Miscellaneous`, plus aliases for `Compliance with Laws` (anti-bribery, export
|
|
57
|
+
controls) and `Data Protection` (customer data/content). ~155 more clauses map
|
|
58
|
+
across the corpus, with no observed over-matching.
|
|
59
|
+
- **`CLAUDE.md`** — codebase development notes (complements AGENTS.md).
|
|
60
|
+
|
|
61
|
+
No output-schema change.
|
|
62
|
+
|
|
63
|
+
## [0.1.7] - 2026-05-22
|
|
64
|
+
|
|
65
|
+
### Added
|
|
66
|
+
- **`extract --catalog json` — the suite's shared discovery contract.** Emits
|
|
67
|
+
`{name, bin, version, description, commands[], exitCodes}` (mirroring
|
|
68
|
+
`nda-review-cli --catalog json` / `docx2pdf --catalog json` /
|
|
69
|
+
`sign --catalog json`) so agents can learn every command and flag at startup
|
|
70
|
+
instead of hardcoding them. A test asserts the catalog never drifts from the
|
|
71
|
+
real argparse parser. Also added to the bash/zsh completion flag lists.
|
|
72
|
+
- **`AGENTS.md`** — the agent contract in the suite's canonical section order
|
|
73
|
+
(output contract / exit codes / discovery / failure → recovery).
|
|
74
|
+
- **`llms.txt`** — machine-readable tool summary at the repo root.
|
|
75
|
+
|
|
76
|
+
### Changed
|
|
77
|
+
- Packaging: added the suite-standard keywords (`contract-ops`, `agent-first`,
|
|
78
|
+
`legal-tech`); README now opens with `## Run this` / `## Where to go next`;
|
|
79
|
+
`--catalog json` documented in the README and `docs/INTEROP.md`. No schema or
|
|
80
|
+
extraction-logic change (`extractor_version` unchanged).
|
|
81
|
+
|
|
9
82
|
## [0.1.6] - 2026-05-21
|
|
10
83
|
|
|
11
84
|
### Docs
|
|
@@ -198,6 +271,9 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
198
271
|
intentionally *not* governed by the output schema (the schema describes the
|
|
199
272
|
full default output).
|
|
200
273
|
|
|
274
|
+
[0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
|
|
275
|
+
[0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
|
|
276
|
+
[0.1.7]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.7
|
|
201
277
|
[0.1.6]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.6
|
|
202
278
|
[0.1.5]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.5
|
|
203
279
|
[0.1.4]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.4
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.6
|
|
3
|
+
Version: 0.1.9
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -8,7 +8,7 @@ Project-URL: Suite interop, https://github.com/DrBaher/extract-cli/blob/main/doc
|
|
|
8
8
|
Author-email: DrBaher <Drbaher@gmail.com>
|
|
9
9
|
License: MIT
|
|
10
10
|
License-File: LICENSE
|
|
11
|
-
Keywords: clause,cli,contract,extraction,json,legal,nda
|
|
11
|
+
Keywords: agent-first,clause,cli,contract,contract-ops,extraction,json,legal,legal-tech,nda
|
|
12
12
|
Classifier: Development Status :: 4 - Beta
|
|
13
13
|
Classifier: Environment :: Console
|
|
14
14
|
Classifier: Intended Audience :: Developers
|
|
@@ -61,6 +61,30 @@ ingest (extract) → review → diff → convert → sign
|
|
|
61
61
|
^you are here
|
|
62
62
|
```
|
|
63
63
|
|
|
64
|
+
## Run this
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pipx run extract-cli demo # zero-config: extract a bundled NDA → structured JSON
|
|
68
|
+
# or, installed: pip install extract-cli && extract demo
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
That prints the full output contract — parties, dates, term, governing law, and
|
|
72
|
+
a clause map normalized onto the suite's canonical vocabulary — for a bundled
|
|
73
|
+
fixture, with no setup and no network. Point it at your own file with
|
|
74
|
+
`extract path/to/contract.docx`.
|
|
75
|
+
|
|
76
|
+
## Where to go next
|
|
77
|
+
|
|
78
|
+
- **New here?** Keep reading — [What it does](#what-it-does) and
|
|
79
|
+
[The two extraction tiers](#the-two-extraction-tiers).
|
|
80
|
+
- **Driving it from an agent?** See [`AGENTS.md`](AGENTS.md) and call
|
|
81
|
+
`extract --catalog json` at startup to discover commands/flags. The output
|
|
82
|
+
shape is locked by [`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json).
|
|
83
|
+
- **Wiring it into the pipeline?** See [`docs/INTEROP.md`](docs/INTEROP.md) — the
|
|
84
|
+
contract is the output schema + the shared clause vocabulary.
|
|
85
|
+
- **Contributing / building a sibling CLI?** [`CONTRIBUTING.md`](CONTRIBUTING.md)
|
|
86
|
+
and [`ARCHITECTURE.md`](ARCHITECTURE.md).
|
|
87
|
+
|
|
64
88
|
## What it does
|
|
65
89
|
|
|
66
90
|
Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
|
|
@@ -115,6 +139,7 @@ for them.
|
|
|
115
139
|
|
|
116
140
|
```bash
|
|
117
141
|
extract <path> # parse a document → structured JSON on stdout (default)
|
|
142
|
+
extract --catalog json # machine-readable catalog of commands/flags (agents call at startup)
|
|
118
143
|
extract schema # print the output JSON Schema (the cross-CLI contract)
|
|
119
144
|
extract fields # list extractable fields and their tier
|
|
120
145
|
extract demo # run on a bundled fixture and show the narrative
|
|
@@ -125,6 +150,7 @@ extract completion bash # emit a shell-completion script (bash|zsh)
|
|
|
125
150
|
|
|
126
151
|
| Flag | Meaning |
|
|
127
152
|
|---|---|
|
|
153
|
+
| `--catalog json` | Print the machine-readable command/flag catalog and exit (the suite discovery contract; agents call this at startup) |
|
|
128
154
|
| `--llm` | Opt-in LLM enrichment of fuzzy fields (off by default) |
|
|
129
155
|
| `--fields a,b,c` | Emit only a subset of top-level fields (e.g. `parties,clauses`) |
|
|
130
156
|
| `--format json\|table` | Output format (default `json`) |
|
|
@@ -148,10 +174,13 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
|
|
|
148
174
|
"dates": { "effective": { "value": "2024-03-01", "confidence": 0.85, "source": "deterministic" }, "expiration": { "value": null, "confidence": 0.0, "source": "none" } },
|
|
149
175
|
"term": { "length": { "value": "3 years", ... }, "auto_renew": { "value": true, ... }, "notice_period_days": { "value": 60, ... } },
|
|
150
176
|
"governing_law": { "value": "State of Delaware", "confidence": 0.85, "source": "deterministic" },
|
|
177
|
+
"jurisdiction": { "value": "US-DE", "confidence": 0.8, "source": "deterministic" },
|
|
151
178
|
"clauses": [ { "canonical_title": "Confidentiality", "detected_title": "## Confidentiality Obligations", "tier": "h2", "span": {"start": 0, "end": 120}, "confidence": 0.95, "source": "deterministic", "mapped": true } ],
|
|
152
179
|
"defined_terms": [ { "term": "Confidential Information", "confidence": 0.6, "source": "deterministic" } ],
|
|
153
180
|
"value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
|
|
154
|
-
"
|
|
181
|
+
"amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
|
|
182
|
+
"signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
|
|
183
|
+
"_meta": { "extractor_version": "0.1.9", "tiers_used": ["deterministic"], "llm_used": false }
|
|
155
184
|
}
|
|
156
185
|
```
|
|
157
186
|
|
|
@@ -23,6 +23,30 @@ ingest (extract) → review → diff → convert → sign
|
|
|
23
23
|
^you are here
|
|
24
24
|
```
|
|
25
25
|
|
|
26
|
+
## Run this
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pipx run extract-cli demo # zero-config: extract a bundled NDA → structured JSON
|
|
30
|
+
# or, installed: pip install extract-cli && extract demo
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
That prints the full output contract — parties, dates, term, governing law, and
|
|
34
|
+
a clause map normalized onto the suite's canonical vocabulary — for a bundled
|
|
35
|
+
fixture, with no setup and no network. Point it at your own file with
|
|
36
|
+
`extract path/to/contract.docx`.
|
|
37
|
+
|
|
38
|
+
## Where to go next
|
|
39
|
+
|
|
40
|
+
- **New here?** Keep reading — [What it does](#what-it-does) and
|
|
41
|
+
[The two extraction tiers](#the-two-extraction-tiers).
|
|
42
|
+
- **Driving it from an agent?** See [`AGENTS.md`](AGENTS.md) and call
|
|
43
|
+
`extract --catalog json` at startup to discover commands/flags. The output
|
|
44
|
+
shape is locked by [`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json).
|
|
45
|
+
- **Wiring it into the pipeline?** See [`docs/INTEROP.md`](docs/INTEROP.md) — the
|
|
46
|
+
contract is the output schema + the shared clause vocabulary.
|
|
47
|
+
- **Contributing / building a sibling CLI?** [`CONTRIBUTING.md`](CONTRIBUTING.md)
|
|
48
|
+
and [`ARCHITECTURE.md`](ARCHITECTURE.md).
|
|
49
|
+
|
|
26
50
|
## What it does
|
|
27
51
|
|
|
28
52
|
Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
|
|
@@ -77,6 +101,7 @@ for them.
|
|
|
77
101
|
|
|
78
102
|
```bash
|
|
79
103
|
extract <path> # parse a document → structured JSON on stdout (default)
|
|
104
|
+
extract --catalog json # machine-readable catalog of commands/flags (agents call at startup)
|
|
80
105
|
extract schema # print the output JSON Schema (the cross-CLI contract)
|
|
81
106
|
extract fields # list extractable fields and their tier
|
|
82
107
|
extract demo # run on a bundled fixture and show the narrative
|
|
@@ -87,6 +112,7 @@ extract completion bash # emit a shell-completion script (bash|zsh)
|
|
|
87
112
|
|
|
88
113
|
| Flag | Meaning |
|
|
89
114
|
|---|---|
|
|
115
|
+
| `--catalog json` | Print the machine-readable command/flag catalog and exit (the suite discovery contract; agents call this at startup) |
|
|
90
116
|
| `--llm` | Opt-in LLM enrichment of fuzzy fields (off by default) |
|
|
91
117
|
| `--fields a,b,c` | Emit only a subset of top-level fields (e.g. `parties,clauses`) |
|
|
92
118
|
| `--format json\|table` | Output format (default `json`) |
|
|
@@ -110,10 +136,13 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
|
|
|
110
136
|
"dates": { "effective": { "value": "2024-03-01", "confidence": 0.85, "source": "deterministic" }, "expiration": { "value": null, "confidence": 0.0, "source": "none" } },
|
|
111
137
|
"term": { "length": { "value": "3 years", ... }, "auto_renew": { "value": true, ... }, "notice_period_days": { "value": 60, ... } },
|
|
112
138
|
"governing_law": { "value": "State of Delaware", "confidence": 0.85, "source": "deterministic" },
|
|
139
|
+
"jurisdiction": { "value": "US-DE", "confidence": 0.8, "source": "deterministic" },
|
|
113
140
|
"clauses": [ { "canonical_title": "Confidentiality", "detected_title": "## Confidentiality Obligations", "tier": "h2", "span": {"start": 0, "end": 120}, "confidence": 0.95, "source": "deterministic", "mapped": true } ],
|
|
114
141
|
"defined_terms": [ { "term": "Confidential Information", "confidence": 0.6, "source": "deterministic" } ],
|
|
115
142
|
"value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
|
|
116
|
-
"
|
|
143
|
+
"amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
|
|
144
|
+
"signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
|
|
145
|
+
"_meta": { "extractor_version": "0.1.9", "tiers_used": ["deterministic"], "llm_used": false }
|
|
117
146
|
}
|
|
118
147
|
```
|
|
119
148
|
|
|
@@ -52,10 +52,12 @@ is a self-contained reference validator.
|
|
|
52
52
|
|
|
53
53
|
Top-level keys: `document` {title, format, sha256, source_path}, `parties[]`,
|
|
54
54
|
`dates` {effective, expiration}, `term` {length, auto_renew,
|
|
55
|
-
notice_period_days, *renewal_mechanics?*}, `governing_law`, `
|
|
56
|
-
|
|
57
|
-
`defined_terms[]`, `value`,
|
|
58
|
-
|
|
55
|
+
notice_period_days, *renewal_mechanics?*}, `governing_law`, `jurisdiction`
|
|
56
|
+
(normalized code, e.g. `US-DE`), `clauses[]` {canonical_title, detected_title,
|
|
57
|
+
tier, span, confidence, source, mapped}, `defined_terms[]`, `value`,
|
|
58
|
+
`amounts[]` (all monetary amounts), `signatories[]` {name, title}, *`obligations[]?`*,
|
|
59
|
+
and `_meta` {extractor_version, tiers_used, llm_used}. Formats: markdown, text,
|
|
60
|
+
html, docx, pdf. **Every extracted field carries a `confidence` (0–1) and
|
|
59
61
|
a `source` ∈ {deterministic, llm, none}.** Scalar fields use the envelope
|
|
60
62
|
`{value, confidence, source}`; "not found" is `{value: null, confidence: 0.0,
|
|
61
63
|
source: "none"}`. Italic fields are added only under `--llm`.
|
|
@@ -118,6 +120,7 @@ only stdlib `urllib`, so there is no runtime dependency.
|
|
|
118
120
|
| Concern | Convention |
|
|
119
121
|
|---|---|
|
|
120
122
|
| Primary result | **stdout** (JSON payload, default) |
|
|
123
|
+
| Discovery | `extract --catalog json` (commands/flags, the suite contract) + `extract schema` / `extract fields --json` |
|
|
121
124
|
| `--why`, warnings, errors | **stderr** |
|
|
122
125
|
| `--why` envelope | plain-text `[why] <header>` block (as in template-vault-cli / draft-cli) |
|
|
123
126
|
| Quiet | `-q` / `--silent` / `--quiet` aliases |
|
|
@@ -10,9 +10,12 @@
|
|
|
10
10
|
"dates",
|
|
11
11
|
"term",
|
|
12
12
|
"governing_law",
|
|
13
|
+
"jurisdiction",
|
|
13
14
|
"clauses",
|
|
14
15
|
"defined_terms",
|
|
15
16
|
"value",
|
|
17
|
+
"amounts",
|
|
18
|
+
"signatories",
|
|
16
19
|
"_meta"
|
|
17
20
|
],
|
|
18
21
|
"additionalProperties": false,
|
|
@@ -157,6 +160,9 @@
|
|
|
157
160
|
"governing_law": {
|
|
158
161
|
"$ref": "#/$defs/field"
|
|
159
162
|
},
|
|
163
|
+
"jurisdiction": {
|
|
164
|
+
"$ref": "#/$defs/field"
|
|
165
|
+
},
|
|
160
166
|
"clauses": {
|
|
161
167
|
"type": "array",
|
|
162
168
|
"items": {
|
|
@@ -247,6 +253,58 @@
|
|
|
247
253
|
"value": {
|
|
248
254
|
"$ref": "#/$defs/field"
|
|
249
255
|
},
|
|
256
|
+
"amounts": {
|
|
257
|
+
"type": "array",
|
|
258
|
+
"items": {
|
|
259
|
+
"type": "object",
|
|
260
|
+
"required": [
|
|
261
|
+
"value",
|
|
262
|
+
"confidence",
|
|
263
|
+
"source"
|
|
264
|
+
],
|
|
265
|
+
"properties": {
|
|
266
|
+
"value": {
|
|
267
|
+
"type": "string"
|
|
268
|
+
},
|
|
269
|
+
"confidence": {
|
|
270
|
+
"$ref": "#/$defs/confidence"
|
|
271
|
+
},
|
|
272
|
+
"source": {
|
|
273
|
+
"$ref": "#/$defs/source"
|
|
274
|
+
}
|
|
275
|
+
},
|
|
276
|
+
"additionalProperties": false
|
|
277
|
+
}
|
|
278
|
+
},
|
|
279
|
+
"signatories": {
|
|
280
|
+
"type": "array",
|
|
281
|
+
"items": {
|
|
282
|
+
"type": "object",
|
|
283
|
+
"required": [
|
|
284
|
+
"name",
|
|
285
|
+
"confidence",
|
|
286
|
+
"source"
|
|
287
|
+
],
|
|
288
|
+
"properties": {
|
|
289
|
+
"name": {
|
|
290
|
+
"type": "string"
|
|
291
|
+
},
|
|
292
|
+
"title": {
|
|
293
|
+
"type": [
|
|
294
|
+
"string",
|
|
295
|
+
"null"
|
|
296
|
+
]
|
|
297
|
+
},
|
|
298
|
+
"confidence": {
|
|
299
|
+
"$ref": "#/$defs/confidence"
|
|
300
|
+
},
|
|
301
|
+
"source": {
|
|
302
|
+
"$ref": "#/$defs/source"
|
|
303
|
+
}
|
|
304
|
+
},
|
|
305
|
+
"additionalProperties": false
|
|
306
|
+
}
|
|
307
|
+
},
|
|
250
308
|
"obligations": {
|
|
251
309
|
"type": "array",
|
|
252
310
|
"items": {
|