extract-cli 0.1.5__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_cli-0.1.8/AGENTS.md +87 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/CHANGELOG.md +66 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/PKG-INFO +62 -25
- {extract_cli-0.1.5 → extract_cli-0.1.8}/README.md +60 -23
- {extract_cli-0.1.5 → extract_cli-0.1.8}/docs/INTEROP.md +1 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/extract_cli.py +187 -14
- extract_cli-0.1.8/llms.txt +79 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/pyproject.toml +7 -2
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/_fixtures_build.py +36 -2
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/_make_goldens.py +2 -2
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/conftest.py +1 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/employment_docx.docx.expected.json +1 -1
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/heading_docx.docx.expected.json +1 -1
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/nda_h2.md.expected.json +1 -1
- extract_cli-0.1.8/tests/fixtures/numbered_docx.docx +0 -0
- extract_cli-0.1.8/tests/fixtures/numbered_docx.docx.expected.json +142 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/services_bold.txt.expected.json +1 -1
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/services_html.html.expected.json +1 -1
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/test_clause_map.py +23 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/test_cli.py +63 -1
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/test_misc.py +12 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/.gitignore +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/ARCHITECTURE.md +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/LICENSE +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/Makefile +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/config/llm.json.example +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/docs/spec/extract-output.schema.json +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/scripts/release.py +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/heading_docx.docx +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/test_deterministic.py +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/test_llm.py +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/test_property.py +0 -0
- {extract_cli-0.1.5 → extract_cli-0.1.8}/tests/test_schema_conformance.py +0 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Agents
|
|
2
|
+
|
|
3
|
+
Drive `extract-cli` from an LLM agent or non-interactive client. Same agent
|
|
4
|
+
contract as the rest of the contract-ops suite: a stable machine-readable
|
|
5
|
+
catalog, JSON on stdout, humans on stderr, and a small documented exit-code set.
|
|
6
|
+
|
|
7
|
+
`extract-cli` is the suite's **open-loop front door**: hand it any contract
|
|
8
|
+
(`.md` / `.txt` / `.html` / `.docx` / `.pdf`, yours or a counterparty's) and it
|
|
9
|
+
returns structured JSON the rest of the pipeline can consume. Every field
|
|
10
|
+
carries a `confidence` and a `source` — **verify, don't trust**.
|
|
11
|
+
|
|
12
|
+
## Output contract
|
|
13
|
+
|
|
14
|
+
- **Success**: a single JSON object to **stdout**, exit `0`. This is the machine
|
|
15
|
+
payload; it's the default (no `--json` needed, though `--json` forces it).
|
|
16
|
+
- Every extracted scalar is the envelope `{value, confidence, source}`;
|
|
17
|
+
"not found" is the canonical `{value: null, confidence: 0.0, source: "none"}`.
|
|
18
|
+
Lists (`parties`, `clauses`, `defined_terms`) carry per-item
|
|
19
|
+
`confidence`/`source`. `source ∈ {deterministic, llm, none}`.
|
|
20
|
+
- `_meta` records `extractor_version`, `tiers_used`, and `llm_used`.
|
|
21
|
+
- The output shape is locked by a JSON Schema —
|
|
22
|
+
[`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json),
|
|
23
|
+
also printed by `extract schema`. Validate against it instead of trusting
|
|
24
|
+
field shapes by convention. (Note: the `--no-confidence` projection is a
|
|
25
|
+
reduced convenience view, **not** governed by the schema.)
|
|
26
|
+
- **stderr** is for humans only: `--why` rationale, warnings, and errors.
|
|
27
|
+
stdout stays clean JSON even under `--why`.
|
|
28
|
+
- **Failure**: a one-line `error: <message>` on **stderr**, non-zero exit.
|
|
29
|
+
The error shape is a flat string (the suite is not uniform on error-object
|
|
30
|
+
shape) — **branch on the exit code, never on the human-readable message.**
|
|
31
|
+
|
|
32
|
+
## Exit codes
|
|
33
|
+
|
|
34
|
+
| Code | Meaning |
|
|
35
|
+
|------|---------|
|
|
36
|
+
| `0` | Success. |
|
|
37
|
+
| `1` | Low-signal document — no high-signal fields (parties/clauses/dates) could be extracted; e.g. a scanned/image-only or empty file. A **finding**, not a crash: valid JSON is still emitted on stdout. |
|
|
38
|
+
| `2` | Bad usage / user-actionable error (unreadable path, bad flag value, unsupported completion shell). |
|
|
39
|
+
|
|
40
|
+
## Discovery
|
|
41
|
+
|
|
42
|
+
Never hardcode command or flag names — call the catalog at startup:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
extract --catalog json # {name, bin, version, description, commands[], exitCodes}
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
`--catalog json` is the suite-wide discovery contract (parallel to
|
|
49
|
+
`nda-review-cli --catalog json`, `docx2pdf --catalog json`,
|
|
50
|
+
`sign --catalog json`). It is **complete, accurate, and stable across minor
|
|
51
|
+
versions** — a test asserts it never drifts from the real parser.
|
|
52
|
+
|
|
53
|
+
Tool-specific discovery extras:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
extract schema # the output JSON Schema (the cross-CLI data contract)
|
|
57
|
+
extract fields # extractable fields and the tier that produces each
|
|
58
|
+
extract fields --json # ...as JSON
|
|
59
|
+
extract demo # run on a bundled fixture (zero-config first run)
|
|
60
|
+
extract --version
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Failure → recovery
|
|
64
|
+
|
|
65
|
+
| Symptom | Diagnose | Recover |
|
|
66
|
+
|---|---|---|
|
|
67
|
+
| Exit `1`, warning "no high-signal fields" | The document is likely scanned/image-only or has no recognizable structure. JSON is still emitted. | OCR the source first, or feed a text/`.docx`/`.md` version. The empty-but-valid JSON is safe to pass downstream. |
|
|
68
|
+
| Exit `2`, `error: ...` | `extract --catalog json` (or `extract <cmd> --help`) for the real surface. | Fix the path/flag and retry. |
|
|
69
|
+
| `clauses: []` on a real contract | The `.docx` likely auto-numbers via Word's numbering with no heading style (its numbers live only in `numbering.xml`), so the deterministic cascade sees no headings. | Re-run with `--llm` (opt-in): when no clauses are detected, the LLM is asked for section headings, normalized through the same canonical vocabulary and emitted with `tier: "llm"`, `source: "llm"`, and a modest confidence. Requires `~/.config/contract-ops/llm.json`. |
|
|
70
|
+
| Low-fidelity `.docx`/`.pdf` text | The stdlib best-effort reader ran (no extras installed). | `pip install "extract-cli[docx]"` and/or `"extract-cli[pdf]"` for higher fidelity. The core always works without them. |
|
|
71
|
+
| `--llm` only printed a warning | No LLM config found. | Copy [`config/llm.json.example`](config/llm.json.example) to `~/.config/contract-ops/llm.json`. Without it, deterministic output is still returned in full. |
|
|
72
|
+
|
|
73
|
+
## Recommended usage
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Inspect any contract's structure, one tool for five formats.
|
|
77
|
+
extract counterparty.docx | jq '{parties: [.parties[].name],
|
|
78
|
+
governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
|
|
79
|
+
|
|
80
|
+
# Gate a workflow on extraction confidence (non-zero exit if any clause is shaky).
|
|
81
|
+
extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)' && echo ok
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
The integration contract is the **output schema** + the **shared canonical
|
|
85
|
+
clause vocabulary** (`canonical_title` values match what `template-vault-cli`
|
|
86
|
+
detects and `nda-review-cli` keys policy on) — not per-tool flags. See
|
|
87
|
+
[`docs/INTEROP.md`](docs/INTEROP.md).
|
|
@@ -6,6 +6,69 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.8] - 2026-05-22
|
|
10
|
+
|
|
11
|
+
Clause-detection breadth, driven by a 58-document real-corpus survey.
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
- **Auto-numbered DOCX clauses.** The DOCX reader now treats `w:numPr` list
|
|
15
|
+
paragraphs (no heading style; number generated from `numbering.xml`) as
|
|
16
|
+
clause-heading candidates, run through the same run-in/heading-likeness filter
|
|
17
|
+
as heading styles. Real agreements that number clauses this way (data
|
|
18
|
+
processing / design-partner agreements) get a clause map where they previously
|
|
19
|
+
got none; deep numbered body sentences are still excluded. New `numbered_docx`
|
|
20
|
+
fixture + tests.
|
|
21
|
+
- **Two-line `ARTICLE N` headings.** A bare `ARTICLE N` / `SECTION N` line whose
|
|
22
|
+
title sits on the next line (common in formal agreements) is detected as a
|
|
23
|
+
pair — recovering, e.g., a real SEC services agreement's clause map (0 → 8).
|
|
24
|
+
Fires only with >= 2 well-formed pairs; reported under the `numbered` tier (no
|
|
25
|
+
schema change).
|
|
26
|
+
- **Expanded canonical clause vocabulary** from the corpus survey: new canonical
|
|
27
|
+
clauses `Exclusions`, `Remedies`, `Restrictions`, `Taxes`,
|
|
28
|
+
`Reservation of Rights`, `Third-Party Beneficiaries`, `Feedback`,
|
|
29
|
+
`Miscellaneous`, plus aliases for `Compliance with Laws` (anti-bribery, export
|
|
30
|
+
controls) and `Data Protection` (customer data/content). ~155 more clauses map
|
|
31
|
+
across the corpus, with no observed over-matching.
|
|
32
|
+
- **`CLAUDE.md`** — codebase development notes (complements AGENTS.md).
|
|
33
|
+
|
|
34
|
+
No output-schema change.
|
|
35
|
+
|
|
36
|
+
## [0.1.7] - 2026-05-22
|
|
37
|
+
|
|
38
|
+
### Added
|
|
39
|
+
- **`extract --catalog json` — the suite's shared discovery contract.** Emits
|
|
40
|
+
`{name, bin, version, description, commands[], exitCodes}` (mirroring
|
|
41
|
+
`nda-review-cli --catalog json` / `docx2pdf --catalog json` /
|
|
42
|
+
`sign --catalog json`) so agents can learn every command and flag at startup
|
|
43
|
+
instead of hardcoding them. A test asserts the catalog never drifts from the
|
|
44
|
+
real argparse parser. Also added to the bash/zsh completion flag lists.
|
|
45
|
+
- **`AGENTS.md`** — the agent contract in the suite's canonical section order
|
|
46
|
+
(output contract / exit codes / discovery / failure → recovery).
|
|
47
|
+
- **`llms.txt`** — machine-readable tool summary at the repo root.
|
|
48
|
+
|
|
49
|
+
### Changed
|
|
50
|
+
- Packaging: added the suite-standard keywords (`contract-ops`, `agent-first`,
|
|
51
|
+
`legal-tech`); README now opens with `## Run this` / `## Where to go next`;
|
|
52
|
+
`--catalog json` documented in the README and `docs/INTEROP.md`. No schema or
|
|
53
|
+
extraction-logic change (`extractor_version` unchanged).
|
|
54
|
+
|
|
55
|
+
## [0.1.6] - 2026-05-21
|
|
56
|
+
|
|
57
|
+
### Docs
|
|
58
|
+
- **Rewrote the README composability section to verified, runnable examples.**
|
|
59
|
+
Testing extract-cli against the real sibling CLIs (`template-vault-cli`,
|
|
60
|
+
`nda-review-cli`) showed the previous pipes were aspirational — the siblings
|
|
61
|
+
expose no `--from-extract`/`--stdin` flag (`nda-review review` takes
|
|
62
|
+
`--file`/`--text`; `template-vault` reads its own vault). The integration
|
|
63
|
+
contract is the **output schema + the shared canonical clause vocabulary**,
|
|
64
|
+
glued by stdout JSON and standard tools (`jq`, `comm`): `extract`'s
|
|
65
|
+
`canonical_title` values are the same names template-vault detects and
|
|
66
|
+
nda-review keys policy on, so a foreign document's clauses line up with the
|
|
67
|
+
suite's with no bespoke adapter. New examples cover clause-coverage gap
|
|
68
|
+
analysis against a vault template and a combined extract+nda-review intake
|
|
69
|
+
report — all runnable today. (Also fixed a broken `jq input_filename` in the
|
|
70
|
+
folder-triage example.) No code or schema change.
|
|
71
|
+
|
|
9
72
|
## [0.1.5] - 2026-05-21
|
|
10
73
|
|
|
11
74
|
### Added
|
|
@@ -181,6 +244,9 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
181
244
|
intentionally *not* governed by the output schema (the schema describes the
|
|
182
245
|
full default output).
|
|
183
246
|
|
|
247
|
+
[0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
|
|
248
|
+
[0.1.7]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.7
|
|
249
|
+
[0.1.6]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.6
|
|
184
250
|
[0.1.5]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.5
|
|
185
251
|
[0.1.4]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.4
|
|
186
252
|
[0.1.3]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.3
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.5
|
|
3
|
+
Version: 0.1.8
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -8,7 +8,7 @@ Project-URL: Suite interop, https://github.com/DrBaher/extract-cli/blob/main/doc
|
|
|
8
8
|
Author-email: DrBaher <Drbaher@gmail.com>
|
|
9
9
|
License: MIT
|
|
10
10
|
License-File: LICENSE
|
|
11
|
-
Keywords: clause,cli,contract,extraction,json,legal,nda
|
|
11
|
+
Keywords: agent-first,clause,cli,contract,contract-ops,extraction,json,legal,legal-tech,nda
|
|
12
12
|
Classifier: Development Status :: 4 - Beta
|
|
13
13
|
Classifier: Environment :: Console
|
|
14
14
|
Classifier: Intended Audience :: Developers
|
|
@@ -61,6 +61,30 @@ ingest (extract) → review → diff → convert → sign
|
|
|
61
61
|
^you are here
|
|
62
62
|
```
|
|
63
63
|
|
|
64
|
+
## Run this
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pipx run extract-cli demo # zero-config: extract a bundled NDA → structured JSON
|
|
68
|
+
# or, installed: pip install extract-cli && extract demo
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
That prints the full output contract — parties, dates, term, governing law, and
|
|
72
|
+
a clause map normalized onto the suite's canonical vocabulary — for a bundled
|
|
73
|
+
fixture, with no setup and no network. Point it at your own file with
|
|
74
|
+
`extract path/to/contract.docx`.
|
|
75
|
+
|
|
76
|
+
## Where to go next
|
|
77
|
+
|
|
78
|
+
- **New here?** Keep reading — [What it does](#what-it-does) and
|
|
79
|
+
[The two extraction tiers](#the-two-extraction-tiers).
|
|
80
|
+
- **Driving it from an agent?** See [`AGENTS.md`](AGENTS.md) and call
|
|
81
|
+
`extract --catalog json` at startup to discover commands/flags. The output
|
|
82
|
+
shape is locked by [`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json).
|
|
83
|
+
- **Wiring it into the pipeline?** See [`docs/INTEROP.md`](docs/INTEROP.md) — the
|
|
84
|
+
contract is the output schema + the shared clause vocabulary.
|
|
85
|
+
- **Contributing / building a sibling CLI?** [`CONTRIBUTING.md`](CONTRIBUTING.md)
|
|
86
|
+
and [ARCHITECTURE.md](ARCHITECTURE.md).
|
|
87
|
+
|
|
64
88
|
## What it does
|
|
65
89
|
|
|
66
90
|
Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
|
|
@@ -115,6 +139,7 @@ for them.
|
|
|
115
139
|
|
|
116
140
|
```bash
|
|
117
141
|
extract <path> # parse a document → structured JSON on stdout (default)
|
|
142
|
+
extract --catalog json # machine-readable catalog of commands/flags (agents call at startup)
|
|
118
143
|
extract schema # print the output JSON Schema (the cross-CLI contract)
|
|
119
144
|
extract fields # list extractable fields and their tier
|
|
120
145
|
extract demo # run on a bundled fixture and show the narrative
|
|
@@ -125,6 +150,7 @@ extract completion bash # emit a shell-completion script (bash|zsh)
|
|
|
125
150
|
|
|
126
151
|
| Flag | Meaning |
|
|
127
152
|
|---|---|
|
|
153
|
+
| `--catalog json` | Print the machine-readable command/flag catalog and exit (the suite discovery contract; agents call this at startup) |
|
|
128
154
|
| `--llm` | Opt-in LLM enrichment of fuzzy fields (off by default) |
|
|
129
155
|
| `--fields a,b,c` | Emit only a subset of top-level fields (e.g. `parties,clauses`) |
|
|
130
156
|
| `--format json\|table` | Output format (default `json`) |
|
|
@@ -171,37 +197,48 @@ extract counterparty.pdf | jq '.clauses[] | {canonical_title, detected_title, ma
|
|
|
171
197
|
|
|
172
198
|
## Composability — piping into the rest of the suite
|
|
173
199
|
|
|
174
|
-
`extract-cli` is built to be the first stage of a Unix pipe.
|
|
175
|
-
|
|
200
|
+
`extract-cli` is built to be the first stage of a Unix pipe. The glue is its
|
|
201
|
+
**stdout JSON + standard tools** (`jq`, `comm`) and the **shared clause
|
|
202
|
+
vocabulary** — `extract`'s `canonical_title` values are the same names
|
|
203
|
+
`template-vault-cli` detects and `nda-review-cli` keys policy on, so a foreign
|
|
204
|
+
document's clauses line up with the suite's with no bespoke adapter. Every
|
|
205
|
+
example below is runnable today (verified against the real sibling CLIs).
|
|
176
206
|
|
|
177
207
|
```bash
|
|
178
|
-
# 1)
|
|
179
|
-
extract
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
#
|
|
183
|
-
extract
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
#
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
#
|
|
192
|
-
|
|
208
|
+
# 1) Inspect any contract's structure (.md/.txt/.html/.docx/.pdf, one tool).
|
|
209
|
+
extract counterparty.docx | jq '{parties: [.parties[].name],
|
|
210
|
+
governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
|
|
211
|
+
|
|
212
|
+
# 2) Clause-coverage gap vs your canonical template in template-vault-cli.
|
|
213
|
+
# extract normalizes the counterparty's *foreign* headings onto the same
|
|
214
|
+
# clause vocabulary template-vault detects, so a plain `comm` diffs them.
|
|
215
|
+
template-vault info nda/mutual-standard --json | jq -r '.clauses[].title' | sort > ours.txt
|
|
216
|
+
extract counterparty_nda.docx | jq -r '.clauses[].canonical_title' | sort -u > theirs.txt
|
|
217
|
+
comm -23 ours.txt theirs.txt # clauses in OUR standard that THEY are missing
|
|
218
|
+
comm -13 ours.txt theirs.txt # clauses THEY added that we don't have
|
|
219
|
+
|
|
220
|
+
# 3) Intake: extract for structure, nda-review-cli for a policy verdict on the
|
|
221
|
+
# same foreign doc; merge both views with jq.
|
|
222
|
+
extract counterparty_nda.docx > extract.json
|
|
223
|
+
nda-review review --file counterparty_nda.docx --playbook output/nda_playbook.json \
|
|
224
|
+
--out-json review.json
|
|
225
|
+
jq -n --slurpfile e extract.json --slurpfile r review.json \
|
|
226
|
+
'{parties: [$e[0].parties[].name], governing_law: $e[0].governing_law.value,
|
|
227
|
+
clauses: ($e[0].clauses | length), decision: $r[0].decision, risk: $r[0].risk_score}'
|
|
228
|
+
|
|
229
|
+
# 4) Triage a folder of inbound contracts: governing law + parties per file.
|
|
230
|
+
for f in inbox/*; do
|
|
193
231
|
extract "$f" --fields parties,governing_law --no-confidence \
|
|
194
|
-
| jq -c '{file:
|
|
232
|
+
| jq -c --arg f "$f" '{file: $f, gov: .governing_law, parties: [.parties[].name]}'
|
|
195
233
|
done
|
|
196
234
|
|
|
197
|
-
# 5) Gate a workflow on extraction confidence.
|
|
235
|
+
# 5) Gate a workflow on extraction confidence (non-zero exit if any clause is shaky).
|
|
198
236
|
extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)' && echo "ok to review"
|
|
199
237
|
```
|
|
200
238
|
|
|
201
|
-
> The
|
|
202
|
-
>
|
|
203
|
-
>
|
|
204
|
-
> versioning commitment on the schema.
|
|
239
|
+
> The integration contract is the **output schema** and the **canonical clause
|
|
240
|
+
> vocabulary**, not per-tool flags. See [`docs/INTEROP.md`](docs/INTEROP.md) for
|
|
241
|
+
> the shared conventions and the schema's versioning commitment.
|
|
205
242
|
|
|
206
243
|
## LLM configuration (opt-in)
|
|
207
244
|
|
|
@@ -23,6 +23,30 @@ ingest (extract) → review → diff → convert → sign
|
|
|
23
23
|
^you are here
|
|
24
24
|
```
|
|
25
25
|
|
|
26
|
+
## Run this
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pipx run extract-cli demo # zero-config: extract a bundled NDA → structured JSON
|
|
30
|
+
# or, installed: pip install extract-cli && extract demo
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
That prints the full output contract — parties, dates, term, governing law, and
|
|
34
|
+
a clause map normalized onto the suite's canonical vocabulary — for a bundled
|
|
35
|
+
fixture, with no setup and no network. Point it at your own file with
|
|
36
|
+
`extract path/to/contract.docx`.
|
|
37
|
+
|
|
38
|
+
## Where to go next
|
|
39
|
+
|
|
40
|
+
- **New here?** Keep reading — [What it does](#what-it-does) and
|
|
41
|
+
[The two extraction tiers](#the-two-extraction-tiers).
|
|
42
|
+
- **Driving it from an agent?** See [`AGENTS.md`](AGENTS.md) and call
|
|
43
|
+
`extract --catalog json` at startup to discover commands/flags. The output
|
|
44
|
+
shape is locked by [`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json).
|
|
45
|
+
- **Wiring it into the pipeline?** See [`docs/INTEROP.md`](docs/INTEROP.md) — the
|
|
46
|
+
contract is the output schema + the shared clause vocabulary.
|
|
47
|
+
- **Contributing / building a sibling CLI?** [`CONTRIBUTING.md`](CONTRIBUTING.md)
|
|
48
|
+
and [ARCHITECTURE.md](ARCHITECTURE.md).
|
|
49
|
+
|
|
26
50
|
## What it does
|
|
27
51
|
|
|
28
52
|
Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
|
|
@@ -77,6 +101,7 @@ for them.
|
|
|
77
101
|
|
|
78
102
|
```bash
|
|
79
103
|
extract <path> # parse a document → structured JSON on stdout (default)
|
|
104
|
+
extract --catalog json # machine-readable catalog of commands/flags (agents call at startup)
|
|
80
105
|
extract schema # print the output JSON Schema (the cross-CLI contract)
|
|
81
106
|
extract fields # list extractable fields and their tier
|
|
82
107
|
extract demo # run on a bundled fixture and show the narrative
|
|
@@ -87,6 +112,7 @@ extract completion bash # emit a shell-completion script (bash|zsh)
|
|
|
87
112
|
|
|
88
113
|
| Flag | Meaning |
|
|
89
114
|
|---|---|
|
|
115
|
+
| `--catalog json` | Print the machine-readable command/flag catalog and exit (the suite discovery contract; agents call this at startup) |
|
|
90
116
|
| `--llm` | Opt-in LLM enrichment of fuzzy fields (off by default) |
|
|
91
117
|
| `--fields a,b,c` | Emit only a subset of top-level fields (e.g. `parties,clauses`) |
|
|
92
118
|
| `--format json\|table` | Output format (default `json`) |
|
|
@@ -133,37 +159,48 @@ extract counterparty.pdf | jq '.clauses[] | {canonical_title, detected_title, ma
|
|
|
133
159
|
|
|
134
160
|
## Composability — piping into the rest of the suite
|
|
135
161
|
|
|
136
|
-
`extract-cli` is built to be the first stage of a Unix pipe.
|
|
137
|
-
|
|
162
|
+
`extract-cli` is built to be the first stage of a Unix pipe. The glue is its
|
|
163
|
+
**stdout JSON + standard tools** (`jq`, `comm`) and the **shared clause
|
|
164
|
+
vocabulary** — `extract`'s `canonical_title` values are the same names
|
|
165
|
+
`template-vault-cli` detects and `nda-review-cli` keys policy on, so a foreign
|
|
166
|
+
document's clauses line up with the suite's with no bespoke adapter. Every
|
|
167
|
+
example below is runnable today (verified against the real sibling CLIs).
|
|
138
168
|
|
|
139
169
|
```bash
|
|
140
|
-
# 1)
|
|
141
|
-
extract
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
#
|
|
145
|
-
extract
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
#
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
#
|
|
154
|
-
|
|
170
|
+
# 1) Inspect any contract's structure (.md/.txt/.html/.docx/.pdf, one tool).
|
|
171
|
+
extract counterparty.docx | jq '{parties: [.parties[].name],
|
|
172
|
+
governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
|
|
173
|
+
|
|
174
|
+
# 2) Clause-coverage gap vs your canonical template in template-vault-cli.
|
|
175
|
+
# extract normalizes the counterparty's *foreign* headings onto the same
|
|
176
|
+
# clause vocabulary template-vault detects, so a plain `comm` diffs them.
|
|
177
|
+
template-vault info nda/mutual-standard --json | jq -r '.clauses[].title' | sort > ours.txt
|
|
178
|
+
extract counterparty_nda.docx | jq -r '.clauses[].canonical_title' | sort -u > theirs.txt
|
|
179
|
+
comm -23 ours.txt theirs.txt # clauses in OUR standard that THEY are missing
|
|
180
|
+
comm -13 ours.txt theirs.txt # clauses THEY added that we don't have
|
|
181
|
+
|
|
182
|
+
# 3) Intake: extract for structure, nda-review-cli for a policy verdict on the
|
|
183
|
+
# same foreign doc; merge both views with jq.
|
|
184
|
+
extract counterparty_nda.docx > extract.json
|
|
185
|
+
nda-review review --file counterparty_nda.docx --playbook output/nda_playbook.json \
|
|
186
|
+
--out-json review.json
|
|
187
|
+
jq -n --slurpfile e extract.json --slurpfile r review.json \
|
|
188
|
+
'{parties: [$e[0].parties[].name], governing_law: $e[0].governing_law.value,
|
|
189
|
+
clauses: ($e[0].clauses | length), decision: $r[0].decision, risk: $r[0].risk_score}'
|
|
190
|
+
|
|
191
|
+
# 4) Triage a folder of inbound contracts: governing law + parties per file.
|
|
192
|
+
for f in inbox/*; do
|
|
155
193
|
extract "$f" --fields parties,governing_law --no-confidence \
|
|
156
|
-
| jq -c '{file:
|
|
194
|
+
| jq -c --arg f "$f" '{file: $f, gov: .governing_law, parties: [.parties[].name]}'
|
|
157
195
|
done
|
|
158
196
|
|
|
159
|
-
# 5) Gate a workflow on extraction confidence.
|
|
197
|
+
# 5) Gate a workflow on extraction confidence (non-zero exit if any clause is shaky).
|
|
160
198
|
extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)' && echo "ok to review"
|
|
161
199
|
```
|
|
162
200
|
|
|
163
|
-
> The
|
|
164
|
-
>
|
|
165
|
-
>
|
|
166
|
-
> versioning commitment on the schema.
|
|
201
|
+
> The integration contract is the **output schema** and the **canonical clause
|
|
202
|
+
> vocabulary**, not per-tool flags. See [`docs/INTEROP.md`](docs/INTEROP.md) for
|
|
203
|
+
> the shared conventions and the schema's versioning commitment.
|
|
167
204
|
|
|
168
205
|
## LLM configuration (opt-in)
|
|
169
206
|
|
|
@@ -118,6 +118,7 @@ only stdlib `urllib`, so there is no runtime dependency.
|
|
|
118
118
|
| Concern | Convention |
|
|
119
119
|
|---|---|
|
|
120
120
|
| Primary result | **stdout** (JSON payload, default) |
|
|
121
|
+
| Discovery | `extract --catalog json` (commands/flags, the suite contract) + `extract schema` / `extract fields --json` |
|
|
121
122
|
| `--why`, warnings, errors | **stderr** |
|
|
122
123
|
| `--why` envelope | plain-text `[why] <header>` block (as in template-vault-cli / draft-cli) |
|
|
123
124
|
| Quiet | `-q` / `--silent` / `--quiet` aliases |
|