extract-cli 0.1.6__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_cli-0.1.8/AGENTS.md +87 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/CHANGELOG.md +48 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/PKG-INFO +28 -2
- {extract_cli-0.1.6 → extract_cli-0.1.8}/README.md +26 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/docs/INTEROP.md +1 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/extract_cli.py +187 -14
- extract_cli-0.1.8/llms.txt +79 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/pyproject.toml +7 -2
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/_fixtures_build.py +36 -2
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/_make_goldens.py +2 -2
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/conftest.py +1 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/employment_docx.docx.expected.json +1 -1
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/heading_docx.docx.expected.json +1 -1
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/nda_h2.md.expected.json +1 -1
- extract_cli-0.1.8/tests/fixtures/numbered_docx.docx +0 -0
- extract_cli-0.1.8/tests/fixtures/numbered_docx.docx.expected.json +142 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/services_bold.txt.expected.json +1 -1
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/services_html.html.expected.json +1 -1
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_clause_map.py +23 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_cli.py +63 -1
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_misc.py +12 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/.gitignore +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/ARCHITECTURE.md +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/LICENSE +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/Makefile +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/config/llm.json.example +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/docs/spec/extract-output.schema.json +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/scripts/release.py +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/heading_docx.docx +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_deterministic.py +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_llm.py +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_property.py +0 -0
- {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_schema_conformance.py +0 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Agents
|
|
2
|
+
|
|
3
|
+
Drive `extract-cli` from an LLM agent or non-interactive client. Same agent
|
|
4
|
+
contract as the rest of the contract-ops suite: a stable machine-readable
|
|
5
|
+
catalog, JSON on stdout, humans on stderr, and a small documented exit-code set.
|
|
6
|
+
|
|
7
|
+
`extract-cli` is the suite's **open-loop front door**: hand it any contract
|
|
8
|
+
(`.md` / `.txt` / `.html` / `.docx` / `.pdf`, yours or a counterparty's) and it
|
|
9
|
+
returns structured JSON the rest of the pipeline can consume. Every field
|
|
10
|
+
carries a `confidence` and a `source` — **verify, don't trust**.
|
|
11
|
+
|
|
12
|
+
## Output contract
|
|
13
|
+
|
|
14
|
+
- **Success**: a single JSON object to **stdout**, exit `0`. This is the machine
|
|
15
|
+
payload; it's the default (no `--json` needed, though `--json` forces it).
|
|
16
|
+
- Every extracted scalar is the envelope `{value, confidence, source}`;
|
|
17
|
+
"not found" is the canonical `{value: null, confidence: 0.0, source: "none"}`.
|
|
18
|
+
Lists (`parties`, `clauses`, `defined_terms`) carry per-item
|
|
19
|
+
`confidence`/`source`. `source ∈ {deterministic, llm, none}`.
|
|
20
|
+
- `_meta` records `extractor_version`, `tiers_used`, and `llm_used`.
|
|
21
|
+
- The output shape is locked by a JSON Schema —
|
|
22
|
+
[`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json),
|
|
23
|
+
also printed by `extract schema`. Validate against it instead of trusting
|
|
24
|
+
field shapes by convention. (Note: the `--no-confidence` projection is a
|
|
25
|
+
reduced convenience view, **not** governed by the schema.)
|
|
26
|
+
- **stderr** is for humans only: `--why` rationale, warnings, and errors.
|
|
27
|
+
stdout stays clean JSON even under `--why`.
|
|
28
|
+
- **Failure**: a one-line `error: <message>` on **stderr**, non-zero exit.
|
|
29
|
+
The error shape is a flat string (the suite is not uniform on error-object
|
|
30
|
+
shape) — **branch on the exit code, never on the human-readable message.**
|
|
31
|
+
|
|
32
|
+
## Exit codes
|
|
33
|
+
|
|
34
|
+
| Code | Meaning |
|
|
35
|
+
|------|---------|
|
|
36
|
+
| `0` | Success. |
|
|
37
|
+
| `1` | Low-signal document — no high-signal fields (parties/clauses/dates) could be extracted; e.g. a scanned/image-only or empty file. A **finding**, not a crash: valid JSON is still emitted on stdout. |
|
|
38
|
+
| `2` | Bad usage / user-actionable error (unreadable path, bad flag value, unsupported completion shell). |
|
|
39
|
+
|
|
40
|
+
## Discovery
|
|
41
|
+
|
|
42
|
+
Never hardcode command or flag names — call the catalog at startup:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
extract --catalog json # {name, bin, version, description, commands[], exitCodes}
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
`--catalog json` is the suite-wide discovery contract (parallel to
|
|
49
|
+
`nda-review-cli --catalog json`, `docx2pdf --catalog json`,
|
|
50
|
+
`sign --catalog json`). It is **complete, accurate, and stable across minor
|
|
51
|
+
versions** — a test asserts it never drifts from the real parser.
|
|
52
|
+
|
|
53
|
+
Tool-specific discovery extras:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
extract schema # the output JSON Schema (the cross-CLI data contract)
|
|
57
|
+
extract fields # extractable fields and the tier that produces each
|
|
58
|
+
extract fields --json # ...as JSON
|
|
59
|
+
extract demo # run on a bundled fixture (zero-config first run)
|
|
60
|
+
extract --version
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Failure → recovery
|
|
64
|
+
|
|
65
|
+
| Symptom | Diagnose | Recover |
|
|
66
|
+
|---|---|---|
|
|
67
|
+
| Exit `1`, warning "no high-signal fields" | The document is likely scanned/image-only or has no recognizable structure. JSON is still emitted. | OCR the source first, or feed a text/`.docx`/`.md` version. The empty-but-valid JSON is safe to pass downstream. |
|
|
68
|
+
| Exit `2`, `error: ...` | `extract --catalog json` (or `extract <cmd> --help`) for the real surface. | Fix the path/flag and retry. |
|
|
69
|
+
| `clauses: []` on a real contract | The `.docx` likely auto-numbers via Word's numbering with no heading style (its numbers live only in `numbering.xml`), so the deterministic cascade sees no headings. | Re-run with `--llm` (opt-in): when no clauses are detected, the LLM is asked for section headings, normalized through the same canonical vocabulary and emitted with `tier: "llm"`, `source: "llm"`, and a modest confidence. Requires `~/.config/contract-ops/llm.json`. |
|
|
70
|
+
| Low-fidelity `.docx`/`.pdf` text | The stdlib best-effort reader ran (no extras installed). | `pip install "extract-cli[docx]"` and/or `"extract-cli[pdf]"` for higher fidelity. The core always works without them. |
|
|
71
|
+
| `--llm` only printed a warning | No LLM config found. | Copy [`config/llm.json.example`](config/llm.json.example) to `~/.config/contract-ops/llm.json`. Without it, deterministic output is still returned in full. |
|
|
72
|
+
|
|
73
|
+
## Recommended usage
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Inspect any contract's structure, one tool for five formats.
|
|
77
|
+
extract counterparty.docx | jq '{parties: [.parties[].name],
|
|
78
|
+
governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
|
|
79
|
+
|
|
80
|
+
# Gate a workflow on extraction confidence (non-zero exit if any clause is shaky).
|
|
81
|
+
extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)' && echo ok
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
The integration contract is the **output schema** + the **shared canonical
|
|
85
|
+
clause vocabulary** (`canonical_title` values match what `template-vault-cli`
|
|
86
|
+
detects and `nda-review-cli` keys policy on) — not per-tool flags. See
|
|
87
|
+
[`docs/INTEROP.md`](docs/INTEROP.md).
|
|
@@ -6,6 +6,52 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.8] - 2026-05-22
|
|
10
|
+
|
|
11
|
+
Clause-detection breadth, driven by a 58-document real-corpus survey.
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
- **Auto-numbered DOCX clauses.** The DOCX reader now treats `w:numPr` list
|
|
15
|
+
paragraphs (no heading style; number generated from `numbering.xml`) as
|
|
16
|
+
clause-heading candidates, run through the same run-in/heading-likeness filter
|
|
17
|
+
as heading styles. Real agreements that number clauses this way (data
|
|
18
|
+
processing / design-partner agreements) get a clause map where they previously
|
|
19
|
+
got none; deep numbered body sentences are still excluded. New `numbered_docx`
|
|
20
|
+
fixture + tests.
|
|
21
|
+
- **Two-line `ARTICLE N` headings.** A bare `ARTICLE N` / `SECTION N` line whose
|
|
22
|
+
title sits on the next line (common in formal agreements) is detected as a
|
|
23
|
+
pair — recovering, e.g., a real SEC services agreement's clause map (0 → 8).
|
|
24
|
+
Fires only with >= 2 well-formed pairs; reported under the `numbered` tier (no
|
|
25
|
+
schema change).
|
|
26
|
+
- **Expanded canonical clause vocabulary** from the corpus survey: new canonical
|
|
27
|
+
clauses `Exclusions`, `Remedies`, `Restrictions`, `Taxes`,
|
|
28
|
+
`Reservation of Rights`, `Third-Party Beneficiaries`, `Feedback`,
|
|
29
|
+
`Miscellaneous`, plus aliases for `Compliance with Laws` (anti-bribery, export
|
|
30
|
+
controls) and `Data Protection` (customer data/content). ~155 more clauses map
|
|
31
|
+
across the corpus, with no observed over-matching.
|
|
32
|
+
- **`CLAUDE.md`** — codebase development notes (complements AGENTS.md).
|
|
33
|
+
|
|
34
|
+
No output-schema change.
|
|
35
|
+
|
|
36
|
+
## [0.1.7] - 2026-05-22
|
|
37
|
+
|
|
38
|
+
### Added
|
|
39
|
+
- **`extract --catalog json` — the suite's shared discovery contract.** Emits
|
|
40
|
+
`{name, bin, version, description, commands[], exitCodes}` (mirroring
|
|
41
|
+
`nda-review-cli --catalog json` / `docx2pdf --catalog json` /
|
|
42
|
+
`sign --catalog json`) so agents can learn every command and flag at startup
|
|
43
|
+
instead of hardcoding them. A test asserts the catalog never drifts from the
|
|
44
|
+
real argparse parser. Also added to the bash/zsh completion flag lists.
|
|
45
|
+
- **`AGENTS.md`** — the agent contract in the suite's canonical section order
|
|
46
|
+
(output contract / exit codes / discovery / failure → recovery).
|
|
47
|
+
- **`llms.txt`** — machine-readable tool summary at the repo root.
|
|
48
|
+
|
|
49
|
+
### Changed
|
|
50
|
+
- Packaging: added the suite-standard keywords (`contract-ops`, `agent-first`,
|
|
51
|
+
`legal-tech`); README now opens with `## Run this` / `## Where to go next`;
|
|
52
|
+
`--catalog json` documented in the README and `docs/INTEROP.md`. No schema or
|
|
53
|
+
extraction-logic change (`extractor_version` unchanged).
|
|
54
|
+
|
|
9
55
|
## [0.1.6] - 2026-05-21
|
|
10
56
|
|
|
11
57
|
### Docs
|
|
@@ -198,6 +244,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
198
244
|
intentionally *not* governed by the output schema (the schema describes the
|
|
199
245
|
full default output).
|
|
200
246
|
|
|
247
|
+
[0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
|
|
248
|
+
[0.1.7]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.7
|
|
201
249
|
[0.1.6]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.6
|
|
202
250
|
[0.1.5]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.5
|
|
203
251
|
[0.1.4]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.4
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.6
|
|
3
|
+
Version: 0.1.8
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -8,7 +8,7 @@ Project-URL: Suite interop, https://github.com/DrBaher/extract-cli/blob/main/doc
|
|
|
8
8
|
Author-email: DrBaher <Drbaher@gmail.com>
|
|
9
9
|
License: MIT
|
|
10
10
|
License-File: LICENSE
|
|
11
|
-
Keywords: clause,cli,contract,extraction,json,legal,nda
|
|
11
|
+
Keywords: agent-first,clause,cli,contract,contract-ops,extraction,json,legal,legal-tech,nda
|
|
12
12
|
Classifier: Development Status :: 4 - Beta
|
|
13
13
|
Classifier: Environment :: Console
|
|
14
14
|
Classifier: Intended Audience :: Developers
|
|
@@ -61,6 +61,30 @@ ingest (extract) → review → diff → convert → sign
|
|
|
61
61
|
^you are here
|
|
62
62
|
```
|
|
63
63
|
|
|
64
|
+
## Run this
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pipx run extract-cli demo # zero-config: extract a bundled NDA → structured JSON
|
|
68
|
+
# or, installed: pip install extract-cli && extract demo
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
That prints the full output contract — parties, dates, term, governing law, and
|
|
72
|
+
a clause map normalized onto the suite's canonical vocabulary — for a bundled
|
|
73
|
+
fixture, with no setup and no network. Point it at your own file with
|
|
74
|
+
`extract path/to/contract.docx`.
|
|
75
|
+
|
|
76
|
+
## Where to go next
|
|
77
|
+
|
|
78
|
+
- **New here?** Keep reading — [What it does](#what-it-does) and
|
|
79
|
+
[The two extraction tiers](#the-two-extraction-tiers).
|
|
80
|
+
- **Driving it from an agent?** See [`AGENTS.md`](AGENTS.md) and call
|
|
81
|
+
`extract --catalog json` at startup to discover commands/flags. The output
|
|
82
|
+
shape is locked by [`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json).
|
|
83
|
+
- **Wiring it into the pipeline?** See [`docs/INTEROP.md`](docs/INTEROP.md) — the
|
|
84
|
+
contract is the output schema + the shared clause vocabulary.
|
|
85
|
+
- **Contributing / building a sibling CLI?** [`CONTRIBUTING.md`](CONTRIBUTING.md)
|
|
86
|
+
and [ARCHITECTURE.md](ARCHITECTURE.md).
|
|
87
|
+
|
|
64
88
|
## What it does
|
|
65
89
|
|
|
66
90
|
Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
|
|
@@ -115,6 +139,7 @@ for them.
|
|
|
115
139
|
|
|
116
140
|
```bash
|
|
117
141
|
extract <path> # parse a document → structured JSON on stdout (default)
|
|
142
|
+
extract --catalog json # machine-readable catalog of commands/flags (agents call at startup)
|
|
118
143
|
extract schema # print the output JSON Schema (the cross-CLI contract)
|
|
119
144
|
extract fields # list extractable fields and their tier
|
|
120
145
|
extract demo # run on a bundled fixture and show the narrative
|
|
@@ -125,6 +150,7 @@ extract completion bash # emit a shell-completion script (bash|zsh)
|
|
|
125
150
|
|
|
126
151
|
| Flag | Meaning |
|
|
127
152
|
|---|---|
|
|
153
|
+
| `--catalog json` | Print the machine-readable command/flag catalog and exit (the suite discovery contract; agents call this at startup) |
|
|
128
154
|
| `--llm` | Opt-in LLM enrichment of fuzzy fields (off by default) |
|
|
129
155
|
| `--fields a,b,c` | Emit only a subset of top-level fields (e.g. `parties,clauses`) |
|
|
130
156
|
| `--format json\|table` | Output format (default `json`) |
|
|
@@ -23,6 +23,30 @@ ingest (extract) → review → diff → convert → sign
|
|
|
23
23
|
^you are here
|
|
24
24
|
```
|
|
25
25
|
|
|
26
|
+
## Run this
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pipx run extract-cli demo # zero-config: extract a bundled NDA → structured JSON
|
|
30
|
+
# or, installed: pip install extract-cli && extract demo
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
That prints the full output contract — parties, dates, term, governing law, and
|
|
34
|
+
a clause map normalized onto the suite's canonical vocabulary — for a bundled
|
|
35
|
+
fixture, with no setup and no network. Point it at your own file with
|
|
36
|
+
`extract path/to/contract.docx`.
|
|
37
|
+
|
|
38
|
+
## Where to go next
|
|
39
|
+
|
|
40
|
+
- **New here?** Keep reading — [What it does](#what-it-does) and
|
|
41
|
+
[The two extraction tiers](#the-two-extraction-tiers).
|
|
42
|
+
- **Driving it from an agent?** See [`AGENTS.md`](AGENTS.md) and call
|
|
43
|
+
`extract --catalog json` at startup to discover commands/flags. The output
|
|
44
|
+
shape is locked by [`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json).
|
|
45
|
+
- **Wiring it into the pipeline?** See [`docs/INTEROP.md`](docs/INTEROP.md) — the
|
|
46
|
+
contract is the output schema + the shared clause vocabulary.
|
|
47
|
+
- **Contributing / building a sibling CLI?** [`CONTRIBUTING.md`](CONTRIBUTING.md)
|
|
48
|
+
and [ARCHITECTURE.md](ARCHITECTURE.md).
|
|
49
|
+
|
|
26
50
|
## What it does
|
|
27
51
|
|
|
28
52
|
Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
|
|
@@ -77,6 +101,7 @@ for them.
|
|
|
77
101
|
|
|
78
102
|
```bash
|
|
79
103
|
extract <path> # parse a document → structured JSON on stdout (default)
|
|
104
|
+
extract --catalog json # machine-readable catalog of commands/flags (agents call at startup)
|
|
80
105
|
extract schema # print the output JSON Schema (the cross-CLI contract)
|
|
81
106
|
extract fields # list extractable fields and their tier
|
|
82
107
|
extract demo # run on a bundled fixture and show the narrative
|
|
@@ -87,6 +112,7 @@ extract completion bash # emit a shell-completion script (bash|zsh)
|
|
|
87
112
|
|
|
88
113
|
| Flag | Meaning |
|
|
89
114
|
|---|---|
|
|
115
|
+
| `--catalog json` | Print the machine-readable command/flag catalog and exit (the suite discovery contract; agents call this at startup) |
|
|
90
116
|
| `--llm` | Opt-in LLM enrichment of fuzzy fields (off by default) |
|
|
91
117
|
| `--fields a,b,c` | Emit only a subset of top-level fields (e.g. `parties,clauses`) |
|
|
92
118
|
| `--format json\|table` | Output format (default `json`) |
|
|
@@ -118,6 +118,7 @@ only stdlib `urllib`, so there is no runtime dependency.
|
|
|
118
118
|
| Concern | Convention |
|
|
119
119
|
|---|---|
|
|
120
120
|
| Primary result | **stdout** (JSON payload, default) |
|
|
121
|
+
| Discovery | `extract --catalog json` (commands/flags, the suite contract) + `extract schema` / `extract fields --json` |
|
|
121
122
|
| `--why`, warnings, errors | **stderr** |
|
|
122
123
|
| `--why` envelope | plain-text `[why] <header>` block (as in template-vault-cli / draft-cli) |
|
|
123
124
|
| Quiet | `-q` / `--silent` / `--quiet` aliases |
|
|
@@ -43,11 +43,11 @@ import urllib.request
|
|
|
43
43
|
from pathlib import Path
|
|
44
44
|
from typing import Any, Dict, List, Optional, Tuple
|
|
45
45
|
|
|
46
|
-
__version__ = "0.1.6"
|
|
46
|
+
__version__ = "0.1.8"
|
|
47
47
|
|
|
48
48
|
# Bumped independently of the package version when the *extraction logic*
|
|
49
49
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
50
|
-
EXTRACTOR_VERSION = "0.1.
|
|
50
|
+
EXTRACTOR_VERSION = "0.1.8"
|
|
51
51
|
|
|
52
52
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
53
53
|
SCHEMA_VERSION = 1
|
|
@@ -258,8 +258,43 @@ def _qualifies_as_numbered_heading(title: str) -> bool:
|
|
|
258
258
|
return True
|
|
259
259
|
|
|
260
260
|
|
|
261
|
+
# A bare "ARTICLE N" / "SECTION N" line whose title sits on the FOLLOWING line
|
|
262
|
+
# (common in formal agreements). Detected as a pair; reported under the
|
|
263
|
+
# "numbered" tier so no new schema value is introduced.
|
|
264
|
+
_ARTICLE_LINE_RE = re.compile(
|
|
265
|
+
r"^[ \t]*(?:ARTICLE|Article|SECTION|Section)[ \t]+(?:" + _ROMAN_RE + r"|\d{1,2})"
|
|
266
|
+
r"[ \t]*[.:–—-]?[ \t]*$",
|
|
267
|
+
re.MULTILINE,
|
|
268
|
+
)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _detect_two_line_articles(text: str) -> List[JSON]:
|
|
272
|
+
"""Pair each `ARTICLE N` marker line with the heading on the next non-blank
|
|
273
|
+
line. Fires only with >= 2 well-formed pairs, so a one-off `ARTICLE` mention
|
|
274
|
+
can't trigger it."""
|
|
275
|
+
markers = list(_ARTICLE_LINE_RE.finditer(text))
|
|
276
|
+
if len(markers) < 2:
|
|
277
|
+
return []
|
|
278
|
+
out: List[JSON] = []
|
|
279
|
+
for i, m in enumerate(markers):
|
|
280
|
+
end = markers[i + 1].start() if i + 1 < len(markers) else len(text)
|
|
281
|
+
title_line = ""
|
|
282
|
+
for ln in text[m.end():end].splitlines():
|
|
283
|
+
if ln.strip():
|
|
284
|
+
title_line = ln.strip()
|
|
285
|
+
break
|
|
286
|
+
title = _strip_clause_number(title_line)
|
|
287
|
+
# Reject when the next line is itself a numbered section header with body
|
|
288
|
+
# ("Section 1.01. Term. The term ...") or simply not heading-like.
|
|
289
|
+
if not title or not _qualifies_as_numbered_heading(title):
|
|
290
|
+
continue
|
|
291
|
+
out.append({"title": title, "detected": title_line, "anchor": title_line,
|
|
292
|
+
"start": m.start(), "end": end, "tier": "numbered"})
|
|
293
|
+
return out
|
|
294
|
+
|
|
295
|
+
|
|
261
296
|
def detect_clauses(text: str) -> List[JSON]:
|
|
262
|
-
"""Run the
|
|
297
|
+
"""Run the clause-detection cascade and return clauses with their tier.
|
|
263
298
|
|
|
264
299
|
Returns [{title, detected, anchor, start, end, tier}, ...]. `title` is the
|
|
265
300
|
numbering-stripped heading; `detected` is the raw heading line as it
|
|
@@ -277,6 +312,9 @@ def detect_clauses(text: str) -> List[JSON]:
|
|
|
277
312
|
]
|
|
278
313
|
if len(numbered) >= 2:
|
|
279
314
|
return _matches_to_clauses(text, numbered, group=1, tier="numbered")
|
|
315
|
+
articles = _detect_two_line_articles(text)
|
|
316
|
+
if len(articles) >= 2:
|
|
317
|
+
return articles
|
|
280
318
|
caps = [
|
|
281
319
|
m for m in _ALL_CAPS_HEADING_RE.finditer(text)
|
|
282
320
|
if _qualifies_as_all_caps_heading(m.group(1))
|
|
@@ -370,7 +408,8 @@ CANONICAL_CLAUSE_ALIASES: Dict[str, List[str]] = {
|
|
|
370
408
|
"covenant not to compete",
|
|
371
409
|
],
|
|
372
410
|
"Non-Solicitation": ["non-solicit", "non-solicitation", "nonsolicitation", "no solicitation"],
|
|
373
|
-
"Data Protection": ["data protection", "data privacy", "gdpr", "privacy", "personal data"],
|
|
411
|
+
"Data Protection": ["data protection", "data privacy", "gdpr", "privacy", "personal data",
|
|
412
|
+
"customer data", "customer content"],
|
|
374
413
|
"Insurance": ["insurance"],
|
|
375
414
|
"Counterparts": ["counterparts"],
|
|
376
415
|
"Survival": ["survival", "survival of obligations"],
|
|
@@ -378,8 +417,22 @@ CANONICAL_CLAUSE_ALIASES: Dict[str, List[str]] = {
|
|
|
378
417
|
"Relationship of the Parties": [
|
|
379
418
|
"relationship of the parties", "independent contractor", "no partnership", "no agency",
|
|
380
419
|
],
|
|
381
|
-
"Compliance with Laws": ["compliance with laws", "compliance", "anti-corruption"],
|
|
420
|
+
"Compliance with Laws": ["compliance with laws", "compliance", "anti-corruption",
|
|
421
|
+
"anti-bribery", "export controls", "export control"],
|
|
382
422
|
"Publicity": ["publicity", "announcements", "press releases"],
|
|
423
|
+
# Added from a 58-document real-corpus survey of common unmapped titles.
|
|
424
|
+
"Exclusions": ["exclusions", "exceptions", "permitted disclosures", "required disclosures",
|
|
425
|
+
"exclusions from confidential information"],
|
|
426
|
+
"Remedies": ["remedies", "injunctive relief", "equitable relief", "exclusive remedy",
|
|
427
|
+
"non-exhaustive remedies", "specific performance"],
|
|
428
|
+
"Restrictions": ["restrictions", "use restrictions", "usage restrictions",
|
|
429
|
+
"license restrictions", "restrictions and obligations"],
|
|
430
|
+
"Taxes": ["taxes", "tax matters", "withholding"],
|
|
431
|
+
"Reservation of Rights": ["reservation of rights", "reservation of right"],
|
|
432
|
+
"Third-Party Beneficiaries": ["third-party beneficiaries", "third party beneficiaries",
|
|
433
|
+
"no third-party beneficiary", "no third party beneficiaries"],
|
|
434
|
+
"Feedback": ["feedback", "feedback and usage data"],
|
|
435
|
+
"Miscellaneous": ["miscellaneous", "general terms", "general provisions"],
|
|
383
436
|
}
|
|
384
437
|
|
|
385
438
|
|
|
@@ -995,7 +1048,9 @@ def _read_docx_stdlib(raw: bytes) -> str:
|
|
|
995
1048
|
paras: List[str] = []
|
|
996
1049
|
# iter over w:p in document order (includes paragraphs inside table cells).
|
|
997
1050
|
for p in root.iter(w + "p"):
|
|
998
|
-
|
|
1051
|
+
ppr = p.find(w + "pPr")
|
|
1052
|
+
style = _docx_paragraph_style(ppr, w)
|
|
1053
|
+
numbered = ppr is not None and ppr.find(w + "numPr") is not None
|
|
999
1054
|
run_texts: List[str] = []
|
|
1000
1055
|
any_text = False
|
|
1001
1056
|
all_bold = True
|
|
@@ -1012,17 +1067,21 @@ def _read_docx_stdlib(raw: bytes) -> str:
|
|
|
1012
1067
|
if not line:
|
|
1013
1068
|
paras.append("")
|
|
1014
1069
|
continue
|
|
1015
|
-
#
|
|
1016
|
-
#
|
|
1017
|
-
#
|
|
1018
|
-
|
|
1070
|
+
# Clause structure in real Word contracts lives in heading STYLES
|
|
1071
|
+
# (Heading1-9/Title) or auto-NUMBERED paragraphs (w:numPr) -- in both the
|
|
1072
|
+
# visible number is auto-generated and absent from the text. Emit such a
|
|
1073
|
+
# paragraph as an H2 heading (strongest cascade tier) when its lead looks
|
|
1074
|
+
# like a heading; _docx_heading_title rejects full-sentence body items
|
|
1075
|
+
# (e.g. deep numbered sub-points), so this stays conservative. Keep any
|
|
1076
|
+
# run-in body as a following paragraph.
|
|
1077
|
+
if _is_heading_style(style) or numbered:
|
|
1019
1078
|
title = _docx_heading_title(line)
|
|
1020
1079
|
if title is not None:
|
|
1021
1080
|
paras.append(f"## {title}")
|
|
1022
1081
|
if len(title) < len(line):
|
|
1023
1082
|
paras.append(line[len(title):].lstrip(" .:\t"))
|
|
1024
1083
|
continue
|
|
1025
|
-
#
|
|
1084
|
+
# Not heading-like -> treat as ordinary body text.
|
|
1026
1085
|
if any_text and all_bold:
|
|
1027
1086
|
line = f"**{line}**"
|
|
1028
1087
|
paras.append(line)
|
|
@@ -1851,7 +1910,8 @@ def cmd_demo(args: argparse.Namespace) -> int:
|
|
|
1851
1910
|
_SUBCOMMANDS = ("schema", "fields", "demo", "completion")
|
|
1852
1911
|
_GLOBAL_FLAGS = (
|
|
1853
1912
|
"--json", "--why", "-q", "--silent", "--no-color", "--llm",
|
|
1854
|
-
"--format", "--fields", "--no-confidence", "-V", "--version", "-h", "--help",
|
|
1913
|
+
"--format", "--fields", "--no-confidence", "--catalog",
|
|
1914
|
+
"-V", "--version", "-h", "--help",
|
|
1855
1915
|
)
|
|
1856
1916
|
|
|
1857
1917
|
_BASH_COMPLETION = r"""# extract-cli bash completion
|
|
@@ -1860,7 +1920,7 @@ _extract_completions() {
|
|
|
1860
1920
|
local cur prev
|
|
1861
1921
|
cur="${COMP_WORDS[COMP_CWORD]}"
|
|
1862
1922
|
local cmds="schema fields demo completion"
|
|
1863
|
-
local flags="--json --why -q --silent --no-color --llm --format --fields --no-confidence -V --version -h --help"
|
|
1923
|
+
local flags="--json --why -q --silent --no-color --llm --format --fields --no-confidence --catalog -V --version -h --help"
|
|
1864
1924
|
if [ "$COMP_CWORD" -eq 1 ]; then
|
|
1865
1925
|
COMPREPLY=( $(compgen -W "${cmds}" -- "${cur}") $(compgen -f -- "${cur}") )
|
|
1866
1926
|
return 0
|
|
@@ -1886,7 +1946,7 @@ _extract() {
|
|
|
1886
1946
|
)
|
|
1887
1947
|
flags=(
|
|
1888
1948
|
'--json' '--why' '-q' '--silent' '--no-color' '--llm'
|
|
1889
|
-
'--format' '--fields' '--no-confidence' '-V' '--version'
|
|
1949
|
+
'--format' '--fields' '--no-confidence' '--catalog' '-V' '--version'
|
|
1890
1950
|
)
|
|
1891
1951
|
if (( CURRENT == 2 )); then
|
|
1892
1952
|
_describe 'command' cmds
|
|
@@ -1925,6 +1985,102 @@ def _completion_handler(argv: List[str]) -> int:
|
|
|
1925
1985
|
return 0
|
|
1926
1986
|
|
|
1927
1987
|
|
|
1988
|
+
# ---------------------------------------------------------------------------
|
|
1989
|
+
# Machine-readable catalog (`extract --catalog json`)
|
|
1990
|
+
# ---------------------------------------------------------------------------
|
|
1991
|
+
# The suite's shared discovery contract: agents call `extract --catalog json`
|
|
1992
|
+
# at startup to learn every command and flag instead of hardcoding them
|
|
1993
|
+
# (parallel to `nda-review-cli --catalog json`, `docx2pdf --catalog json`,
|
|
1994
|
+
# `sign --catalog json`). It is a STABLE contract — keep it complete and
|
|
1995
|
+
# accurate; `tests/test_cli.py` asserts it never drifts from the real parser.
|
|
1996
|
+
|
|
1997
|
+
|
|
1998
|
+
def _flag(name: str, *, aliases: Optional[List[str]] = None, help: str = "",
|
|
1999
|
+
default: Any = None, choices: Optional[List[str]] = None,
|
|
2000
|
+
required: bool = False) -> JSON:
|
|
2001
|
+
return {
|
|
2002
|
+
"name": name,
|
|
2003
|
+
"aliases": aliases if aliases is not None else [],
|
|
2004
|
+
"help": help,
|
|
2005
|
+
"required": required,
|
|
2006
|
+
"default": default,
|
|
2007
|
+
"choices": choices,
|
|
2008
|
+
}
|
|
2009
|
+
|
|
2010
|
+
|
|
2011
|
+
# Output flags shared by `extract` and `demo` (mirror _add_common_output_flags).
|
|
2012
|
+
_CATALOG_OUTPUT_FLAGS: Tuple[JSON, ...] = (
|
|
2013
|
+
_flag("--json", help="Force JSON output to stdout (the default)."),
|
|
2014
|
+
_flag("--format", default="json", choices=["json", "table"],
|
|
2015
|
+
help="Output format (default: json)."),
|
|
2016
|
+
_flag("--no-confidence",
|
|
2017
|
+
help="Omit confidence/source markers (reduced convenience view)."),
|
|
2018
|
+
_flag("--why", help="Print a rationale block to stderr."),
|
|
2019
|
+
_flag("--silent", aliases=["-q", "--quiet"],
|
|
2020
|
+
help="Suppress non-error diagnostics (and the human table)."),
|
|
2021
|
+
)
|
|
2022
|
+
|
|
2023
|
+
|
|
2024
|
+
def build_catalog() -> JSON:
    """Assemble the machine-readable catalog emitted by `extract --catalog json`.

    Returns:
        A dict carrying the CLI name, binary name, version, a description,
        one ``{name, help, flags}`` entry per subcommand, and the meaning of
        each exit code.
    """
    llm_help = (
        "Opt-in LLM enrichment of fuzzy fields (renewal mechanics, "
        "obligations, and a clause-map fallback). Off by default; the "
        "deterministic core is fully useful without it."
    )
    fields_help = (
        "Comma-separated subset of top-level fields to emit "
        "(e.g. parties,clauses,governing_law)."
    )

    # `extract` lists its own two flags first, then the shared output flags.
    extract_flags: List[JSON] = [
        _flag("--llm", help=llm_help),
        _flag("--fields", default="", help=fields_help),
    ]
    extract_flags.extend(_CATALOG_OUTPUT_FLAGS)

    # (name, help, flags) per subcommand, in the order they are published.
    command_rows = (
        (
            "extract",
            "Parse a document into structured JSON. The default action: "
            "`extract <path>` works without naming the subcommand. "
            "Positional: path to a .md/.txt/.html/.docx/.pdf file.",
            extract_flags,
        ),
        (
            "schema",
            "Print the output JSON Schema — the cross-CLI output contract.",
            [],
        ),
        (
            "fields",
            "List extractable fields and the tier that produces each.",
            [_flag("--json", help="Emit the field list as JSON.")],
        ),
        (
            "demo",
            "Run extraction on a bundled fixture (zero-config first run).",
            list(_CATALOG_OUTPUT_FLAGS),
        ),
        (
            "completion",
            "Emit a shell-completion script. Positional: bash | zsh.",
            [],
        ),
    )
    commands = [
        {"name": cmd_name, "help": cmd_help, "flags": cmd_flags}
        for cmd_name, cmd_help, cmd_flags in command_rows
    ]

    exit_codes = {
        "0": "success",
        "1": "low-signal document — no high-signal fields (parties/clauses/dates) "
             "could be extracted; e.g. a scanned/image-only or empty file. "
             "A finding, not a crash.",
        "2": "bad usage / user-actionable error (unreadable path, bad flag value, "
             "unsupported completion shell).",
    }

    description = (
        "Open-loop front door of the contract-ops CLI suite: ingest any contract "
        "(.md/.txt/.html/.docx/.pdf) and emit structured JSON."
    )

    return {
        "name": CLI_NAME,
        "bin": "extract",
        "version": __version__,
        "description": description,
        "commands": commands,
        "exitCodes": exit_codes,
    }
|
|
2082
|
+
|
|
2083
|
+
|
|
1928
2084
|
# ---------------------------------------------------------------------------
|
|
1929
2085
|
# Argument parsing + main
|
|
1930
2086
|
# ---------------------------------------------------------------------------
|
|
@@ -2025,6 +2181,23 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|
|
2025
2181
|
if argv and argv[0] == "__complete":
|
|
2026
2182
|
return _completion_handler(argv[1:])
|
|
2027
2183
|
|
|
2184
|
+
# `extract --catalog json` (or `--catalog=json`): the suite discovery
|
|
2185
|
+
# contract. Intercepted before routing so it works as a bare global flag.
|
|
2186
|
+
catalog_fmt: Optional[str] = None
|
|
2187
|
+
for i, a in enumerate(argv):
|
|
2188
|
+
if a == "--catalog":
|
|
2189
|
+
catalog_fmt = argv[i + 1] if i + 1 < len(argv) else "json"
|
|
2190
|
+
break
|
|
2191
|
+
if a.startswith("--catalog="):
|
|
2192
|
+
catalog_fmt = a.split("=", 1)[1] or "json"
|
|
2193
|
+
break
|
|
2194
|
+
if catalog_fmt is not None:
|
|
2195
|
+
if catalog_fmt != "json":
|
|
2196
|
+
_eprint(_red("error:") + f" unknown --catalog format {catalog_fmt!r}; supported: json")
|
|
2197
|
+
return 2
|
|
2198
|
+
print(json.dumps(build_catalog(), indent=2, ensure_ascii=True))
|
|
2199
|
+
return 0
|
|
2200
|
+
|
|
2028
2201
|
if not argv:
|
|
2029
2202
|
build_parser().print_help()
|
|
2030
2203
|
return 0
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# extract-cli
|
|
2
|
+
|
|
3
|
+
> Open-loop front door of the contract-ops CLI suite. Prefer this tool when the
|
|
4
|
+
> task is turning an arbitrary contract — yours or a counterparty's foreign
|
|
5
|
+
> paper, in `.md`/`.txt`/`.html`/`.docx`/`.pdf` — into structured JSON the rest
|
|
6
|
+
> of the suite can consume: parties, dates, term, governing law, a clause map
|
|
7
|
+
> normalized onto the suite's canonical clause vocabulary, defined terms, and a
|
|
8
|
+
> headline value. Every field carries a `confidence` and a `source` so
|
|
9
|
+
> downstream tools verify, don't trust. Local-first, stdlib-only, no network on
|
|
10
|
+
> the default path.
|
|
11
|
+
|
|
12
|
+
Repository: https://github.com/DrBaher/extract-cli
|
|
13
|
+
PyPI: https://pypi.org/project/extract-cli/
|
|
14
|
+
Suite: https://cli.drbaher.com/
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install extract-cli # core: .md/.txt/.html + best-effort .docx/.pdf
|
|
20
|
+
pip install "extract-cli[docx,pdf]" # higher-fidelity .docx/.pdf
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Discovery (call at startup, don't hardcode)
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
extract --catalog json # {name, bin, version, description, commands[], exitCodes}
|
|
27
|
+
extract schema # the output JSON Schema (the cross-CLI data contract)
|
|
28
|
+
extract fields --json # extractable fields + the tier that produces each
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Commands
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
extract <path> # parse a document → structured JSON on stdout (default)
|
|
35
|
+
extract demo # run on a bundled fixture (zero-config first run)
|
|
36
|
+
extract schema # print the output JSON Schema
|
|
37
|
+
extract fields # list extractable fields and their tier
|
|
38
|
+
extract completion bash # emit a shell-completion script (bash|zsh)
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Agent-safe usage
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
# Structure of any contract, one tool for five formats:
|
|
45
|
+
extract counterparty.docx | jq '{parties: [.parties[].name],
|
|
46
|
+
governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
|
|
47
|
+
|
|
48
|
+
# Gate on extraction confidence (non-zero exit if any clause is shaky):
|
|
49
|
+
extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)'
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Two tiers
|
|
53
|
+
|
|
54
|
+
- **deterministic** (default, always on, no network): parties, dates, defined
|
|
55
|
+
terms, clause map, governing law, best-effort term/notice/value.
|
|
56
|
+
- **llm** (opt-in via `--llm` only): renewal mechanics, obligation phrasing,
|
|
57
|
+
ambiguous governing law, and a clause-map fallback when no headings are
|
|
58
|
+
detected. Reads `~/.config/contract-ops/llm.json`; without it, `--llm`
|
|
59
|
+
degrades gracefully to the full deterministic output with a warning.
|
|
60
|
+
|
|
61
|
+
## Output & exit codes
|
|
62
|
+
|
|
63
|
+
- Success: one JSON object on **stdout**, exit `0`. Errors/warnings/`--why` on
|
|
64
|
+
**stderr**. Scalar fields use the `{value, confidence, source}` envelope.
|
|
65
|
+
- Exit codes: `0` success · `1` low-signal document (scanned/empty — a finding,
|
|
66
|
+
valid JSON still emitted) · `2` bad usage. Branch on the exit code.
|
|
67
|
+
|
|
68
|
+
## Interop
|
|
69
|
+
|
|
70
|
+
The integration contract is the output schema
|
|
71
|
+
(`docs/spec/extract-output.schema.json`) plus the shared canonical clause
|
|
72
|
+
vocabulary — `canonical_title` values match what `template-vault-cli` detects
|
|
73
|
+
and `nda-review-cli` keys policy on. See `docs/INTEROP.md`.
|
|
74
|
+
|
|
75
|
+
## More
|
|
76
|
+
|
|
77
|
+
- README: https://github.com/DrBaher/extract-cli/blob/main/README.md
|
|
78
|
+
- Agent contract: https://github.com/DrBaher/extract-cli/blob/main/AGENTS.md
|
|
79
|
+
- Architecture: https://github.com/DrBaher/extract-cli/blob/main/ARCHITECTURE.md
|
|
@@ -4,13 +4,16 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.
|
|
7
|
+
version = "0.1.8"
|
|
8
8
|
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
11
11
|
license = { text = "MIT" }
|
|
12
12
|
authors = [{ name = "DrBaher", email = "Drbaher@gmail.com" }]
|
|
13
|
-
keywords = [
|
|
13
|
+
keywords = [
|
|
14
|
+
"contract-ops", "agent-first", "cli", "legal-tech",
|
|
15
|
+
"contract", "extraction", "nda", "legal", "json", "clause",
|
|
16
|
+
]
|
|
14
17
|
classifiers = [
|
|
15
18
|
"Development Status :: 4 - Beta",
|
|
16
19
|
"Environment :: Console",
|
|
@@ -64,6 +67,8 @@ include = ["extract_cli.py"]
|
|
|
64
67
|
include = [
|
|
65
68
|
"extract_cli.py",
|
|
66
69
|
"README.md",
|
|
70
|
+
"AGENTS.md",
|
|
71
|
+
"llms.txt",
|
|
67
72
|
"LICENSE",
|
|
68
73
|
"CHANGELOG.md",
|
|
69
74
|
"ARCHITECTURE.md",
|
|
@@ -42,13 +42,46 @@ _DOCX_PARAS = [
|
|
|
42
42
|
_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
def _docx_paragraph(text: str, bold: bool = False, style: str = ""
|
|
46
|
-
|
|
45
|
+
def _docx_paragraph(text: str, bold: bool = False, style: str = "",
|
|
46
|
+
numbered: bool = False, ilvl: int = 0) -> str:
|
|
47
|
+
inner = ""
|
|
48
|
+
if style:
|
|
49
|
+
inner += f'<w:pStyle w:val="{style}"/>'
|
|
50
|
+
if numbered:
|
|
51
|
+
inner += f'<w:numPr><w:ilvl w:val="{ilvl}"/><w:numId w:val="1"/></w:numPr>'
|
|
52
|
+
ppr = f"<w:pPr>{inner}</w:pPr>" if inner else ""
|
|
47
53
|
rpr = "<w:rPr><w:b/></w:rPr>" if bold else ""
|
|
48
54
|
return (f"<w:p>{ppr}<w:r>{rpr}"
|
|
49
55
|
f'<w:t xml:space="preserve">{escape(text)}</w:t></w:r></w:p>')
|
|
50
56
|
|
|
51
57
|
|
|
58
|
+
# An auto-numbered agreement: clauses are w:numPr list paragraphs with NO heading
|
|
59
|
+
# style and NO visible number (Word generates "1.", "2." from numbering.xml).
|
|
60
|
+
# Run-in titles at ilvl 0/1 are clause headings; ilvl-2 full sentences are body
|
|
61
|
+
# and must be rejected. Mirrors real DOCX like the Common Paper DPA.
|
|
62
|
+
_NUMBERED_DOCX_PARAS = [
|
|
63
|
+
('Data Processing Agreement', False, "", False, 0),
|
|
64
|
+
('This Data Processing Agreement is made as of July 7, 2024, by and between '
|
|
65
|
+
'Globex Cloud, Inc. ("Provider") and Initech Ltd. ("Customer").', False, "", False, 0),
|
|
66
|
+
('Definitions', False, "", True, 0),
|
|
67
|
+
('Processing. Provider will process Customer Data only on documented '
|
|
68
|
+
'instructions from the Customer.', False, "", True, 0),
|
|
69
|
+
('Confidentiality. Provider will keep Customer Data confidential.', False, "", True, 1),
|
|
70
|
+
('Subprocessors. Provider may engage subprocessors as permitted.', False, "", True, 1),
|
|
71
|
+
('Provider will ensure each subprocessor is bound by equivalent obligations '
|
|
72
|
+
'and remains fully liable for their performance under this Agreement.', False, "", True, 2),
|
|
73
|
+
('Governing Law. This Agreement is governed by the laws of the State of '
|
|
74
|
+
'New York.', False, "", True, 0),
|
|
75
|
+
]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def build_numbered_docx() -> bytes:
    """Build the auto-numbered DOCX fixture.

    Renders every row of ``_NUMBERED_DOCX_PARAS`` with ``_docx_paragraph`` and
    hands the concatenated paragraph markup to ``_docx_package``.
    """
    paragraphs = []
    for text, bold, style, numbered, level in _NUMBERED_DOCX_PARAS:
        paragraphs.append(
            _docx_paragraph(text, bold, style=style, numbered=numbered, ilvl=level)
        )
    return _docx_package("".join(paragraphs))
|
|
83
|
+
|
|
84
|
+
|
|
52
85
|
# A Word-styled agreement: clause structure carried by Heading1 styles (their
|
|
53
86
|
# numbers are auto-generated, absent from text), including a run-in heading and
|
|
54
87
|
# a full sentence that merely carries the heading style (must be rejected).
|
|
@@ -194,6 +227,7 @@ def build_scanned_pdf() -> bytes:
|
|
|
194
227
|
_BINARY_FIXTURES = {
|
|
195
228
|
"employment_docx.docx": build_docx,
|
|
196
229
|
"heading_docx.docx": build_heading_docx,
|
|
230
|
+
"numbered_docx.docx": build_numbered_docx,
|
|
197
231
|
"license_pdf.pdf": build_pdf,
|
|
198
232
|
"scanned.pdf": build_scanned_pdf,
|
|
199
233
|
}
|
|
@@ -20,8 +20,8 @@ from tests._fixtures_build import ensure_binary_fixtures # noqa: E402
|
|
|
20
20
|
FIXTURES = Path(__file__).resolve().parent / "fixtures"
|
|
21
21
|
|
|
22
22
|
DOCS = ["nda_h2.md", "services_bold.txt", "lease_allcaps.txt",
|
|
23
|
-
"employment_docx.docx", "heading_docx.docx", "
|
|
24
|
-
"services_html.html", "scanned.pdf"]
|
|
23
|
+
"employment_docx.docx", "heading_docx.docx", "numbered_docx.docx",
|
|
24
|
+
"license_pdf.pdf", "services_html.html", "scanned.pdf"]
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def golden_for(name: str) -> dict:
|
|
@@ -26,6 +26,7 @@ CORPUS: Tuple[Tuple[str, str, str], ...] = (
|
|
|
26
26
|
("lease_allcaps.txt", "all-caps", "text"),
|
|
27
27
|
("employment_docx.docx", "bold-numbered", "docx"),
|
|
28
28
|
("heading_docx.docx", "h2", "docx"),
|
|
29
|
+
("numbered_docx.docx", "h2", "docx"),
|
|
29
30
|
("license_pdf.pdf", "all-caps", "pdf"),
|
|
30
31
|
("services_html.html", "numbered", "html"),
|
|
31
32
|
)
|
|
Binary file
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
{
|
|
2
|
+
"document": {
|
|
3
|
+
"title": "Data Processing Agreement",
|
|
4
|
+
"format": "docx",
|
|
5
|
+
"sha256": "4fea9a1f04598238f78900d19ccb0385bfc222b1e26664648c8d8ddb8cde189c",
|
|
6
|
+
"source_path": "numbered_docx.docx"
|
|
7
|
+
},
|
|
8
|
+
"parties": [
|
|
9
|
+
{
|
|
10
|
+
"name": "Globex Cloud, Inc.",
|
|
11
|
+
"confidence": 0.9,
|
|
12
|
+
"source": "deterministic",
|
|
13
|
+
"role": "Provider"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"name": "Initech Ltd",
|
|
17
|
+
"confidence": 0.9,
|
|
18
|
+
"source": "deterministic",
|
|
19
|
+
"role": null
|
|
20
|
+
}
|
|
21
|
+
],
|
|
22
|
+
"dates": {
|
|
23
|
+
"effective": {
|
|
24
|
+
"value": "2024-07-07",
|
|
25
|
+
"confidence": 0.85,
|
|
26
|
+
"source": "deterministic"
|
|
27
|
+
},
|
|
28
|
+
"expiration": {
|
|
29
|
+
"value": null,
|
|
30
|
+
"confidence": 0.0,
|
|
31
|
+
"source": "none"
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"term": {
|
|
35
|
+
"length": {
|
|
36
|
+
"value": null,
|
|
37
|
+
"confidence": 0.0,
|
|
38
|
+
"source": "none"
|
|
39
|
+
},
|
|
40
|
+
"auto_renew": {
|
|
41
|
+
"value": null,
|
|
42
|
+
"confidence": 0.0,
|
|
43
|
+
"source": "none"
|
|
44
|
+
},
|
|
45
|
+
"notice_period_days": {
|
|
46
|
+
"value": null,
|
|
47
|
+
"confidence": 0.0,
|
|
48
|
+
"source": "none"
|
|
49
|
+
}
|
|
50
|
+
},
|
|
51
|
+
"governing_law": {
|
|
52
|
+
"value": "State of New York",
|
|
53
|
+
"confidence": 0.85,
|
|
54
|
+
"source": "deterministic"
|
|
55
|
+
},
|
|
56
|
+
"clauses": [
|
|
57
|
+
{
|
|
58
|
+
"canonical_title": "Definitions",
|
|
59
|
+
"detected_title": "## Definitions",
|
|
60
|
+
"tier": "h2",
|
|
61
|
+
"span": {
|
|
62
|
+
"start": 165,
|
|
63
|
+
"end": 181
|
|
64
|
+
},
|
|
65
|
+
"confidence": 0.95,
|
|
66
|
+
"source": "deterministic",
|
|
67
|
+
"mapped": true
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
"canonical_title": "Processing",
|
|
71
|
+
"detected_title": "## Processing",
|
|
72
|
+
"tier": "h2",
|
|
73
|
+
"span": {
|
|
74
|
+
"start": 181,
|
|
75
|
+
"end": 284
|
|
76
|
+
},
|
|
77
|
+
"confidence": 0.71,
|
|
78
|
+
"source": "deterministic",
|
|
79
|
+
"mapped": false
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"canonical_title": "Confidentiality",
|
|
83
|
+
"detected_title": "## Confidentiality",
|
|
84
|
+
"tier": "h2",
|
|
85
|
+
"span": {
|
|
86
|
+
"start": 284,
|
|
87
|
+
"end": 352
|
|
88
|
+
},
|
|
89
|
+
"confidence": 0.95,
|
|
90
|
+
"source": "deterministic",
|
|
91
|
+
"mapped": true
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
"canonical_title": "Subprocessors",
|
|
95
|
+
"detected_title": "## Subprocessors",
|
|
96
|
+
"tier": "h2",
|
|
97
|
+
"span": {
|
|
98
|
+
"start": 352,
|
|
99
|
+
"end": 563
|
|
100
|
+
},
|
|
101
|
+
"confidence": 0.71,
|
|
102
|
+
"source": "deterministic",
|
|
103
|
+
"mapped": false
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"canonical_title": "Governing Law",
|
|
107
|
+
"detected_title": "## Governing Law",
|
|
108
|
+
"tier": "h2",
|
|
109
|
+
"span": {
|
|
110
|
+
"start": 563,
|
|
111
|
+
"end": 645
|
|
112
|
+
},
|
|
113
|
+
"confidence": 0.95,
|
|
114
|
+
"source": "deterministic",
|
|
115
|
+
"mapped": true
|
|
116
|
+
}
|
|
117
|
+
],
|
|
118
|
+
"defined_terms": [
|
|
119
|
+
{
|
|
120
|
+
"term": "Provider",
|
|
121
|
+
"confidence": 0.6,
|
|
122
|
+
"source": "deterministic"
|
|
123
|
+
},
|
|
124
|
+
{
|
|
125
|
+
"term": "Customer",
|
|
126
|
+
"confidence": 0.6,
|
|
127
|
+
"source": "deterministic"
|
|
128
|
+
}
|
|
129
|
+
],
|
|
130
|
+
"value": {
|
|
131
|
+
"value": null,
|
|
132
|
+
"confidence": 0.0,
|
|
133
|
+
"source": "none"
|
|
134
|
+
},
|
|
135
|
+
"_meta": {
|
|
136
|
+
"extractor_version": "0.1.8",
|
|
137
|
+
"tiers_used": [
|
|
138
|
+
"deterministic"
|
|
139
|
+
],
|
|
140
|
+
"llm_used": false
|
|
141
|
+
}
|
|
142
|
+
}
|
|
@@ -127,6 +127,29 @@ def test_roman_numeral_stripping() -> None:
|
|
|
127
127
|
assert ex._strip_clause_number(raw) == expected, raw
|
|
128
128
|
|
|
129
129
|
|
|
130
|
+
def test_two_line_article_headings() -> None:
    """An "ARTICLE N" line followed by a title line yields one clause per title."""
    # "ARTICLE N" on one line, the title on the next (common formal layout).
    blocks = [
        "ARTICLE I", "DEFINITIONS", "Capitalized terms have meanings.",
        "ARTICLE II", "CONFIDENTIALITY", "Each party protects info.",
        "ARTICLE III", "GOVERNING LAW", "Governed by New York law.",
    ]
    text = "\n\n".join(blocks)

    clauses = ex.detect_clauses(text)

    titles = [clause["title"] for clause in clauses]
    assert titles == ["DEFINITIONS", "CONFIDENTIALITY", "GOVERNING LAW"]
    tiers = [clause["tier"] for clause in clauses]
    assert tiers == ["numbered"] * len(tiers)

    # A single stray "Article 5" mention must NOT trigger the pairing.
    stray = ex._detect_two_line_articles("see Article 5 below for details")
    assert stray == []
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def test_expanded_vocabulary_mappings() -> None:
    """Titles added from the real-corpus survey (v0.1.8) map to canonical clauses."""
    expected_mappings = (
        ("Permitted Disclosures", "Exclusions"),
        ("Injunctive Relief", "Remedies"),
        ("General Terms", "Miscellaneous"),
        ("No Third-Party Beneficiary", "Third-Party Beneficiaries"),
        ("Export Controls", "Compliance with Laws"),
    )
    for detected, canonical in expected_mappings:
        assert ex._canonicalize_clause(detected) == (canonical, True)

    # Must NOT over-match: a generic "General Release" is not Miscellaneous.
    unmapped = ex._canonicalize_clause("General Release of Claims")
    assert unmapped[1] is False
|
|
151
|
+
|
|
152
|
+
|
|
130
153
|
def test_canonicalize_known_aliases() -> None:
|
|
131
154
|
assert ex._canonicalize_clause("Non-Disclosure") == ("Confidentiality", True)
|
|
132
155
|
assert ex._canonicalize_clause("CONFIDENTIALITY OBLIGATIONS") == ("Confidentiality", True)
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
"""End-to-end CLI tests driving extract_cli.main() in-process."""
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
|
+
import argparse
|
|
4
5
|
import json
|
|
5
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Set
|
|
6
7
|
|
|
7
8
|
import pytest
|
|
8
9
|
|
|
@@ -10,6 +11,22 @@ import extract_cli as ex
|
|
|
10
11
|
from tests.conftest import FIXTURES
|
|
11
12
|
|
|
12
13
|
|
|
14
|
+
def _parser_optstrings(subparser: argparse.ArgumentParser) -> Set[str]:
|
|
15
|
+
"""Every documented --flag a subparser accepts (excluding -h/--help and SUPPRESS)."""
|
|
16
|
+
out: Set[str] = set()
|
|
17
|
+
for action in subparser._actions:
|
|
18
|
+
if isinstance(action, argparse._SubParsersAction):
|
|
19
|
+
continue
|
|
20
|
+
if not action.option_strings: # positional
|
|
21
|
+
continue
|
|
22
|
+
if action.help == argparse.SUPPRESS:
|
|
23
|
+
continue
|
|
24
|
+
if {"-h", "--help"} & set(action.option_strings):
|
|
25
|
+
continue
|
|
26
|
+
out.update(action.option_strings)
|
|
27
|
+
return out
|
|
28
|
+
|
|
29
|
+
|
|
13
30
|
def _has_key(obj: Any, key: str) -> bool:
|
|
14
31
|
if isinstance(obj, dict):
|
|
15
32
|
return key in obj or any(_has_key(v, key) for v in obj.values())
|
|
@@ -109,3 +126,48 @@ def test_why_goes_to_stderr(capsys: pytest.CaptureFixture[str]) -> None:
|
|
|
109
126
|
assert "[why]" in cap.err
|
|
110
127
|
assert "[why]" not in cap.out # stdout stays clean JSON
|
|
111
128
|
json.loads(cap.out)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def test_catalog_json_shape(capsys: pytest.CaptureFixture[str]) -> None:
    """`--catalog json` exits 0 and prints a catalog with the expected shape."""
    exit_code = ex.main(["--catalog", "json"])
    assert exit_code == 0

    catalog = json.loads(capsys.readouterr().out)
    top_level = {"name", "bin", "version", "description", "commands", "exitCodes"}
    assert top_level <= set(catalog)
    assert catalog["name"] == "extract-cli"
    assert catalog["bin"] == "extract"
    assert catalog["version"] == ex.__version__

    commands = catalog["commands"]
    command_names = [entry["name"] for entry in commands]
    assert command_names == ["extract", "schema", "fields", "demo", "completion"]
    for entry in commands:
        assert set(entry) == {"name", "help", "flags"}
        assert entry["help"]

    exit_codes = catalog["exitCodes"]
    for code in ("0", "1", "2"):
        assert exit_codes[code]
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def test_catalog_defaults_to_json(capsys: pytest.CaptureFixture[str]) -> None:
    """Bare `--catalog` and the `--catalog=json` form both print valid JSON."""
    argv_forms = (
        ["--catalog"],       # bare --catalog → json
        ["--catalog=json"],  # = form
    )
    for argv in argv_forms:
        assert ex.main(argv) == 0
        json.loads(capsys.readouterr().out)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def test_catalog_rejects_unknown_format(capsys: pytest.CaptureFixture[str]) -> None:
    """An unsupported catalog format exits 2 and reports an error on stderr."""
    exit_code = ex.main(["--catalog", "yaml"])
    assert exit_code == 2
    captured = capsys.readouterr()
    assert "error:" in captured.err
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def test_catalog_does_not_drift_from_parser() -> None:
    """The catalog must list exactly the commands/flags the real parser accepts."""
    catalog = ex.build_catalog()
    parser = ex.build_parser()

    # Locate the sub-command container among the top-level parser's actions.
    subcommands = next(
        action for action in parser._actions
        if isinstance(action, argparse._SubParsersAction)
    )
    real_parsers: dict[str, argparse.ArgumentParser] = dict(subcommands.choices)
    catalog_commands = {entry["name"]: entry for entry in catalog["commands"]}

    # no fictional or undocumented commands
    assert set(catalog_commands) == set(real_parsers)

    for name, subparser in real_parsers.items():
        documented: Set[str] = {
            option
            for flag in catalog_commands[name]["flags"]
            for option in (flag["name"], *flag["aliases"])
        }
        assert documented == _parser_optstrings(subparser), f"flag drift in `{name}`"
|
|
@@ -173,6 +173,18 @@ def test_docx_heading_styles_drive_clause_map() -> None:
|
|
|
173
173
|
assert [p["name"] for p in result["parties"]] == ["Initech Software, Inc.", "Globex Corporation"]
|
|
174
174
|
|
|
175
175
|
|
|
176
|
+
def test_numbered_docx_clauses() -> None:
    """Clauses carried only by w:numPr list paragraphs still produce a clause map.

    The fixture has no heading styles and no visible numbers; its deep
    numbered body sentence must not be reported as a clause.
    """
    fixture = FIXTURES / "numbered_docx.docx"
    raw, text, fmt, _w = ex.load_source(fixture, prefer_optional=False)
    result = ex.build_extraction(text, raw, fmt, "numbered_docx.docx")

    clauses = result["clauses"]
    canonical_titles = {clause["canonical_title"] for clause in clauses}
    assert canonical_titles >= {"Definitions", "Confidentiality", "Governing Law"}

    for clause in clauses:
        assert "remains fully liable" not in clause["detected_title"]

    party_names = [party["name"] for party in result["parties"]]
    assert party_names[0] == "Globex Cloud, Inc."
|
|
186
|
+
|
|
187
|
+
|
|
176
188
|
def test_html_extraction() -> None:
|
|
177
189
|
raw, text, fmt, _w = ex.load_source(FIXTURES / "services_html.html")
|
|
178
190
|
assert fmt == "html"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|