extract-cli 0.1.4__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.4 → extract_cli-0.1.6}/ARCHITECTURE.md +6 -1
- {extract_cli-0.1.4 → extract_cli-0.1.6}/CHANGELOG.md +34 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/PKG-INFO +44 -24
- {extract_cli-0.1.4 → extract_cli-0.1.6}/README.md +43 -23
- {extract_cli-0.1.4 → extract_cli-0.1.6}/extract_cli.py +68 -11
- {extract_cli-0.1.4 → extract_cli-0.1.6}/pyproject.toml +1 -1
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/employment_docx.docx.expected.json +1 -1
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/heading_docx.docx.expected.json +1 -1
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/nda_h2.md.expected.json +1 -1
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/services_bold.txt.expected.json +1 -1
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/services_html.html.expected.json +1 -1
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_llm.py +35 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/.gitignore +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/LICENSE +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/Makefile +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/config/llm.json.example +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/docs/INTEROP.md +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/docs/spec/extract-output.schema.json +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/scripts/release.py +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/_fixtures_build.py +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/_make_goldens.py +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/conftest.py +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/heading_docx.docx +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_clause_map.py +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_cli.py +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_deterministic.py +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_misc.py +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_property.py +0 -0
- {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_schema_conformance.py +0 -0
|
@@ -80,7 +80,12 @@ endpoint. Any failure (no config, network error, unparseable JSON) is caught:
|
|
|
80
80
|
a warning to stderr, deterministic output untouched. The LLM only *adds* fuzzy
|
|
81
81
|
fields (`term.renewal_mechanics`, `obligations`) and fills `governing_law` only
|
|
82
82
|
when the deterministic tier found nothing — it never overwrites a deterministic
|
|
83
|
-
value.
|
|
83
|
+
value. As a **clause-map fallback**, when the deterministic cascade returned no
|
|
84
|
+
clauses the LLM is asked for the section headings (the clause keys are added to
|
|
85
|
+
the prompt only then); the titles are normalized through the same
|
|
86
|
+
`_canonicalize_clause` vocabulary, located in the text for a best-effort span,
|
|
87
|
+
and emitted with `tier: "llm"` / `source: "llm"`. This covers DOCX that
|
|
88
|
+
auto-number with no heading style (their numbers live only in `numbering.xml`).
|
|
84
89
|
|
|
85
90
|
## The output contract
|
|
86
91
|
|
|
@@ -6,6 +6,38 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.6] - 2026-05-21
|
|
10
|
+
|
|
11
|
+
### Docs
|
|
12
|
+
- **Rewrote the README composability section to verified, runnable examples.**
|
|
13
|
+
Testing extract-cli against the real sibling CLIs (`template-vault-cli`,
|
|
14
|
+
`nda-review-cli`) showed the previous pipes were aspirational — the siblings
|
|
15
|
+
expose no `--from-extract`/`--stdin` flag (`nda-review review` takes
|
|
16
|
+
`--file`/`--text`; `template-vault` reads its own vault). The integration
|
|
17
|
+
contract is the **output schema + the shared canonical clause vocabulary**,
|
|
18
|
+
glued by stdout JSON and standard tools (`jq`, `comm`): `extract`'s
|
|
19
|
+
`canonical_title` values are the same names template-vault detects and
|
|
20
|
+
nda-review keys policy on, so a foreign document's clauses line up with the
|
|
21
|
+
suite's with no bespoke adapter. New examples cover clause-coverage gap
|
|
22
|
+
analysis against a vault template and a combined extract+nda-review intake
|
|
23
|
+
report — all runnable today. (Also fixed a broken `jq input_filename` in the
|
|
24
|
+
folder-triage example.) No code or schema change.
|
|
25
|
+
|
|
26
|
+
## [0.1.5] - 2026-05-21
|
|
27
|
+
|
|
28
|
+
### Added
|
|
29
|
+
- **LLM clause-map fallback** (opt-in, `--llm` only). When the deterministic
|
|
30
|
+
cascade detects no clauses — e.g. a `.docx` that auto-numbers via Word's
|
|
31
|
+
numbering with no heading style, the limitation noted in 0.1.4 — the LLM is
|
|
32
|
+
asked for the section headings (the clause request is added to the prompt
|
|
33
|
+
only in that case). Returned titles are normalized through the same canonical
|
|
34
|
+
vocabulary as the deterministic path, located in the document for a
|
|
35
|
+
best-effort span, and emitted with `tier: "llm"`, `source: "llm"`, and a
|
|
36
|
+
modest confidence. The LLM is never consulted for clauses the deterministic
|
|
37
|
+
cascade already found, and the deterministic core remains fully useful with
|
|
38
|
+
no LLM. No schema change (the clause `tier`/`source` enums already allow
|
|
39
|
+
`llm`).
|
|
40
|
+
|
|
9
41
|
## [0.1.4] - 2026-05-21
|
|
10
42
|
|
|
11
43
|
DOCX clause detection, driven by testing against 20 real `.docx` contracts
|
|
@@ -166,6 +198,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
166
198
|
intentionally *not* governed by the output schema (the schema describes the
|
|
167
199
|
full default output).
|
|
168
200
|
|
|
201
|
+
[0.1.6]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.6
|
|
202
|
+
[0.1.5]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.5
|
|
169
203
|
[0.1.4]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.4
|
|
170
204
|
[0.1.3]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.3
|
|
171
205
|
[0.1.2]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.2
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.4
|
|
3
|
+
Version: 0.1.6
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -102,6 +102,15 @@ opt-in, never in a hot path, and gated behind an explicit flag and a config
|
|
|
102
102
|
file — if no config is present, `--llm` degrades gracefully with a warning and
|
|
103
103
|
you still get the full deterministic output.
|
|
104
104
|
|
|
105
|
+
**Clause-map fallback.** Some documents (e.g. `.docx` that auto-number clauses
|
|
106
|
+
via Word's numbering with no heading style) carry no signal the deterministic
|
|
107
|
+
cascade can see, so its clause map comes back empty. When `--llm` is set *and*
|
|
108
|
+
no clauses were detected, the LLM is asked for the section headings; the result
|
|
109
|
+
is normalized through the same canonical vocabulary and emitted with
|
|
110
|
+
`tier: "llm"`, `source: "llm"`, and a modest confidence (verify, not trust).
|
|
111
|
+
When the deterministic cascade already found clauses, the LLM is not consulted
|
|
112
|
+
for them.
|
|
113
|
+
|
|
105
114
|
## Commands
|
|
106
115
|
|
|
107
116
|
```bash
|
|
@@ -162,37 +171,48 @@ extract counterparty.pdf | jq '.clauses[] | {canonical_title, detected_title, ma
|
|
|
162
171
|
|
|
163
172
|
## Composability — piping into the rest of the suite
|
|
164
173
|
|
|
165
|
-
`extract-cli` is built to be the first stage of a Unix pipe.
|
|
166
|
-
|
|
174
|
+
`extract-cli` is built to be the first stage of a Unix pipe. The glue is its
|
|
175
|
+
**stdout JSON + standard tools** (`jq`, `comm`) and the **shared clause
|
|
176
|
+
vocabulary** — `extract`'s `canonical_title` values are the same names
|
|
177
|
+
`template-vault-cli` detects and `nda-review-cli` keys policy on, so a foreign
|
|
178
|
+
document's clauses line up with the suite's with no bespoke adapter. Every
|
|
179
|
+
example below is runnable today (verified against the real sibling CLIs).
|
|
167
180
|
|
|
168
181
|
```bash
|
|
169
|
-
# 1)
|
|
170
|
-
extract
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
#
|
|
174
|
-
extract
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
#
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
#
|
|
183
|
-
|
|
182
|
+
# 1) Inspect any contract's structure (.md/.txt/.html/.docx/.pdf, one tool).
|
|
183
|
+
extract counterparty.docx | jq '{parties: [.parties[].name],
|
|
184
|
+
governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
|
|
185
|
+
|
|
186
|
+
# 2) Clause-coverage gap vs your canonical template in template-vault-cli.
|
|
187
|
+
# extract normalizes the counterparty's *foreign* headings onto the same
|
|
188
|
+
# clause vocabulary template-vault detects, so a plain `comm` diffs them.
|
|
189
|
+
template-vault info nda/mutual-standard --json | jq -r '.clauses[].title' | sort > ours.txt
|
|
190
|
+
extract counterparty_nda.docx | jq -r '.clauses[].canonical_title' | sort -u > theirs.txt
|
|
191
|
+
comm -23 ours.txt theirs.txt # clauses in OUR standard that THEY are missing
|
|
192
|
+
comm -13 ours.txt theirs.txt # clauses THEY added that we don't have
|
|
193
|
+
|
|
194
|
+
# 3) Intake: extract for structure, nda-review-cli for a policy verdict on the
|
|
195
|
+
# same foreign doc; merge both views with jq.
|
|
196
|
+
extract counterparty_nda.docx > extract.json
|
|
197
|
+
nda-review review --file counterparty_nda.docx --playbook output/nda_playbook.json \
|
|
198
|
+
--out-json review.json
|
|
199
|
+
jq -n --slurpfile e extract.json --slurpfile r review.json \
|
|
200
|
+
'{parties: [$e[0].parties[].name], governing_law: $e[0].governing_law.value,
|
|
201
|
+
clauses: ($e[0].clauses | length), decision: $r[0].decision, risk: $r[0].risk_score}'
|
|
202
|
+
|
|
203
|
+
# 4) Triage a folder of inbound contracts: governing law + parties per file.
|
|
204
|
+
for f in inbox/*; do
|
|
184
205
|
extract "$f" --fields parties,governing_law --no-confidence \
|
|
185
|
-
| jq -c '{file:
|
|
206
|
+
| jq -c --arg f "$f" '{file: $f, gov: .governing_law, parties: [.parties[].name]}'
|
|
186
207
|
done
|
|
187
208
|
|
|
188
|
-
# 5) Gate a workflow on extraction confidence.
|
|
209
|
+
# 5) Gate a workflow on extraction confidence (non-zero exit if any clause is shaky).
|
|
189
210
|
extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)' && echo "ok to review"
|
|
190
211
|
```
|
|
191
212
|
|
|
192
|
-
> The
|
|
193
|
-
>
|
|
194
|
-
>
|
|
195
|
-
> versioning commitment on the schema.
|
|
213
|
+
> The integration contract is the **output schema** and the **canonical clause
|
|
214
|
+
> vocabulary**, not per-tool flags. See [`docs/INTEROP.md`](docs/INTEROP.md) for
|
|
215
|
+
> the shared conventions and the schema's versioning commitment.
|
|
196
216
|
|
|
197
217
|
## LLM configuration (opt-in)
|
|
198
218
|
|
|
@@ -64,6 +64,15 @@ opt-in, never in a hot path, and gated behind an explicit flag and a config
|
|
|
64
64
|
file — if no config is present, `--llm` degrades gracefully with a warning and
|
|
65
65
|
you still get the full deterministic output.
|
|
66
66
|
|
|
67
|
+
**Clause-map fallback.** Some documents (e.g. `.docx` that auto-number clauses
|
|
68
|
+
via Word's numbering with no heading style) carry no signal the deterministic
|
|
69
|
+
cascade can see, so its clause map comes back empty. When `--llm` is set *and*
|
|
70
|
+
no clauses were detected, the LLM is asked for the section headings; the result
|
|
71
|
+
is normalized through the same canonical vocabulary and emitted with
|
|
72
|
+
`tier: "llm"`, `source: "llm"`, and a modest confidence (verify, not trust).
|
|
73
|
+
When the deterministic cascade already found clauses, the LLM is not consulted
|
|
74
|
+
for them.
|
|
75
|
+
|
|
67
76
|
## Commands
|
|
68
77
|
|
|
69
78
|
```bash
|
|
@@ -124,37 +133,48 @@ extract counterparty.pdf | jq '.clauses[] | {canonical_title, detected_title, ma
|
|
|
124
133
|
|
|
125
134
|
## Composability — piping into the rest of the suite
|
|
126
135
|
|
|
127
|
-
`extract-cli` is built to be the first stage of a Unix pipe.
|
|
128
|
-
|
|
136
|
+
`extract-cli` is built to be the first stage of a Unix pipe. The glue is its
|
|
137
|
+
**stdout JSON + standard tools** (`jq`, `comm`) and the **shared clause
|
|
138
|
+
vocabulary** — `extract`'s `canonical_title` values are the same names
|
|
139
|
+
`template-vault-cli` detects and `nda-review-cli` keys policy on, so a foreign
|
|
140
|
+
document's clauses line up with the suite's with no bespoke adapter. Every
|
|
141
|
+
example below is runnable today (verified against the real sibling CLIs).
|
|
129
142
|
|
|
130
143
|
```bash
|
|
131
|
-
# 1)
|
|
132
|
-
extract
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
#
|
|
136
|
-
extract
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
#
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
#
|
|
145
|
-
|
|
144
|
+
# 1) Inspect any contract's structure (.md/.txt/.html/.docx/.pdf, one tool).
|
|
145
|
+
extract counterparty.docx | jq '{parties: [.parties[].name],
|
|
146
|
+
governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
|
|
147
|
+
|
|
148
|
+
# 2) Clause-coverage gap vs your canonical template in template-vault-cli.
|
|
149
|
+
# extract normalizes the counterparty's *foreign* headings onto the same
|
|
150
|
+
# clause vocabulary template-vault detects, so a plain `comm` diffs them.
|
|
151
|
+
template-vault info nda/mutual-standard --json | jq -r '.clauses[].title' | sort > ours.txt
|
|
152
|
+
extract counterparty_nda.docx | jq -r '.clauses[].canonical_title' | sort -u > theirs.txt
|
|
153
|
+
comm -23 ours.txt theirs.txt # clauses in OUR standard that THEY are missing
|
|
154
|
+
comm -13 ours.txt theirs.txt # clauses THEY added that we don't have
|
|
155
|
+
|
|
156
|
+
# 3) Intake: extract for structure, nda-review-cli for a policy verdict on the
|
|
157
|
+
# same foreign doc; merge both views with jq.
|
|
158
|
+
extract counterparty_nda.docx > extract.json
|
|
159
|
+
nda-review review --file counterparty_nda.docx --playbook output/nda_playbook.json \
|
|
160
|
+
--out-json review.json
|
|
161
|
+
jq -n --slurpfile e extract.json --slurpfile r review.json \
|
|
162
|
+
'{parties: [$e[0].parties[].name], governing_law: $e[0].governing_law.value,
|
|
163
|
+
clauses: ($e[0].clauses | length), decision: $r[0].decision, risk: $r[0].risk_score}'
|
|
164
|
+
|
|
165
|
+
# 4) Triage a folder of inbound contracts: governing law + parties per file.
|
|
166
|
+
for f in inbox/*; do
|
|
146
167
|
extract "$f" --fields parties,governing_law --no-confidence \
|
|
147
|
-
| jq -c '{file:
|
|
168
|
+
| jq -c --arg f "$f" '{file: $f, gov: .governing_law, parties: [.parties[].name]}'
|
|
148
169
|
done
|
|
149
170
|
|
|
150
|
-
# 5) Gate a workflow on extraction confidence.
|
|
171
|
+
# 5) Gate a workflow on extraction confidence (non-zero exit if any clause is shaky).
|
|
151
172
|
extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)' && echo "ok to review"
|
|
152
173
|
```
|
|
153
174
|
|
|
154
|
-
> The
|
|
155
|
-
>
|
|
156
|
-
>
|
|
157
|
-
> versioning commitment on the schema.
|
|
175
|
+
> The integration contract is the **output schema** and the **canonical clause
|
|
176
|
+
> vocabulary**, not per-tool flags. See [`docs/INTEROP.md`](docs/INTEROP.md) for
|
|
177
|
+
> the shared conventions and the schema's versioning commitment.
|
|
158
178
|
|
|
159
179
|
## LLM configuration (opt-in)
|
|
160
180
|
|
|
@@ -43,11 +43,11 @@ import urllib.request
|
|
|
43
43
|
from pathlib import Path
|
|
44
44
|
from typing import Any, Dict, List, Optional, Tuple
|
|
45
45
|
|
|
46
|
-
__version__ = "0.1.4"
|
|
46
|
+
__version__ = "0.1.6"
|
|
47
47
|
|
|
48
48
|
# Bumped independently of the package version when the *extraction logic*
|
|
49
49
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
50
|
-
EXTRACTOR_VERSION = "0.1.
|
|
50
|
+
EXTRACTOR_VERSION = "0.1.6"
|
|
51
51
|
|
|
52
52
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
53
53
|
SCHEMA_VERSION = 1
|
|
@@ -1275,13 +1275,27 @@ def load_llm_config() -> Optional[JSON]:
|
|
|
1275
1275
|
return None
|
|
1276
1276
|
|
|
1277
1277
|
|
|
1278
|
-
|
|
1279
|
-
"
|
|
1280
|
-
"
|
|
1281
|
-
"obligations (array of short strings, max 5), governing_law (string or "
|
|
1282
|
-
"null). Base answers strictly on the text. No prose, JSON only.\n\n"
|
|
1283
|
-
"CONTRACT:\n"
|
|
1278
|
+
_LLM_PROMPT_KEYS = (
|
|
1279
|
+
"renewal_mechanics (string or null), obligations (array of short strings, "
|
|
1280
|
+
"max 5), governing_law (string or null)"
|
|
1284
1281
|
)
|
|
1282
|
+
# Requested only when the deterministic clause cascade found nothing (e.g. a
|
|
1283
|
+
# DOCX that auto-numbers with no heading style): ask the model for the section
|
|
1284
|
+
# headings so we can still produce a clause map.
|
|
1285
|
+
_LLM_PROMPT_CLAUSES = (
|
|
1286
|
+
", clauses (array, max 40, of objects {\"title\": \"<the section/clause "
|
|
1287
|
+
"heading, verbatim if possible>\"} in document order, top-level sections "
|
|
1288
|
+
"only)"
|
|
1289
|
+
)
|
|
1290
|
+
|
|
1291
|
+
|
|
1292
|
+
def _build_llm_prompt(text: str, want_clauses: bool) -> str:
|
|
1293
|
+
keys = _LLM_PROMPT_KEYS + (_LLM_PROMPT_CLAUSES if want_clauses else "")
|
|
1294
|
+
return (
|
|
1295
|
+
"You are a contract-extraction assistant. Given the contract text, "
|
|
1296
|
+
"return ONLY a compact JSON object with keys: " + keys + ". Base answers "
|
|
1297
|
+
"strictly on the text. No prose, JSON only.\n\nCONTRACT:\n" + text[:16000]
|
|
1298
|
+
)
|
|
1285
1299
|
|
|
1286
1300
|
|
|
1287
1301
|
def _llm_request(cfg: JSON, prompt: str, timeout: float = 30.0) -> Optional[str]:
|
|
@@ -1337,8 +1351,44 @@ def _extract_json_object(s: str) -> Optional[JSON]:
|
|
|
1337
1351
|
return None
|
|
1338
1352
|
|
|
1339
1353
|
|
|
1354
|
+
def _llm_clause_map(raw: Any, text: str) -> List[JSON]:
|
|
1355
|
+
"""Convert LLM-returned clause titles into schema-conformant clause objects.
|
|
1356
|
+
Titles are canonicalized through the same suite vocabulary the deterministic
|
|
1357
|
+
path uses, located in the document for a best-effort span, and marked
|
|
1358
|
+
tier/source = 'llm' with a modest confidence (verify, not trust)."""
|
|
1359
|
+
if not isinstance(raw, list):
|
|
1360
|
+
return []
|
|
1361
|
+
low = text.lower()
|
|
1362
|
+
out: List[JSON] = []
|
|
1363
|
+
seen: set[str] = set()
|
|
1364
|
+
for item in raw[:40]:
|
|
1365
|
+
title: Any = item.get("title") if isinstance(item, dict) else item
|
|
1366
|
+
if not isinstance(title, str) or not title.strip():
|
|
1367
|
+
continue
|
|
1368
|
+
title = re.sub(r"\s+", " ", title.strip())
|
|
1369
|
+
key = _norm_clause_key(title)
|
|
1370
|
+
if not key or key in seen or _is_noise_clause_title(title):
|
|
1371
|
+
continue
|
|
1372
|
+
seen.add(key)
|
|
1373
|
+
canonical, mapped = _canonicalize_clause(title)
|
|
1374
|
+
idx = low.find(title.lower())
|
|
1375
|
+
span = ({"start": idx, "end": min(idx + len(title), len(text))}
|
|
1376
|
+
if idx >= 0 else {"start": 0, "end": 0})
|
|
1377
|
+
out.append({
|
|
1378
|
+
"canonical_title": canonical,
|
|
1379
|
+
"detected_title": title,
|
|
1380
|
+
"tier": "llm",
|
|
1381
|
+
"span": span,
|
|
1382
|
+
"confidence": 0.5,
|
|
1383
|
+
"source": "llm",
|
|
1384
|
+
"mapped": mapped,
|
|
1385
|
+
})
|
|
1386
|
+
return out
|
|
1387
|
+
|
|
1388
|
+
|
|
1340
1389
|
def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
|
|
1341
|
-
"""Opt-in enrichment of fuzzy fields
|
|
1390
|
+
"""Opt-in enrichment of fuzzy fields, plus a clause-map fallback when the
|
|
1391
|
+
deterministic cascade found no clauses. Mutates `result` in place. Any
|
|
1342
1392
|
failure (no config, network error, bad JSON) degrades gracefully: a warning
|
|
1343
1393
|
to stderr and the deterministic output is left untouched."""
|
|
1344
1394
|
cfg = load_llm_config()
|
|
@@ -1346,7 +1396,8 @@ def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
|
|
|
1346
1396
|
_warn(args_ns, "no LLM config found (~/.config/contract-ops/llm.json or "
|
|
1347
1397
|
"./config/llm.json); skipping --llm enrichment")
|
|
1348
1398
|
return
|
|
1349
|
-
|
|
1399
|
+
want_clauses = not result["clauses"]
|
|
1400
|
+
prompt = _build_llm_prompt(text, want_clauses)
|
|
1350
1401
|
try:
|
|
1351
1402
|
raw = _llm_request(cfg, prompt)
|
|
1352
1403
|
except (urllib.error.URLError, TimeoutError, OSError, ValueError) as e:
|
|
@@ -1376,6 +1427,11 @@ def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
|
|
|
1376
1427
|
if isinstance(gl, str) and gl.strip() and result["governing_law"]["source"] == "none":
|
|
1377
1428
|
result["governing_law"] = _field(gl.strip(), 0.6, "llm")
|
|
1378
1429
|
enriched = True
|
|
1430
|
+
if want_clauses:
|
|
1431
|
+
cmap = _llm_clause_map(obj.get("clauses"), text)
|
|
1432
|
+
if cmap:
|
|
1433
|
+
result["clauses"] = cmap
|
|
1434
|
+
enriched = True
|
|
1379
1435
|
|
|
1380
1436
|
result["_meta"]["llm_used"] = True
|
|
1381
1437
|
if enriched and "llm" not in result["_meta"]["tiers_used"]:
|
|
@@ -1658,7 +1714,8 @@ FIELD_CATALOG: Tuple[Tuple[str, str, str], ...] = (
|
|
|
1658
1714
|
("term.notice_period_days", "deterministic", "Notice period in days, best-effort"),
|
|
1659
1715
|
("term.auto_renew", "deterministic", "Auto-renewal flag, best-effort"),
|
|
1660
1716
|
("governing_law", "deterministic", "Governing law / jurisdiction"),
|
|
1661
|
-
("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary"),
|
|
1717
|
+
("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary "
|
|
1718
|
+
"(LLM fallback under --llm when no headings are detected)"),
|
|
1662
1719
|
("defined_terms", "deterministic", "Defined-term inventory (quoted / parenthetical)"),
|
|
1663
1720
|
("value", "deterministic", "Headline monetary value"),
|
|
1664
1721
|
("term.renewal_mechanics", "llm", "Renewal mechanics (fuzzy; --llm only)"),
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.4"
|
|
7
|
+
version = "0.1.6"
|
|
8
8
|
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -67,6 +67,41 @@ def test_enrich_fills_only_missing_governing_law(monkeypatch: pytest.MonkeyPatch
|
|
|
67
67
|
assert result["governing_law"] == {"value": "France", "confidence": 0.6, "source": "llm"}
|
|
68
68
|
|
|
69
69
|
|
|
70
|
+
def test_llm_clause_fallback_when_deterministic_empty(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
71
|
+
from tests._schema_validator import validate
|
|
72
|
+
monkeypatch.setattr(ex, "load_llm_config",
|
|
73
|
+
lambda: {"provider": "anthropic", "api_key": "x"})
|
|
74
|
+
monkeypatch.setattr(ex, "_llm_request", lambda cfg, prompt, timeout=30.0: json.dumps(
|
|
75
|
+
{"clauses": [{"title": "Confidentiality"}, {"title": "Governing Law"},
|
|
76
|
+
{"title": "Special Widget Terms"}]}))
|
|
77
|
+
# A document with no detectable clause headings -> 0 deterministic clauses.
|
|
78
|
+
text = ("This Agreement is made between Acme Co and Beta Co. The parties agree "
|
|
79
|
+
"to maintain confidentiality. Governed by the laws of Delaware.")
|
|
80
|
+
result = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
|
|
81
|
+
assert result["clauses"] == []
|
|
82
|
+
ex.llm_enrich(result, text, _ns())
|
|
83
|
+
cl = result["clauses"]
|
|
84
|
+
assert [c["canonical_title"] for c in cl] == ["Confidentiality", "Governing Law", "Special Widget Terms"]
|
|
85
|
+
assert all(c["tier"] == "llm" and c["source"] == "llm" for c in cl)
|
|
86
|
+
assert cl[0]["mapped"] is True and cl[2]["mapped"] is False
|
|
87
|
+
assert result["_meta"]["llm_used"] is True and "llm" in result["_meta"]["tiers_used"]
|
|
88
|
+
assert validate(result, ex.output_schema()) == [] # llm clauses are schema-conformant
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def test_llm_does_not_replace_deterministic_clauses(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
92
|
+
monkeypatch.setattr(ex, "load_llm_config",
|
|
93
|
+
lambda: {"provider": "anthropic", "api_key": "x"})
|
|
94
|
+
monkeypatch.setattr(ex, "_llm_request", lambda cfg, prompt, timeout=30.0: json.dumps(
|
|
95
|
+
{"clauses": [{"title": "Should Not Appear"}]}))
|
|
96
|
+
text = ex.DEMO_DOCUMENT # has H2 clauses
|
|
97
|
+
result = ex.build_extraction(text, text.encode("utf-8"), "markdown", "d.md")
|
|
98
|
+
assert result["clauses"] and all(c["tier"] == "h2" for c in result["clauses"])
|
|
99
|
+
ex.llm_enrich(result, text, _ns())
|
|
100
|
+
# Deterministic clauses are kept; the LLM clause was never requested/used.
|
|
101
|
+
assert all(c["tier"] == "h2" for c in result["clauses"])
|
|
102
|
+
assert not any(c["detected_title"] == "Should Not Appear" for c in result["clauses"])
|
|
103
|
+
|
|
104
|
+
|
|
70
105
|
def test_request_error_degrades(monkeypatch: pytest.MonkeyPatch,
|
|
71
106
|
capsys: pytest.CaptureFixture[str]) -> None:
|
|
72
107
|
monkeypatch.setattr(ex, "load_llm_config",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|