extract-cli 0.1.4__tar.gz → 0.1.6__tar.gz

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {extract_cli-0.1.4 → extract_cli-0.1.6}/ARCHITECTURE.md +6 -1
  2. {extract_cli-0.1.4 → extract_cli-0.1.6}/CHANGELOG.md +34 -0
  3. {extract_cli-0.1.4 → extract_cli-0.1.6}/PKG-INFO +44 -24
  4. {extract_cli-0.1.4 → extract_cli-0.1.6}/README.md +43 -23
  5. {extract_cli-0.1.4 → extract_cli-0.1.6}/extract_cli.py +68 -11
  6. {extract_cli-0.1.4 → extract_cli-0.1.6}/pyproject.toml +1 -1
  7. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/employment_docx.docx.expected.json +1 -1
  8. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/heading_docx.docx.expected.json +1 -1
  9. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  10. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  11. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/nda_h2.md.expected.json +1 -1
  12. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/scanned.pdf.expected.json +1 -1
  13. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/services_bold.txt.expected.json +1 -1
  14. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/services_html.html.expected.json +1 -1
  15. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_llm.py +35 -0
  16. {extract_cli-0.1.4 → extract_cli-0.1.6}/.gitignore +0 -0
  17. {extract_cli-0.1.4 → extract_cli-0.1.6}/CONTRIBUTING.md +0 -0
  18. {extract_cli-0.1.4 → extract_cli-0.1.6}/LICENSE +0 -0
  19. {extract_cli-0.1.4 → extract_cli-0.1.6}/Makefile +0 -0
  20. {extract_cli-0.1.4 → extract_cli-0.1.6}/config/llm.json.example +0 -0
  21. {extract_cli-0.1.4 → extract_cli-0.1.6}/docs/INTEROP.md +0 -0
  22. {extract_cli-0.1.4 → extract_cli-0.1.6}/docs/spec/extract-output.schema.json +0 -0
  23. {extract_cli-0.1.4 → extract_cli-0.1.6}/scripts/release.py +0 -0
  24. {extract_cli-0.1.4 → extract_cli-0.1.6}/scripts/validate_against_spec.py +0 -0
  25. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/_fixtures_build.py +0 -0
  26. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/_make_goldens.py +0 -0
  27. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/_schema_validator.py +0 -0
  28. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/conftest.py +0 -0
  29. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/employment_docx.docx +0 -0
  30. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/heading_docx.docx +0 -0
  31. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/lease_allcaps.txt +0 -0
  32. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/license_pdf.pdf +0 -0
  33. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/nda_h2.md +0 -0
  34. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/scanned.pdf +0 -0
  35. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/services_bold.txt +0 -0
  36. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/fixtures/services_html.html +0 -0
  37. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_clause_map.py +0 -0
  38. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_cli.py +0 -0
  39. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_deterministic.py +0 -0
  40. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_misc.py +0 -0
  41. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_property.py +0 -0
  42. {extract_cli-0.1.4 → extract_cli-0.1.6}/tests/test_schema_conformance.py +0 -0
@@ -80,7 +80,12 @@ endpoint. Any failure (no config, network error, unparseable JSON) is caught:
80
80
  a warning to stderr, deterministic output untouched. The LLM only *adds* fuzzy
81
81
  fields (`term.renewal_mechanics`, `obligations`) and fills `governing_law` only
82
82
  when the deterministic tier found nothing — it never overwrites a deterministic
83
- value.
83
+ value. As a **clause-map fallback**, when the deterministic cascade returned no
84
+ clauses the LLM is asked for the section headings (the clause keys are added to
85
+ the prompt only then); the titles are normalized through the same
86
+ `_canonicalize_clause` vocabulary, located in the text for a best-effort span,
87
+ and emitted with `tier: "llm"` / `source: "llm"`. This covers DOCX that
88
+ auto-number with no heading style (their numbers live only in `numbering.xml`).
84
89
 
85
90
  ## The output contract
86
91
 
@@ -6,6 +6,38 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.6] - 2026-05-21
10
+
11
+ ### Docs
12
+ - **Rewrote the README composability section to verified, runnable examples.**
13
+ Testing extract-cli against the real sibling CLIs (`template-vault-cli`,
14
+ `nda-review-cli`) showed the previous pipes were aspirational — the siblings
15
+ expose no `--from-extract`/`--stdin` flag (`nda-review review` takes
16
+ `--file`/`--text`; `template-vault` reads its own vault). The integration
17
+ contract is the **output schema + the shared canonical clause vocabulary**,
18
+ glued by stdout JSON and standard tools (`jq`, `comm`): `extract`'s
19
+ `canonical_title` values are the same names template-vault detects and
20
+ nda-review keys policy on, so a foreign document's clauses line up with the
21
+ suite's with no bespoke adapter. New examples cover clause-coverage gap
22
+ analysis against a vault template and a combined extract+nda-review intake
23
+ report — all runnable today. (Also fixed a broken `jq input_filename` in the
24
+ folder-triage example.) No code or schema change.
25
+
26
+ ## [0.1.5] - 2026-05-21
27
+
28
+ ### Added
29
+ - **LLM clause-map fallback** (opt-in, `--llm` only). When the deterministic
30
+ cascade detects no clauses — e.g. a `.docx` that auto-numbers via Word's
31
+ numbering with no heading style, the limitation noted in 0.1.4 — the LLM is
32
+ asked for the section headings (the clause request is added to the prompt
33
+ only in that case). Returned titles are normalized through the same canonical
34
+ vocabulary as the deterministic path, located in the document for a
35
+ best-effort span, and emitted with `tier: "llm"`, `source: "llm"`, and a
36
+ modest confidence. The LLM is never consulted for clauses the deterministic
37
+ cascade already found, and the deterministic core remains fully useful with
38
+ no LLM. No schema change (the clause `tier`/`source` enums already allow
39
+ `llm`).
40
+
9
41
  ## [0.1.4] - 2026-05-21
10
42
 
11
43
  DOCX clause detection, driven by testing against 20 real `.docx` contracts
@@ -166,6 +198,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
166
198
  intentionally *not* governed by the output schema (the schema describes the
167
199
  full default output).
168
200
 
201
+ [0.1.6]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.6
202
+ [0.1.5]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.5
169
203
  [0.1.4]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.4
170
204
  [0.1.3]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.3
171
205
  [0.1.2]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.2
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.4
3
+ Version: 0.1.6
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -102,6 +102,15 @@ opt-in, never in a hot path, and gated behind an explicit flag and a config
102
102
  file — if no config is present, `--llm` degrades gracefully with a warning and
103
103
  you still get the full deterministic output.
104
104
 
105
+ **Clause-map fallback.** Some documents (e.g. `.docx` that auto-number clauses
106
+ via Word's numbering with no heading style) carry no signal the deterministic
107
+ cascade can see, so its clause map comes back empty. When `--llm` is set *and*
108
+ no clauses were detected, the LLM is asked for the section headings; the result
109
+ is normalized through the same canonical vocabulary and emitted with
110
+ `tier: "llm"`, `source: "llm"`, and a modest confidence (verify, not trust).
111
+ When the deterministic cascade already found clauses, the LLM is not consulted
112
+ for them.
113
+
105
114
  ## Commands
106
115
 
107
116
  ```bash
@@ -162,37 +171,48 @@ extract counterparty.pdf | jq '.clauses[] | {canonical_title, detected_title, ma
162
171
 
163
172
  ## Composability — piping into the rest of the suite
164
173
 
165
- `extract-cli` is built to be the first stage of a Unix pipe. Its JSON is the
166
- contract every downstream tool reads.
174
+ `extract-cli` is built to be the first stage of a Unix pipe. The glue is its
175
+ **stdout JSON + standard tools** (`jq`, `comm`) and the **shared clause
176
+ vocabulary** — `extract`'s `canonical_title` values are the same names
177
+ `template-vault-cli` detects and `nda-review-cli` keys policy on, so a foreign
178
+ document's clauses line up with the suite's with no bespoke adapter. Every
179
+ example below is runnable today (verified against the real sibling CLIs).
167
180
 
168
181
  ```bash
169
- # 1) Foreign NDA review. extract normalizes clauses; nda-review runs policy.
170
- extract counterparty_nda.pdf | nda-review review --from-extract -
171
-
172
- # 2) Pull just the clause map and feed compare-cli to diff a foreign doc
173
- # against your canonical template's structure.
174
- extract their_msa.docx --fields clauses | compare-cli align --stdin \
175
- --against msa/standard
176
-
177
- # 3) Archive structured metadata for any inbound paper into the post-signature
178
- # vault, keyed by content hash.
179
- extract signed_contract.pdf | contract-vault put --from-extract - \
180
- --id "$(extract signed_contract.pdf | jq -r .document.sha256)"
181
-
182
- # 4) Triage a folder of inbound contracts: list governing law + parties.
183
- for f in inbox/*.pdf; do
182
+ # 1) Inspect any contract's structure (.md/.txt/.html/.docx/.pdf, one tool).
183
+ extract counterparty.docx | jq '{parties: [.parties[].name],
184
+ governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
185
+
186
+ # 2) Clause-coverage gap vs your canonical template in template-vault-cli.
187
+ # extract normalizes the counterparty's *foreign* headings onto the same
188
+ # clause vocabulary template-vault detects, so a plain `comm` diffs them.
189
+ template-vault info nda/mutual-standard --json | jq -r '.clauses[].title' | sort > ours.txt
190
+ extract counterparty_nda.docx | jq -r '.clauses[].canonical_title' | sort -u > theirs.txt
191
+ comm -23 ours.txt theirs.txt # clauses in OUR standard that THEY are missing
192
+ comm -13 ours.txt theirs.txt # clauses THEY added that we don't have
193
+
194
+ # 3) Intake: extract for structure, nda-review-cli for a policy verdict on the
195
+ # same foreign doc; merge both views with jq.
196
+ extract counterparty_nda.docx > extract.json
197
+ nda-review review --file counterparty_nda.docx --playbook output/nda_playbook.json \
198
+ --out-json review.json
199
+ jq -n --slurpfile e extract.json --slurpfile r review.json \
200
+ '{parties: [$e[0].parties[].name], governing_law: $e[0].governing_law.value,
201
+ clauses: ($e[0].clauses | length), decision: $r[0].decision, risk: $r[0].risk_score}'
202
+
203
+ # 4) Triage a folder of inbound contracts: governing law + parties per file.
204
+ for f in inbox/*; do
184
205
  extract "$f" --fields parties,governing_law --no-confidence \
185
- | jq -c '{file: input_filename, gov: .governing_law, parties: [.parties[].name]}'
206
+ | jq -c --arg f "$f" '{file: $f, gov: .governing_law, parties: [.parties[].name]}'
186
207
  done
187
208
 
188
- # 5) Gate a workflow on extraction confidence.
209
+ # 5) Gate a workflow on extraction confidence (non-zero exit if any clause is shaky).
189
210
  extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)' && echo "ok to review"
190
211
  ```
191
212
 
192
- > The `--from-extract`/`--stdin` flags above are the consumption points the
193
- > sibling CLIs expose (or are adopting) for this contract; see
194
- > [`docs/INTEROP.md`](docs/INTEROP.md) for the shared conventions and the
195
- > versioning commitment on the schema.
213
+ > The integration contract is the **output schema** and the **canonical clause
214
+ > vocabulary**, not per-tool flags. See [`docs/INTEROP.md`](docs/INTEROP.md) for
215
+ > the shared conventions and the schema's versioning commitment.
196
216
 
197
217
  ## LLM configuration (opt-in)
198
218
 
@@ -64,6 +64,15 @@ opt-in, never in a hot path, and gated behind an explicit flag and a config
64
64
  file — if no config is present, `--llm` degrades gracefully with a warning and
65
65
  you still get the full deterministic output.
66
66
 
67
+ **Clause-map fallback.** Some documents (e.g. `.docx` that auto-number clauses
68
+ via Word's numbering with no heading style) carry no signal the deterministic
69
+ cascade can see, so its clause map comes back empty. When `--llm` is set *and*
70
+ no clauses were detected, the LLM is asked for the section headings; the result
71
+ is normalized through the same canonical vocabulary and emitted with
72
+ `tier: "llm"`, `source: "llm"`, and a modest confidence (verify, not trust).
73
+ When the deterministic cascade already found clauses, the LLM is not consulted
74
+ for them.
75
+
67
76
  ## Commands
68
77
 
69
78
  ```bash
@@ -124,37 +133,48 @@ extract counterparty.pdf | jq '.clauses[] | {canonical_title, detected_title, ma
124
133
 
125
134
  ## Composability — piping into the rest of the suite
126
135
 
127
- `extract-cli` is built to be the first stage of a Unix pipe. Its JSON is the
128
- contract every downstream tool reads.
136
+ `extract-cli` is built to be the first stage of a Unix pipe. The glue is its
137
+ **stdout JSON + standard tools** (`jq`, `comm`) and the **shared clause
138
+ vocabulary** — `extract`'s `canonical_title` values are the same names
139
+ `template-vault-cli` detects and `nda-review-cli` keys policy on, so a foreign
140
+ document's clauses line up with the suite's with no bespoke adapter. Every
141
+ example below is runnable today (verified against the real sibling CLIs).
129
142
 
130
143
  ```bash
131
- # 1) Foreign NDA review. extract normalizes clauses; nda-review runs policy.
132
- extract counterparty_nda.pdf | nda-review review --from-extract -
133
-
134
- # 2) Pull just the clause map and feed compare-cli to diff a foreign doc
135
- # against your canonical template's structure.
136
- extract their_msa.docx --fields clauses | compare-cli align --stdin \
137
- --against msa/standard
138
-
139
- # 3) Archive structured metadata for any inbound paper into the post-signature
140
- # vault, keyed by content hash.
141
- extract signed_contract.pdf | contract-vault put --from-extract - \
142
- --id "$(extract signed_contract.pdf | jq -r .document.sha256)"
143
-
144
- # 4) Triage a folder of inbound contracts: list governing law + parties.
145
- for f in inbox/*.pdf; do
144
+ # 1) Inspect any contract's structure (.md/.txt/.html/.docx/.pdf, one tool).
145
+ extract counterparty.docx | jq '{parties: [.parties[].name],
146
+ governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
147
+
148
+ # 2) Clause-coverage gap vs your canonical template in template-vault-cli.
149
+ # extract normalizes the counterparty's *foreign* headings onto the same
150
+ # clause vocabulary template-vault detects, so a plain `comm` diffs them.
151
+ template-vault info nda/mutual-standard --json | jq -r '.clauses[].title' | sort > ours.txt
152
+ extract counterparty_nda.docx | jq -r '.clauses[].canonical_title' | sort -u > theirs.txt
153
+ comm -23 ours.txt theirs.txt # clauses in OUR standard that THEY are missing
154
+ comm -13 ours.txt theirs.txt # clauses THEY added that we don't have
155
+
156
+ # 3) Intake: extract for structure, nda-review-cli for a policy verdict on the
157
+ # same foreign doc; merge both views with jq.
158
+ extract counterparty_nda.docx > extract.json
159
+ nda-review review --file counterparty_nda.docx --playbook output/nda_playbook.json \
160
+ --out-json review.json
161
+ jq -n --slurpfile e extract.json --slurpfile r review.json \
162
+ '{parties: [$e[0].parties[].name], governing_law: $e[0].governing_law.value,
163
+ clauses: ($e[0].clauses | length), decision: $r[0].decision, risk: $r[0].risk_score}'
164
+
165
+ # 4) Triage a folder of inbound contracts: governing law + parties per file.
166
+ for f in inbox/*; do
146
167
  extract "$f" --fields parties,governing_law --no-confidence \
147
- | jq -c '{file: input_filename, gov: .governing_law, parties: [.parties[].name]}'
168
+ | jq -c --arg f "$f" '{file: $f, gov: .governing_law, parties: [.parties[].name]}'
148
169
  done
149
170
 
150
- # 5) Gate a workflow on extraction confidence.
171
+ # 5) Gate a workflow on extraction confidence (non-zero exit if any clause is shaky).
151
172
  extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)' && echo "ok to review"
152
173
  ```
153
174
 
154
- > The `--from-extract`/`--stdin` flags above are the consumption points the
155
- > sibling CLIs expose (or are adopting) for this contract; see
156
- > [`docs/INTEROP.md`](docs/INTEROP.md) for the shared conventions and the
157
- > versioning commitment on the schema.
175
+ > The integration contract is the **output schema** and the **canonical clause
176
+ > vocabulary**, not per-tool flags. See [`docs/INTEROP.md`](docs/INTEROP.md) for
177
+ > the shared conventions and the schema's versioning commitment.
158
178
 
159
179
  ## LLM configuration (opt-in)
160
180
 
@@ -43,11 +43,11 @@ import urllib.request
43
43
  from pathlib import Path
44
44
  from typing import Any, Dict, List, Optional, Tuple
45
45
 
46
- __version__ = "0.1.4"
46
+ __version__ = "0.1.6"
47
47
 
48
48
  # Bumped independently of the package version when the *extraction logic*
49
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
50
- EXTRACTOR_VERSION = "0.1.4"
50
+ EXTRACTOR_VERSION = "0.1.6"
51
51
 
52
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
53
53
  SCHEMA_VERSION = 1
@@ -1275,13 +1275,27 @@ def load_llm_config() -> Optional[JSON]:
1275
1275
  return None
1276
1276
 
1277
1277
 
1278
- _LLM_PROMPT = (
1279
- "You are a contract-extraction assistant. Given the contract text, return "
1280
- "ONLY a compact JSON object with keys: renewal_mechanics (string or null), "
1281
- "obligations (array of short strings, max 5), governing_law (string or "
1282
- "null). Base answers strictly on the text. No prose, JSON only.\n\n"
1283
- "CONTRACT:\n"
1278
+ _LLM_PROMPT_KEYS = (
1279
+ "renewal_mechanics (string or null), obligations (array of short strings, "
1280
+ "max 5), governing_law (string or null)"
1284
1281
  )
1282
+ # Requested only when the deterministic clause cascade found nothing (e.g. a
1283
+ # DOCX that auto-numbers with no heading style): ask the model for the section
1284
+ # headings so we can still produce a clause map.
1285
+ _LLM_PROMPT_CLAUSES = (
1286
+ ", clauses (array, max 40, of objects {\"title\": \"<the section/clause "
1287
+ "heading, verbatim if possible>\"} in document order, top-level sections "
1288
+ "only)"
1289
+ )
1290
+
1291
+
1292
+ def _build_llm_prompt(text: str, want_clauses: bool) -> str:
1293
+ keys = _LLM_PROMPT_KEYS + (_LLM_PROMPT_CLAUSES if want_clauses else "")
1294
+ return (
1295
+ "You are a contract-extraction assistant. Given the contract text, "
1296
+ "return ONLY a compact JSON object with keys: " + keys + ". Base answers "
1297
+ "strictly on the text. No prose, JSON only.\n\nCONTRACT:\n" + text[:16000]
1298
+ )
1285
1299
 
1286
1300
 
1287
1301
  def _llm_request(cfg: JSON, prompt: str, timeout: float = 30.0) -> Optional[str]:
@@ -1337,8 +1351,44 @@ def _extract_json_object(s: str) -> Optional[JSON]:
1337
1351
  return None
1338
1352
 
1339
1353
 
1354
+ def _llm_clause_map(raw: Any, text: str) -> List[JSON]:
1355
+ """Convert LLM-returned clause titles into schema-conformant clause objects.
1356
+ Titles are canonicalized through the same suite vocabulary the deterministic
1357
+ path uses, located in the document for a best-effort span, and marked
1358
+ tier/source = 'llm' with a modest confidence (verify, not trust)."""
1359
+ if not isinstance(raw, list):
1360
+ return []
1361
+ low = text.lower()
1362
+ out: List[JSON] = []
1363
+ seen: set[str] = set()
1364
+ for item in raw[:40]:
1365
+ title: Any = item.get("title") if isinstance(item, dict) else item
1366
+ if not isinstance(title, str) or not title.strip():
1367
+ continue
1368
+ title = re.sub(r"\s+", " ", title.strip())
1369
+ key = _norm_clause_key(title)
1370
+ if not key or key in seen or _is_noise_clause_title(title):
1371
+ continue
1372
+ seen.add(key)
1373
+ canonical, mapped = _canonicalize_clause(title)
1374
+ idx = low.find(title.lower())
1375
+ span = ({"start": idx, "end": min(idx + len(title), len(text))}
1376
+ if idx >= 0 else {"start": 0, "end": 0})
1377
+ out.append({
1378
+ "canonical_title": canonical,
1379
+ "detected_title": title,
1380
+ "tier": "llm",
1381
+ "span": span,
1382
+ "confidence": 0.5,
1383
+ "source": "llm",
1384
+ "mapped": mapped,
1385
+ })
1386
+ return out
1387
+
1388
+
1340
1389
  def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
1341
- """Opt-in enrichment of fuzzy fields. Mutates `result` in place. Any
1390
+ """Opt-in enrichment of fuzzy fields, plus a clause-map fallback when the
1391
+ deterministic cascade found no clauses. Mutates `result` in place. Any
1342
1392
  failure (no config, network error, bad JSON) degrades gracefully: a warning
1343
1393
  to stderr and the deterministic output is left untouched."""
1344
1394
  cfg = load_llm_config()
@@ -1346,7 +1396,8 @@ def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
1346
1396
  _warn(args_ns, "no LLM config found (~/.config/contract-ops/llm.json or "
1347
1397
  "./config/llm.json); skipping --llm enrichment")
1348
1398
  return
1349
- prompt = _LLM_PROMPT + text[:12000]
1399
+ want_clauses = not result["clauses"]
1400
+ prompt = _build_llm_prompt(text, want_clauses)
1350
1401
  try:
1351
1402
  raw = _llm_request(cfg, prompt)
1352
1403
  except (urllib.error.URLError, TimeoutError, OSError, ValueError) as e:
@@ -1376,6 +1427,11 @@ def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
1376
1427
  if isinstance(gl, str) and gl.strip() and result["governing_law"]["source"] == "none":
1377
1428
  result["governing_law"] = _field(gl.strip(), 0.6, "llm")
1378
1429
  enriched = True
1430
+ if want_clauses:
1431
+ cmap = _llm_clause_map(obj.get("clauses"), text)
1432
+ if cmap:
1433
+ result["clauses"] = cmap
1434
+ enriched = True
1379
1435
 
1380
1436
  result["_meta"]["llm_used"] = True
1381
1437
  if enriched and "llm" not in result["_meta"]["tiers_used"]:
@@ -1658,7 +1714,8 @@ FIELD_CATALOG: Tuple[Tuple[str, str, str], ...] = (
1658
1714
  ("term.notice_period_days", "deterministic", "Notice period in days, best-effort"),
1659
1715
  ("term.auto_renew", "deterministic", "Auto-renewal flag, best-effort"),
1660
1716
  ("governing_law", "deterministic", "Governing law / jurisdiction"),
1661
- ("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary"),
1717
+ ("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary "
1718
+ "(LLM fallback under --llm when no headings are detected)"),
1662
1719
  ("defined_terms", "deterministic", "Defined-term inventory (quoted / parenthetical)"),
1663
1720
  ("value", "deterministic", "Headline monetary value"),
1664
1721
  ("term.renewal_mechanics", "llm", "Renewal mechanics (fuzzy; --llm only)"),
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.4"
7
+ version = "0.1.6"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -138,7 +138,7 @@
138
138
  "source": "deterministic"
139
139
  },
140
140
  "_meta": {
141
- "extractor_version": "0.1.4",
141
+ "extractor_version": "0.1.6",
142
142
  "tiers_used": [
143
143
  "deterministic"
144
144
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "none"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.4",
136
+ "extractor_version": "0.1.6",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.4",
136
+ "extractor_version": "0.1.6",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.4",
136
+ "extractor_version": "0.1.6",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -143,7 +143,7 @@
143
143
  "source": "none"
144
144
  },
145
145
  "_meta": {
146
- "extractor_version": "0.1.4",
146
+ "extractor_version": "0.1.6",
147
147
  "tiers_used": [
148
148
  "deterministic"
149
149
  ],
@@ -48,7 +48,7 @@
48
48
  "source": "none"
49
49
  },
50
50
  "_meta": {
51
- "extractor_version": "0.1.4",
51
+ "extractor_version": "0.1.6",
52
52
  "tiers_used": [
53
53
  "deterministic"
54
54
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.4",
136
+ "extractor_version": "0.1.6",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -148,7 +148,7 @@
148
148
  "source": "deterministic"
149
149
  },
150
150
  "_meta": {
151
- "extractor_version": "0.1.4",
151
+ "extractor_version": "0.1.6",
152
152
  "tiers_used": [
153
153
  "deterministic"
154
154
  ],
@@ -67,6 +67,41 @@ def test_enrich_fills_only_missing_governing_law(monkeypatch: pytest.MonkeyPatch
67
67
  assert result["governing_law"] == {"value": "France", "confidence": 0.6, "source": "llm"}
68
68
 
69
69
 
70
+ def test_llm_clause_fallback_when_deterministic_empty(monkeypatch: pytest.MonkeyPatch) -> None:
71
+ from tests._schema_validator import validate
72
+ monkeypatch.setattr(ex, "load_llm_config",
73
+ lambda: {"provider": "anthropic", "api_key": "x"})
74
+ monkeypatch.setattr(ex, "_llm_request", lambda cfg, prompt, timeout=30.0: json.dumps(
75
+ {"clauses": [{"title": "Confidentiality"}, {"title": "Governing Law"},
76
+ {"title": "Special Widget Terms"}]}))
77
+ # A document with no detectable clause headings -> 0 deterministic clauses.
78
+ text = ("This Agreement is made between Acme Co and Beta Co. The parties agree "
79
+ "to maintain confidentiality. Governed by the laws of Delaware.")
80
+ result = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
81
+ assert result["clauses"] == []
82
+ ex.llm_enrich(result, text, _ns())
83
+ cl = result["clauses"]
84
+ assert [c["canonical_title"] for c in cl] == ["Confidentiality", "Governing Law", "Special Widget Terms"]
85
+ assert all(c["tier"] == "llm" and c["source"] == "llm" for c in cl)
86
+ assert cl[0]["mapped"] is True and cl[2]["mapped"] is False
87
+ assert result["_meta"]["llm_used"] is True and "llm" in result["_meta"]["tiers_used"]
88
+ assert validate(result, ex.output_schema()) == [] # llm clauses are schema-conformant
89
+
90
+
91
+ def test_llm_does_not_replace_deterministic_clauses(monkeypatch: pytest.MonkeyPatch) -> None:
92
+ monkeypatch.setattr(ex, "load_llm_config",
93
+ lambda: {"provider": "anthropic", "api_key": "x"})
94
+ monkeypatch.setattr(ex, "_llm_request", lambda cfg, prompt, timeout=30.0: json.dumps(
95
+ {"clauses": [{"title": "Should Not Appear"}]}))
96
+ text = ex.DEMO_DOCUMENT # has H2 clauses
97
+ result = ex.build_extraction(text, text.encode("utf-8"), "markdown", "d.md")
98
+ assert result["clauses"] and all(c["tier"] == "h2" for c in result["clauses"])
99
+ ex.llm_enrich(result, text, _ns())
100
+ # Deterministic clauses are kept; the LLM clause was never requested/used.
101
+ assert all(c["tier"] == "h2" for c in result["clauses"])
102
+ assert not any(c["detected_title"] == "Should Not Appear" for c in result["clauses"])
103
+
104
+
70
105
  def test_request_error_degrades(monkeypatch: pytest.MonkeyPatch,
71
106
  capsys: pytest.CaptureFixture[str]) -> None:
72
107
  monkeypatch.setattr(ex, "load_llm_config",
File without changes
File without changes
File without changes
File without changes
File without changes