extract-cli 0.1.3__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.3 → extract_cli-0.1.5}/ARCHITECTURE.md +6 -1
- {extract_cli-0.1.3 → extract_cli-0.1.5}/CHANGELOG.md +43 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/PKG-INFO +10 -1
- {extract_cli-0.1.3 → extract_cli-0.1.5}/README.md +9 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/extract_cli.py +113 -11
- {extract_cli-0.1.3 → extract_cli-0.1.5}/pyproject.toml +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/_fixtures_build.py +45 -7
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/_make_goldens.py +2 -2
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/conftest.py +1 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/employment_docx.docx.expected.json +2 -2
- extract_cli-0.1.5/tests/fixtures/heading_docx.docx +0 -0
- extract_cli-0.1.5/tests/fixtures/heading_docx.docx.expected.json +142 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/nda_h2.md.expected.json +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/services_bold.txt.expected.json +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/services_html.html.expected.json +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_llm.py +35 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_misc.py +31 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/.gitignore +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/LICENSE +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/Makefile +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/config/llm.json.example +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/docs/INTEROP.md +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/docs/spec/extract-output.schema.json +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/scripts/release.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_clause_map.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_cli.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_deterministic.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_property.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_schema_conformance.py +0 -0
|
@@ -80,7 +80,12 @@ endpoint. Any failure (no config, network error, unparseable JSON) is caught:
|
|
|
80
80
|
a warning to stderr, deterministic output untouched. The LLM only *adds* fuzzy
|
|
81
81
|
fields (`term.renewal_mechanics`, `obligations`) and fills `governing_law` only
|
|
82
82
|
when the deterministic tier found nothing — it never overwrites a deterministic
|
|
83
|
-
value.
|
|
83
|
+
value. As a **clause-map fallback**, when the deterministic cascade returned no
|
|
84
|
+
clauses the LLM is asked for the section headings (the clause keys are added to
|
|
85
|
+
the prompt only then); the titles are normalized through the same
|
|
86
|
+
`_canonicalize_clause` vocabulary, located in the text for a best-effort span,
|
|
87
|
+
and emitted with `tier: "llm"` / `source: "llm"`. This covers DOCX that
|
|
88
|
+
auto-number with no heading style (their numbers live only in `numbering.xml`).
|
|
84
89
|
|
|
85
90
|
## The output contract
|
|
86
91
|
|
|
@@ -6,6 +6,47 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.5] - 2026-05-21
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- **LLM clause-map fallback** (opt-in, `--llm` only). When the deterministic
|
|
13
|
+
cascade detects no clauses — e.g. a `.docx` that auto-numbers via Word's
|
|
14
|
+
numbering with no heading style, the limitation noted in 0.1.4 — the LLM is
|
|
15
|
+
asked for the section headings (the clause request is added to the prompt
|
|
16
|
+
only in that case). Returned titles are normalized through the same canonical
|
|
17
|
+
vocabulary as the deterministic path, located in the document for a
|
|
18
|
+
best-effort span, and emitted with `tier: "llm"`, `source: "llm"`, and a
|
|
19
|
+
modest confidence. The LLM is never consulted for clauses the deterministic
|
|
20
|
+
cascade already found, and the deterministic core remains fully useful with
|
|
21
|
+
no LLM. No schema change (the clause `tier`/`source` enums already allow
|
|
22
|
+
`llm`).
|
|
23
|
+
|
|
24
|
+
## [0.1.4] - 2026-05-21
|
|
25
|
+
|
|
26
|
+
DOCX clause detection, driven by testing against 20 real `.docx` contracts
|
|
27
|
+
(Common Paper / Bonterms / YC templates via open-agreements, plus government
|
|
28
|
+
samples) — the format we expect most.
|
|
29
|
+
|
|
30
|
+
### Fixed
|
|
31
|
+
- **The DOCX reader now honors Word heading styles.** Real Word contracts carry
|
|
32
|
+
their clause structure in `Heading1`–`Heading9`/`Title` paragraph styles with
|
|
33
|
+
*auto-generated* numbers (absent from the raw text), so the prior cascade
|
|
34
|
+
found almost no clauses. Heading-styled paragraphs are now emitted as `##`
|
|
35
|
+
headings (detected by the strongest tier); run-in headings
|
|
36
|
+
(`Payment. Customer will pay …`) are split into title + body, and a full
|
|
37
|
+
sentence that merely carries a heading style is rejected (not a clause).
|
|
38
|
+
Across the 20-doc sample this took heading-styled agreements from ~0 clauses
|
|
39
|
+
to a clean 14–21 distinct suite-vocabulary clauses each.
|
|
40
|
+
- Binary DOCX test fixtures are now generated deterministically (fixed zip
|
|
41
|
+
timestamp) so their sha256 — and the goldens — are stable across regenerations.
|
|
42
|
+
|
|
43
|
+
### Known limitations (documented)
|
|
44
|
+
- DOCX that auto-number clauses via `numbering.xml` with **no heading style and
|
|
45
|
+
no bold lead** (some Bonterms/older templates use a flat `Plain`/`ListParagraph`
|
|
46
|
+
style) still yield no clause map: the heading text carries no detectable
|
|
47
|
+
signal without reconstructing Word's numbering counters. Parties/dates/
|
|
48
|
+
governing-law still extract.
|
|
49
|
+
|
|
9
50
|
## [0.1.3] - 2026-05-21
|
|
10
51
|
|
|
11
52
|
Clause-map de-noising and party cleanup, driven by testing against 10 more
|
|
@@ -140,6 +181,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
140
181
|
intentionally *not* governed by the output schema (the schema describes the
|
|
141
182
|
full default output).
|
|
142
183
|
|
|
184
|
+
[0.1.5]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.5
|
|
185
|
+
[0.1.4]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.4
|
|
143
186
|
[0.1.3]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.3
|
|
144
187
|
[0.1.2]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.2
|
|
145
188
|
[0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -102,6 +102,15 @@ opt-in, never in a hot path, and gated behind an explicit flag and a config
|
|
|
102
102
|
file — if no config is present, `--llm` degrades gracefully with a warning and
|
|
103
103
|
you still get the full deterministic output.
|
|
104
104
|
|
|
105
|
+
**Clause-map fallback.** Some documents (e.g. `.docx` that auto-number clauses
|
|
106
|
+
via Word's numbering with no heading style) carry no signal the deterministic
|
|
107
|
+
cascade can see, so its clause map comes back empty. When `--llm` is set *and*
|
|
108
|
+
no clauses were detected, the LLM is asked for the section headings; the result
|
|
109
|
+
is normalized through the same canonical vocabulary and emitted with
|
|
110
|
+
`tier: "llm"`, `source: "llm"`, and a modest confidence (verify, not trust).
|
|
111
|
+
When the deterministic cascade already found clauses, the LLM is not consulted
|
|
112
|
+
for them.
|
|
113
|
+
|
|
105
114
|
## Commands
|
|
106
115
|
|
|
107
116
|
```bash
|
|
@@ -64,6 +64,15 @@ opt-in, never in a hot path, and gated behind an explicit flag and a config
|
|
|
64
64
|
file — if no config is present, `--llm` degrades gracefully with a warning and
|
|
65
65
|
you still get the full deterministic output.
|
|
66
66
|
|
|
67
|
+
**Clause-map fallback.** Some documents (e.g. `.docx` that auto-number clauses
|
|
68
|
+
via Word's numbering with no heading style) carry no signal the deterministic
|
|
69
|
+
cascade can see, so its clause map comes back empty. When `--llm` is set *and*
|
|
70
|
+
no clauses were detected, the LLM is asked for the section headings; the result
|
|
71
|
+
is normalized through the same canonical vocabulary and emitted with
|
|
72
|
+
`tier: "llm"`, `source: "llm"`, and a modest confidence (verify, not trust).
|
|
73
|
+
When the deterministic cascade already found clauses, the LLM is not consulted
|
|
74
|
+
for them.
|
|
75
|
+
|
|
67
76
|
## Commands
|
|
68
77
|
|
|
69
78
|
```bash
|
|
@@ -43,11 +43,11 @@ import urllib.request
|
|
|
43
43
|
from pathlib import Path
|
|
44
44
|
from typing import Any, Dict, List, Optional, Tuple
|
|
45
45
|
|
|
46
|
-
__version__ = "0.1.3"
|
|
46
|
+
__version__ = "0.1.5"
|
|
47
47
|
|
|
48
48
|
# Bumped independently of the package version when the *extraction logic*
|
|
49
49
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
50
|
-
EXTRACTOR_VERSION = "0.1.
|
|
50
|
+
EXTRACTOR_VERSION = "0.1.5"
|
|
51
51
|
|
|
52
52
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
53
53
|
SCHEMA_VERSION = 1
|
|
@@ -950,6 +950,39 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
|
|
|
950
950
|
return "", warnings
|
|
951
951
|
|
|
952
952
|
|
|
953
|
+
def _docx_paragraph_style(ppr: Any, w: str) -> Optional[str]:
    """Return the style id recorded on a paragraph, or None when it has none.

    `ppr` is the paragraph-properties element (None when the paragraph carries
    no properties) and `w` is the prefix prepended to the `pStyle` tag and the
    `val` attribute names. None is returned when there is no properties
    element, no `pStyle` child, or no `val` attribute on it.
    """
    style_el = None if ppr is None else ppr.find(w + "pStyle")
    if style_el is None:
        return None
    return style_el.get(w + "val")
|
|
958
|
+
|
|
959
|
+
|
|
960
|
+
def _is_heading_style(style: Optional[str]) -> bool:
    """True for Word built-in heading/title styles (Heading1-9, Title, and the
    'H1'/'H2' shorthands). These mark clause headings whose visible numbers are
    auto-generated and absent from the raw text.

    Word's built-in 'TOCHeading', 'TOAHeading' and 'IndexHeading' styles are
    rejected even though their ids contain 'heading': they caption a generated
    table of contents / table of authorities / index, not a clause.

    Returns False for None or an empty style id.
    """
    if not style:
        return False
    s = style.strip().lower()
    # Compare with spaces removed so both the style id ("TOCHeading") and the
    # display name ("TOC Heading") are recognised.
    if s.replace(" ", "") in ("tocheading", "toaheading", "indexheading"):
        return False
    return "heading" in s or s == "title" or bool(re.fullmatch(r"h[1-9]", s))
|
|
968
|
+
|
|
969
|
+
|
|
970
|
+
def _docx_heading_title(text: str) -> Optional[str]:
    """Extract the clause title from the text of a heading-styled paragraph.

    Contracts often use a run-in heading -- 'Performing Services. Contractor
    will ...' -- in which case the title is the lead up to the first '.' or
    ':' that is followed by whitespace and an upper-case letter, '(' or an
    opening quote. A standalone header ('Services & Restrictions') has no such
    break and is taken whole.

    Returns None when the resulting title is longer than 70 characters or more
    than 9 words: that is a full sentence which merely carries a heading style,
    and it would otherwise become a garbage clause title.
    """
    run_in = re.match(r"\s*(.{2,80}?)[.:]\s+[A-Z(\"“]", text)
    candidate = text if run_in is None else run_in.group(1)
    candidate = candidate.strip()
    too_long = len(candidate) > 70
    too_wordy = len(candidate.split()) > 9
    if too_long or too_wordy:
        return None
    return candidate
|
|
984
|
+
|
|
985
|
+
|
|
953
986
|
def _read_docx_stdlib(raw: bytes) -> str:
|
|
954
987
|
import io
|
|
955
988
|
import zipfile
|
|
@@ -962,6 +995,7 @@ def _read_docx_stdlib(raw: bytes) -> str:
|
|
|
962
995
|
paras: List[str] = []
|
|
963
996
|
# iter over w:p in document order (includes paragraphs inside table cells).
|
|
964
997
|
for p in root.iter(w + "p"):
|
|
998
|
+
style = _docx_paragraph_style(p.find(w + "pPr"), w)
|
|
965
999
|
run_texts: List[str] = []
|
|
966
1000
|
any_text = False
|
|
967
1001
|
all_bold = True
|
|
@@ -978,6 +1012,17 @@ def _read_docx_stdlib(raw: bytes) -> str:
|
|
|
978
1012
|
if not line:
|
|
979
1013
|
paras.append("")
|
|
980
1014
|
continue
|
|
1015
|
+
# Word heading styles carry the clause structure (their numbers are
|
|
1016
|
+
# auto-generated, so absent from text). Emit them as H2 so the clause
|
|
1017
|
+
# cascade's strongest tier detects them; keep any run-in body too.
|
|
1018
|
+
if _is_heading_style(style):
|
|
1019
|
+
title = _docx_heading_title(line)
|
|
1020
|
+
if title is not None:
|
|
1021
|
+
paras.append(f"## {title}")
|
|
1022
|
+
if len(title) < len(line):
|
|
1023
|
+
paras.append(line[len(title):].lstrip(" .:\t"))
|
|
1024
|
+
continue
|
|
1025
|
+
# Sentence carrying a heading style -> treat as ordinary body text.
|
|
981
1026
|
if any_text and all_bold:
|
|
982
1027
|
line = f"**{line}**"
|
|
983
1028
|
paras.append(line)
|
|
@@ -1230,15 +1275,29 @@ def load_llm_config() -> Optional[JSON]:
|
|
|
1230
1275
|
return None
|
|
1231
1276
|
|
|
1232
1277
|
|
|
1233
|
-
|
|
1234
|
-
"
|
|
1235
|
-
"
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1278
|
+
# Keys requested from the model on every --llm run.
_LLM_PROMPT_KEYS = (
    "renewal_mechanics (string or null), obligations (array of short strings, "
    "max 5), governing_law (string or null)"
)
# Requested only when the deterministic clause cascade found nothing (e.g. a
# DOCX that auto-numbers with no heading style): ask the model for the section
# headings so we can still produce a clause map.
_LLM_PROMPT_CLAUSES = (
    ", clauses (array, max 40, of objects {\"title\": \"<the section/clause "
    "heading, verbatim if possible>\"} in document order, top-level sections "
    "only)"
)


def _build_llm_prompt(text: str, want_clauses: bool, max_chars: int = 16000) -> str:
    """Build the single-shot extraction prompt sent to the LLM.

    Args:
        text: The contract text; only its first `max_chars` characters are
            embedded in the prompt.
        want_clauses: When true, the `clauses` key description is appended to
            the list of requested keys.
        max_chars: Upper bound on the number of contract characters included.
            Defaults to 16000, the previously hard-coded limit; negative
            values are treated as 0.

    Returns:
        The instruction text followed by the (possibly truncated) contract.
    """
    keys = _LLM_PROMPT_KEYS + (_LLM_PROMPT_CLAUSES if want_clauses else "")
    # Clamp so a negative limit cannot turn the slice into "drop the tail".
    excerpt = text[:max(0, max_chars)]
    return (
        "You are a contract-extraction assistant. Given the contract text, "
        "return ONLY a compact JSON object with keys: " + keys + ". Base answers "
        "strictly on the text. No prose, JSON only.\n\nCONTRACT:\n" + excerpt
    )
|
|
1299
|
+
|
|
1300
|
+
|
|
1242
1301
|
def _llm_request(cfg: JSON, prompt: str, timeout: float = 30.0) -> Optional[str]:
|
|
1243
1302
|
provider = str(cfg.get("provider", "anthropic")).lower()
|
|
1244
1303
|
model = cfg.get("model") or ("claude-sonnet-4-6" if provider == "anthropic" else "gpt-4o-mini")
|
|
@@ -1292,8 +1351,44 @@ def _extract_json_object(s: str) -> Optional[JSON]:
|
|
|
1292
1351
|
return None
|
|
1293
1352
|
|
|
1294
1353
|
|
|
1354
|
+
def _llm_clause_map(raw: Any, text: str) -> List[JSON]:
    """Convert LLM-returned clause titles into schema-conformant clause objects.

    Titles are canonicalized through the same suite vocabulary the deterministic
    path uses, located in the document for a best-effort span, and marked
    tier/source = 'llm' with a modest confidence (verify, not trust).

    Args:
        raw: The model's `clauses` value. Anything that is not a list yields
            []. Items may be `{"title": ...}` objects or bare strings; only
            the first 40 items are considered.
        text: The document text that the returned spans index into.

    Returns:
        One clause object per title, in the order given, skipping blank
        titles, titles whose normalized key was already seen, and titles
        flagged as noise. A title that cannot be found in `text` gets the
        empty span 0..0.
    """
    if not isinstance(raw, list):
        return []
    out: List[JSON] = []
    seen: set[str] = set()
    for item in raw[:40]:
        title: Any = item.get("title") if isinstance(item, dict) else item
        if not isinstance(title, str) or not title.strip():
            continue
        title = re.sub(r"\s+", " ", title.strip())
        key = _norm_clause_key(title)
        if not key or key in seen or _is_noise_clause_title(title):
            continue
        seen.add(key)
        canonical, mapped = _canonicalize_clause(title)
        # Search the original text case-insensitively instead of a lowercased
        # copy: lowercasing can change a string's length (e.g. "İ"), which
        # would shift every offset after that character. The title was
        # whitespace-collapsed above, so each gap is allowed to match any
        # whitespace run in the document (line break, tab, doubled space).
        pattern = re.compile(r"\s+".join(map(re.escape, title.split())),
                             re.IGNORECASE)
        hit = pattern.search(text)
        span = ({"start": hit.start(), "end": hit.end()}
                if hit else {"start": 0, "end": 0})
        out.append({
            "canonical_title": canonical,
            "detected_title": title,
            "tier": "llm",
            "span": span,
            "confidence": 0.5,
            "source": "llm",
            "mapped": mapped,
        })
    return out
|
|
1387
|
+
|
|
1388
|
+
|
|
1295
1389
|
def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
|
|
1296
|
-
"""Opt-in enrichment of fuzzy fields
|
|
1390
|
+
"""Opt-in enrichment of fuzzy fields, plus a clause-map fallback when the
|
|
1391
|
+
deterministic cascade found no clauses. Mutates `result` in place. Any
|
|
1297
1392
|
failure (no config, network error, bad JSON) degrades gracefully: a warning
|
|
1298
1393
|
to stderr and the deterministic output is left untouched."""
|
|
1299
1394
|
cfg = load_llm_config()
|
|
@@ -1301,7 +1396,8 @@ def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
|
|
|
1301
1396
|
_warn(args_ns, "no LLM config found (~/.config/contract-ops/llm.json or "
|
|
1302
1397
|
"./config/llm.json); skipping --llm enrichment")
|
|
1303
1398
|
return
|
|
1304
|
-
|
|
1399
|
+
want_clauses = not result["clauses"]
|
|
1400
|
+
prompt = _build_llm_prompt(text, want_clauses)
|
|
1305
1401
|
try:
|
|
1306
1402
|
raw = _llm_request(cfg, prompt)
|
|
1307
1403
|
except (urllib.error.URLError, TimeoutError, OSError, ValueError) as e:
|
|
@@ -1331,6 +1427,11 @@ def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
|
|
|
1331
1427
|
if isinstance(gl, str) and gl.strip() and result["governing_law"]["source"] == "none":
|
|
1332
1428
|
result["governing_law"] = _field(gl.strip(), 0.6, "llm")
|
|
1333
1429
|
enriched = True
|
|
1430
|
+
if want_clauses:
|
|
1431
|
+
cmap = _llm_clause_map(obj.get("clauses"), text)
|
|
1432
|
+
if cmap:
|
|
1433
|
+
result["clauses"] = cmap
|
|
1434
|
+
enriched = True
|
|
1334
1435
|
|
|
1335
1436
|
result["_meta"]["llm_used"] = True
|
|
1336
1437
|
if enriched and "llm" not in result["_meta"]["tiers_used"]:
|
|
@@ -1613,7 +1714,8 @@ FIELD_CATALOG: Tuple[Tuple[str, str, str], ...] = (
|
|
|
1613
1714
|
("term.notice_period_days", "deterministic", "Notice period in days, best-effort"),
|
|
1614
1715
|
("term.auto_renew", "deterministic", "Auto-renewal flag, best-effort"),
|
|
1615
1716
|
("governing_law", "deterministic", "Governing law / jurisdiction"),
|
|
1616
|
-
("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary"
|
|
1717
|
+
("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary "
|
|
1718
|
+
"(LLM fallback under --llm when no headings are detected)"),
|
|
1617
1719
|
("defined_terms", "deterministic", "Defined-term inventory (quoted / parenthetical)"),
|
|
1618
1720
|
("value", "deterministic", "Headline monetary value"),
|
|
1619
1721
|
("term.renewal_mechanics", "llm", "Renewal mechanics (fuzzy; --llm only)"),
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.3"
|
|
7
|
+
version = "0.1.5"
|
|
8
8
|
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -42,14 +42,36 @@ _DOCX_PARAS = [
|
|
|
42
42
|
_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
def _docx_paragraph(text: str, bold: bool) -> str:
|
|
45
|
+
def _docx_paragraph(text: str, bold: bool = False, style: str = "") -> str:
|
|
46
|
+
ppr = f'<w:pPr><w:pStyle w:val="{style}"/></w:pPr>' if style else ""
|
|
46
47
|
rpr = "<w:rPr><w:b/></w:rPr>" if bold else ""
|
|
47
|
-
return (f"<w:p><w:r>{rpr}"
|
|
48
|
+
return (f"<w:p>{ppr}<w:r>{rpr}"
|
|
48
49
|
f'<w:t xml:space="preserve">{escape(text)}</w:t></w:r></w:p>')
|
|
49
50
|
|
|
50
51
|
|
|
51
|
-
|
|
52
|
-
|
|
52
|
+
# A Word-styled agreement: clause structure carried by Heading1 styles (their
|
|
53
|
+
# numbers are auto-generated, absent from text), including a run-in heading and
|
|
54
|
+
# a full sentence that merely carries the heading style (must be rejected).
|
|
55
|
+
_HEADING_DOCX_PARAS = [
|
|
56
|
+
('Cloud Service Agreement', False, "Title"),
|
|
57
|
+
('This Cloud Service Agreement is entered into as of April 4, 2024, by and '
|
|
58
|
+
'between Initech Software, Inc. (the "Provider") and Globex Corporation '
|
|
59
|
+
'(the "Customer").', False, ""),
|
|
60
|
+
('Confidentiality', False, "Heading1"),
|
|
61
|
+
('Each party will protect the other party’s Confidential Information.', False, ""),
|
|
62
|
+
('Payment. Customer will pay the fees set out in the Order Form within '
|
|
63
|
+
'thirty (30) days.', False, "Heading1"),
|
|
64
|
+
('Term & Termination', False, "Heading1"),
|
|
65
|
+
('The term of this Agreement is two (2) years and will automatically renew '
|
|
66
|
+
'for successive one-year terms.', False, ""),
|
|
67
|
+
('Either party may terminate this Agreement upon material breach that '
|
|
68
|
+
'remains uncured for thirty days after written notice.', False, "Heading1"),
|
|
69
|
+
('Governing Law', False, "Heading1"),
|
|
70
|
+
('This Agreement is governed by the laws of the State of New York.', False, ""),
|
|
71
|
+
]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _docx_package(body: str) -> bytes:
|
|
53
75
|
document = (
|
|
54
76
|
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
|
|
55
77
|
f'<w:document xmlns:w="{_W}"><w:body>{body}<w:sectPr/></w:body></w:document>'
|
|
@@ -70,14 +92,29 @@ def build_docx() -> bytes:
|
|
|
70
92
|
'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" '
|
|
71
93
|
'Target="word/document.xml"/></Relationships>'
|
|
72
94
|
)
|
|
95
|
+
# Deterministic: a fixed timestamp on every entry so regenerating the
|
|
96
|
+
# fixture produces byte-identical output (stable sha256 -> stable goldens).
|
|
73
97
|
buf = io.BytesIO()
|
|
74
98
|
with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z:
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
99
|
+
for name, data in (("[Content_Types].xml", content_types),
|
|
100
|
+
("_rels/.rels", rels),
|
|
101
|
+
("word/document.xml", document)):
|
|
102
|
+
info = zipfile.ZipInfo(name, date_time=(1980, 1, 1, 0, 0, 0))
|
|
103
|
+
info.compress_type = zipfile.ZIP_DEFLATED
|
|
104
|
+
z.writestr(info, data)
|
|
78
105
|
return buf.getvalue()
|
|
79
106
|
|
|
80
107
|
|
|
108
|
+
def build_docx() -> bytes:
    """Package the (text, bold) pairs in `_DOCX_PARAS` as DOCX bytes."""
    body_xml = "".join(
        _docx_paragraph(text, is_bold) for text, is_bold in _DOCX_PARAS
    )
    return _docx_package(body_xml)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def build_heading_docx() -> bytes:
    """Package the (text, bold, style) triples in `_HEADING_DOCX_PARAS` as
    DOCX bytes, passing each triple's style through to its paragraph."""
    paragraphs = [
        _docx_paragraph(text, is_bold, style=style_id)
        for text, is_bold, style_id in _HEADING_DOCX_PARAS
    ]
    return _docx_package("".join(paragraphs))
|
|
116
|
+
|
|
117
|
+
|
|
81
118
|
# --- PDF: a software license with ALL-CAPS headings (Tier 3) ----------------
|
|
82
119
|
|
|
83
120
|
_PDF_TEXT = """SOFTWARE LICENSE AGREEMENT
|
|
@@ -156,6 +193,7 @@ def build_scanned_pdf() -> bytes:
|
|
|
156
193
|
|
|
157
194
|
_BINARY_FIXTURES = {
|
|
158
195
|
"employment_docx.docx": build_docx,
|
|
196
|
+
"heading_docx.docx": build_heading_docx,
|
|
159
197
|
"license_pdf.pdf": build_pdf,
|
|
160
198
|
"scanned.pdf": build_scanned_pdf,
|
|
161
199
|
}
|
|
@@ -20,8 +20,8 @@ from tests._fixtures_build import ensure_binary_fixtures # noqa: E402
|
|
|
20
20
|
FIXTURES = Path(__file__).resolve().parent / "fixtures"
|
|
21
21
|
|
|
22
22
|
DOCS = ["nda_h2.md", "services_bold.txt", "lease_allcaps.txt",
|
|
23
|
-
"employment_docx.docx", "
|
|
24
|
-
"scanned.pdf"]
|
|
23
|
+
"employment_docx.docx", "heading_docx.docx", "license_pdf.pdf",
|
|
24
|
+
"services_html.html", "scanned.pdf"]
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def golden_for(name: str) -> dict:
|
|
@@ -25,6 +25,7 @@ CORPUS: Tuple[Tuple[str, str, str], ...] = (
|
|
|
25
25
|
("services_bold.txt", "bold-numbered", "text"),
|
|
26
26
|
("lease_allcaps.txt", "all-caps", "text"),
|
|
27
27
|
("employment_docx.docx", "bold-numbered", "docx"),
|
|
28
|
+
("heading_docx.docx", "h2", "docx"),
|
|
28
29
|
("license_pdf.pdf", "all-caps", "pdf"),
|
|
29
30
|
("services_html.html", "numbered", "html"),
|
|
30
31
|
)
|
|
Binary file
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"document": {
|
|
3
3
|
"title": "EMPLOYMENT AGREEMENT",
|
|
4
4
|
"format": "docx",
|
|
5
|
-
"sha256": "
|
|
5
|
+
"sha256": "f50e4b9b0cb77250280eb4c26225009de063b5f4a2318e9e53784d3730d20bd1",
|
|
6
6
|
"source_path": "employment_docx.docx"
|
|
7
7
|
},
|
|
8
8
|
"parties": [
|
|
@@ -138,7 +138,7 @@
|
|
|
138
138
|
"source": "deterministic"
|
|
139
139
|
},
|
|
140
140
|
"_meta": {
|
|
141
|
-
"extractor_version": "0.1.
|
|
141
|
+
"extractor_version": "0.1.5",
|
|
142
142
|
"tiers_used": [
|
|
143
143
|
"deterministic"
|
|
144
144
|
],
|
|
Binary file
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
{
|
|
2
|
+
"document": {
|
|
3
|
+
"title": "Cloud Service Agreement",
|
|
4
|
+
"format": "docx",
|
|
5
|
+
"sha256": "23a3b14196cdca6b58d14c7a6836fe28ff6d2be6c2fd852badb03ab6b6e84056",
|
|
6
|
+
"source_path": "heading_docx.docx"
|
|
7
|
+
},
|
|
8
|
+
"parties": [
|
|
9
|
+
{
|
|
10
|
+
"name": "Initech Software, Inc.",
|
|
11
|
+
"confidence": 0.9,
|
|
12
|
+
"source": "deterministic",
|
|
13
|
+
"role": "Provider"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"name": "Globex Corporation",
|
|
17
|
+
"confidence": 0.9,
|
|
18
|
+
"source": "deterministic",
|
|
19
|
+
"role": "Customer"
|
|
20
|
+
}
|
|
21
|
+
],
|
|
22
|
+
"dates": {
|
|
23
|
+
"effective": {
|
|
24
|
+
"value": "2024-04-04",
|
|
25
|
+
"confidence": 0.85,
|
|
26
|
+
"source": "deterministic"
|
|
27
|
+
},
|
|
28
|
+
"expiration": {
|
|
29
|
+
"value": null,
|
|
30
|
+
"confidence": 0.0,
|
|
31
|
+
"source": "none"
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"term": {
|
|
35
|
+
"length": {
|
|
36
|
+
"value": "2 years",
|
|
37
|
+
"confidence": 0.7,
|
|
38
|
+
"source": "deterministic"
|
|
39
|
+
},
|
|
40
|
+
"auto_renew": {
|
|
41
|
+
"value": true,
|
|
42
|
+
"confidence": 0.65,
|
|
43
|
+
"source": "deterministic"
|
|
44
|
+
},
|
|
45
|
+
"notice_period_days": {
|
|
46
|
+
"value": null,
|
|
47
|
+
"confidence": 0.0,
|
|
48
|
+
"source": "none"
|
|
49
|
+
}
|
|
50
|
+
},
|
|
51
|
+
"governing_law": {
|
|
52
|
+
"value": "State of New York",
|
|
53
|
+
"confidence": 0.85,
|
|
54
|
+
"source": "deterministic"
|
|
55
|
+
},
|
|
56
|
+
"clauses": [
|
|
57
|
+
{
|
|
58
|
+
"canonical_title": "Cloud Service Agreement",
|
|
59
|
+
"detected_title": "## Cloud Service Agreement",
|
|
60
|
+
"tier": "h2",
|
|
61
|
+
"span": {
|
|
62
|
+
"start": 0,
|
|
63
|
+
"end": 191
|
|
64
|
+
},
|
|
65
|
+
"confidence": 0.71,
|
|
66
|
+
"source": "deterministic",
|
|
67
|
+
"mapped": false
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
"canonical_title": "Confidentiality",
|
|
71
|
+
"detected_title": "## Confidentiality",
|
|
72
|
+
"tier": "h2",
|
|
73
|
+
"span": {
|
|
74
|
+
"start": 191,
|
|
75
|
+
"end": 280
|
|
76
|
+
},
|
|
77
|
+
"confidence": 0.95,
|
|
78
|
+
"source": "deterministic",
|
|
79
|
+
"mapped": true
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"canonical_title": "Payment",
|
|
83
|
+
"detected_title": "## Payment",
|
|
84
|
+
"tier": "h2",
|
|
85
|
+
"span": {
|
|
86
|
+
"start": 280,
|
|
87
|
+
"end": 371
|
|
88
|
+
},
|
|
89
|
+
"confidence": 0.95,
|
|
90
|
+
"source": "deterministic",
|
|
91
|
+
"mapped": true
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
"canonical_title": "Termination",
|
|
95
|
+
"detected_title": "## Term & Termination",
|
|
96
|
+
"tier": "h2",
|
|
97
|
+
"span": {
|
|
98
|
+
"start": 371,
|
|
99
|
+
"end": 622
|
|
100
|
+
},
|
|
101
|
+
"confidence": 0.95,
|
|
102
|
+
"source": "deterministic",
|
|
103
|
+
"mapped": true
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"canonical_title": "Governing Law",
|
|
107
|
+
"detected_title": "## Governing Law",
|
|
108
|
+
"tier": "h2",
|
|
109
|
+
"span": {
|
|
110
|
+
"start": 622,
|
|
111
|
+
"end": 704
|
|
112
|
+
},
|
|
113
|
+
"confidence": 0.95,
|
|
114
|
+
"source": "deterministic",
|
|
115
|
+
"mapped": true
|
|
116
|
+
}
|
|
117
|
+
],
|
|
118
|
+
"defined_terms": [
|
|
119
|
+
{
|
|
120
|
+
"term": "Provider",
|
|
121
|
+
"confidence": 0.6,
|
|
122
|
+
"source": "deterministic"
|
|
123
|
+
},
|
|
124
|
+
{
|
|
125
|
+
"term": "Customer",
|
|
126
|
+
"confidence": 0.6,
|
|
127
|
+
"source": "deterministic"
|
|
128
|
+
}
|
|
129
|
+
],
|
|
130
|
+
"value": {
|
|
131
|
+
"value": null,
|
|
132
|
+
"confidence": 0.0,
|
|
133
|
+
"source": "none"
|
|
134
|
+
},
|
|
135
|
+
"_meta": {
|
|
136
|
+
"extractor_version": "0.1.5",
|
|
137
|
+
"tiers_used": [
|
|
138
|
+
"deterministic"
|
|
139
|
+
],
|
|
140
|
+
"llm_used": false
|
|
141
|
+
}
|
|
142
|
+
}
|
|
@@ -67,6 +67,41 @@ def test_enrich_fills_only_missing_governing_law(monkeypatch: pytest.MonkeyPatch
|
|
|
67
67
|
assert result["governing_law"] == {"value": "France", "confidence": 0.6, "source": "llm"}
|
|
68
68
|
|
|
69
69
|
|
|
70
|
+
def test_llm_clause_fallback_when_deterministic_empty(monkeypatch: pytest.MonkeyPatch) -> None:
    """With no deterministic clauses, --llm fills the clause map from the
    model's titles, tagged tier/source 'llm', and the result still validates
    against the output schema."""
    from tests._schema_validator import validate

    def fake_config():
        return {"provider": "anthropic", "api_key": "x"}

    def fake_request(cfg, prompt, timeout=30.0):
        titles = ["Confidentiality", "Governing Law", "Special Widget Terms"]
        return json.dumps({"clauses": [{"title": t} for t in titles]})

    monkeypatch.setattr(ex, "load_llm_config", fake_config)
    monkeypatch.setattr(ex, "_llm_request", fake_request)

    # A document with no detectable clause headings -> 0 deterministic clauses.
    text = ("This Agreement is made between Acme Co and Beta Co. The parties agree "
            "to maintain confidentiality. Governed by the laws of Delaware.")
    result = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
    assert result["clauses"] == []

    ex.llm_enrich(result, text, _ns())
    clauses = result["clauses"]
    canonical_titles = [clause["canonical_title"] for clause in clauses]
    assert canonical_titles == ["Confidentiality", "Governing Law", "Special Widget Terms"]
    for clause in clauses:
        assert clause["tier"] == "llm"
        assert clause["source"] == "llm"
    assert clauses[0]["mapped"] is True
    assert clauses[2]["mapped"] is False
    meta = result["_meta"]
    assert meta["llm_used"] is True
    assert "llm" in meta["tiers_used"]
    # llm clauses are schema-conformant
    assert validate(result, ex.output_schema()) == []
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def test_llm_does_not_replace_deterministic_clauses(monkeypatch: pytest.MonkeyPatch) -> None:
    """When the deterministic cascade already found clauses, a clause list
    returned by the model must not replace or extend them."""
    def fake_config():
        return {"provider": "anthropic", "api_key": "x"}

    def fake_request(cfg, prompt, timeout=30.0):
        return json.dumps({"clauses": [{"title": "Should Not Appear"}]})

    monkeypatch.setattr(ex, "load_llm_config", fake_config)
    monkeypatch.setattr(ex, "_llm_request", fake_request)

    document = ex.DEMO_DOCUMENT  # has H2 clauses
    result = ex.build_extraction(document, document.encode("utf-8"), "markdown", "d.md")
    assert result["clauses"]
    assert all(clause["tier"] == "h2" for clause in result["clauses"])

    ex.llm_enrich(result, document, _ns())
    # Deterministic clauses are kept; the LLM clause was never requested/used.
    tiers_after = {clause["tier"] for clause in result["clauses"]}
    assert tiers_after <= {"h2"}
    titles_after = [clause["detected_title"] for clause in result["clauses"]]
    assert "Should Not Appear" not in titles_after
|
|
103
|
+
|
|
104
|
+
|
|
70
105
|
def test_request_error_degrades(monkeypatch: pytest.MonkeyPatch,
|
|
71
106
|
capsys: pytest.CaptureFixture[str]) -> None:
|
|
72
107
|
monkeypatch.setattr(ex, "load_llm_config",
|
|
@@ -142,6 +142,37 @@ def test_pdf_unescape() -> None:
|
|
|
142
142
|
assert ex._pdf_unescape(r"\101\102") == "AB" # octal escapes
|
|
143
143
|
|
|
144
144
|
|
|
145
|
+
def test_docx_heading_style_helpers() -> None:
    """Unit checks for the heading-style predicate and the title splitter."""
    is_heading = ex._is_heading_style
    heading_title = ex._docx_heading_title

    for accepted in ("Heading1", "Heading 2".replace(" ", ""), "Title", "h3"):
        assert is_heading(accepted)
    for rejected in ("Plain", None):
        assert not is_heading(rejected)

    # Run-in heading: title is the lead before the sentence body.
    assert heading_title("Payment. Customer will pay the fees.") == "Payment"
    assert heading_title("Governing Law") == "Governing Law"

    # A full sentence carrying a heading style is rejected (not a clause title).
    sentence = ("Either party may terminate this Agreement upon material breach that "
                "remains uncured for thirty days.")
    assert heading_title(sentence) is None
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def test_docx_heading_styles_drive_clause_map() -> None:
    """Clauses of the Word-styled fixture come from its Heading1 paragraphs
    (whose numbers are auto-generated) via the H2 tier, and the sentence that
    only carries a heading style does not become a clause."""
    fixture_name = "heading_docx.docx"
    raw, text, fmt, _w = ex.load_source(FIXTURES / fixture_name, prefer_optional=False)
    result = ex.build_extraction(text, raw, fmt, "heading_docx.docx")
    clauses = result["clauses"]
    assert clauses, "heading-styled docx should yield clauses"

    canonical = {clause["canonical_title"] for clause in clauses}
    assert {"Confidentiality", "Payment", "Governing Law"} <= canonical
    for clause in clauses:
        assert clause["tier"] == "h2"
        # The full-sentence "Either party may terminate ..." must not appear.
        assert "terminate this Agreement" not in clause["detected_title"]

    party_names = [party["name"] for party in result["parties"]]
    assert party_names == ["Initech Software, Inc.", "Globex Corporation"]
|
|
174
|
+
|
|
175
|
+
|
|
145
176
|
def test_html_extraction() -> None:
|
|
146
177
|
raw, text, fmt, _w = ex.load_source(FIXTURES / "services_html.html")
|
|
147
178
|
assert fmt == "html"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|