extract-cli 0.1.9__tar.gz → 0.1.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.9 → extract_cli-0.1.10}/CHANGELOG.md +26 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/Makefile +4 -1
- {extract_cli-0.1.9 → extract_cli-0.1.10}/PKG-INFO +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.10}/extract_cli.py +50 -35
- {extract_cli-0.1.9 → extract_cli-0.1.10}/pyproject.toml +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/employment_docx.docx.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/heading_docx.docx.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/nda_h2.md.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/services_bold.txt.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/services_html.html.expected.json +1 -1
- extract_cli-0.1.10/tests/test_coverage.py +241 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_misc.py +40 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/.gitignore +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/AGENTS.md +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/ARCHITECTURE.md +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/LICENSE +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/README.md +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/config/llm.json.example +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/docs/INTEROP.md +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/docs/spec/extract-output.schema.json +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/llms.txt +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/scripts/release.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/_fixtures_build.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/_make_goldens.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/conftest.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/heading_docx.docx +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/numbered_docx.docx +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_clause_map.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_cli.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_deterministic.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_llm.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_property.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_schema_conformance.py +0 -0
|
@@ -6,6 +6,31 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.10] - 2026-05-22
|
|
10
|
+
|
|
11
|
+
### Fixed
|
|
12
|
+
- **The `[docx]` (python-docx) reader now honors Word heading styles**, matching
|
|
13
|
+
the stdlib reader. Previously the python-docx path concatenated paragraph text
|
|
14
|
+
and dropped `Heading1-9`/`Title` styles and `w:numPr` numbering, so installing
|
|
15
|
+
the `[docx]` extra produced an **empty clause map** on heading-styled Word
|
|
16
|
+
contracts (worse than the no-extra stdlib reader). Both readers now share one
|
|
17
|
+
emitter (`_emit_docx_paragraph`) that turns heading-styled / auto-numbered
|
|
18
|
+
paragraphs into `## headings`, so the two paths agree. New tests:
|
|
19
|
+
`test_emit_docx_paragraph` and `test_docx_readers_agree_on_clause_map` (the
|
|
20
|
+
latter asserts the python-docx and stdlib readers produce the same clause map).
|
|
21
|
+
No output-schema change.
|
|
22
|
+
|
|
23
|
+
### Tests / quality
|
|
24
|
+
- **Line coverage raised to 100%** (was 92%/94%). Added a targeted test battery
|
|
25
|
+
for the remaining reachable branches (color/`FORCE_COLOR`, `_warn` silent,
|
|
26
|
+
date/jurisdiction/title/clause edge returns, LLM request/parse/clause-map
|
|
27
|
+
branches, PDF `TJ`-array + stream/budget edges, HTML malformed fallback, DOCX
|
|
28
|
+
empty paragraph, `_is_low_signal` branches, CLI silent/help paths). Genuinely
|
|
29
|
+
unreachable defensive lines and `[docx]`/`[pdf]`-extra fidelity branches are
|
|
30
|
+
marked `# pragma: no cover`. `make coverage` now installs the extras and
|
|
31
|
+
enforces `--fail-under=100`; a CI `coverage` job gates it. No code-behavior or
|
|
32
|
+
schema change.
|
|
33
|
+
|
|
9
34
|
## [0.1.9] - 2026-05-22
|
|
10
35
|
|
|
11
36
|
### Security / robustness
|
|
@@ -271,6 +296,7 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
271
296
|
intentionally *not* governed by the output schema (the schema describes the
|
|
272
297
|
full default output).
|
|
273
298
|
|
|
299
|
+
[0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
|
|
274
300
|
[0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
|
|
275
301
|
[0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
|
|
276
302
|
[0.1.7]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.7
|
|
@@ -31,8 +31,11 @@ test-quick:
|
|
|
31
31
|
$(PYTHON) -m pytest -x -q -k "not property"
|
|
32
32
|
|
|
33
33
|
coverage:
|
|
34
|
+
# Install the [docx]/[pdf] extras so the fidelity-reader paths execute too;
|
|
35
|
+
# without them two extras-only branches stay uncovered (98% vs 100%).
|
|
36
|
+
$(PIP) install -q -e ".[dev,docx,pdf]"
|
|
34
37
|
$(PYTHON) -m coverage run --source=extract_cli -m pytest -q
|
|
35
|
-
$(PYTHON) -m coverage report -m
|
|
38
|
+
$(PYTHON) -m coverage report -m --fail-under=100
|
|
36
39
|
|
|
37
40
|
typecheck:
|
|
38
41
|
$(PYTHON) -m mypy --strict extract_cli.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.9
|
|
3
|
+
Version: 0.1.10
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -43,11 +43,11 @@ import urllib.request
|
|
|
43
43
|
from pathlib import Path
|
|
44
44
|
from typing import Any, Dict, List, Optional, Tuple
|
|
45
45
|
|
|
46
|
-
__version__ = "0.1.9"
|
|
46
|
+
__version__ = "0.1.10"
|
|
47
47
|
|
|
48
48
|
# Bumped independently of the package version when the *extraction logic*
|
|
49
49
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
50
|
-
EXTRACTOR_VERSION = "0.1.9"
|
|
50
|
+
EXTRACTOR_VERSION = "0.1.10"
|
|
51
51
|
|
|
52
52
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
53
53
|
SCHEMA_VERSION = 1
|
|
@@ -759,7 +759,7 @@ def extract_governing_law(text: str) -> JSON:
|
|
|
759
759
|
if not m:
|
|
760
760
|
return _none_field()
|
|
761
761
|
juris = re.sub(r"\s+", " ", m.group(1).strip().rstrip(".,")).strip()
|
|
762
|
-
if not juris:
|
|
762
|
+
if not juris: # pragma: no cover - the capture group requires a leading letter
|
|
763
763
|
return _none_field()
|
|
764
764
|
return _field(juris, 0.85)
|
|
765
765
|
|
|
@@ -880,7 +880,7 @@ def extract_defined_terms(text: str) -> List[JSON]:
|
|
|
880
880
|
# Reject sentence-like or lowercase-y captures.
|
|
881
881
|
if len(term) < 2 or len(term.split()) > 6:
|
|
882
882
|
continue
|
|
883
|
-
if not term[0].isupper():
|
|
883
|
+
if not term[0].isupper(): # pragma: no cover - the regexes require an uppercase lead
|
|
884
884
|
continue
|
|
885
885
|
seen.setdefault(term, None)
|
|
886
886
|
if len(seen) >= 50:
|
|
@@ -1075,13 +1075,20 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
|
|
|
1075
1075
|
mod = importlib.import_module("docx")
|
|
1076
1076
|
document_cls = getattr(mod, "Document")
|
|
1077
1077
|
doc = document_cls(str(path))
|
|
1078
|
+
w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
|
|
1078
1079
|
lines: List[str] = []
|
|
1079
1080
|
for para in doc.paragraphs:
|
|
1080
1081
|
line = (para.text or "").strip()
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1082
|
+
# Read the style + numbering off the underlying element so the
|
|
1083
|
+
# cascade sees clause headings (the same logic the stdlib reader
|
|
1084
|
+
# applies); python-docx alone exposes neither as a heading.
|
|
1085
|
+
ppr = para._p.find(w + "pPr")
|
|
1086
|
+
style = _docx_paragraph_style(ppr, w)
|
|
1087
|
+
numbered = bool(ppr is not None and ppr.find(w + "numPr") is not None)
|
|
1088
|
+
all_bold = bool(para.runs) and all(
|
|
1089
|
+
getattr(r, "bold", False) for r in para.runs if (r.text or "").strip())
|
|
1090
|
+
_emit_docx_paragraph(lines, line, style, numbered, all_bold)
|
|
1091
|
+
for table in getattr(doc, "tables", []): # pragma: no cover - [docx] fidelity
|
|
1085
1092
|
for row in table.rows:
|
|
1086
1093
|
for cell in row.cells:
|
|
1087
1094
|
ct = (cell.text or "").strip()
|
|
@@ -1130,6 +1137,30 @@ def _docx_heading_title(text: str) -> Optional[str]:
|
|
|
1130
1137
|
return title
|
|
1131
1138
|
|
|
1132
1139
|
|
|
1140
|
+
def _emit_docx_paragraph(out: List[str], line: str, style: Optional[str],
|
|
1141
|
+
numbered: bool, all_bold: bool) -> None:
|
|
1142
|
+
"""Append one .docx paragraph to `out` the way the clause cascade expects.
|
|
1143
|
+
|
|
1144
|
+
Heading-styled (Heading1-9/Title) or auto-numbered (`w:numPr`) paragraphs --
|
|
1145
|
+
whose visible number is auto-generated and absent from the text -- become a
|
|
1146
|
+
`## <title>` heading (with any run-in body split onto the next line) when the
|
|
1147
|
+
lead looks like a heading; a fully-bold paragraph becomes `**...**`; anything
|
|
1148
|
+
else stays plain. Shared by BOTH the python-docx and stdlib readers so the
|
|
1149
|
+
two paths agree on structure (the python-docx path used to flatten headings,
|
|
1150
|
+
losing the clause map on heading-styled Word docs)."""
|
|
1151
|
+
if not line:
|
|
1152
|
+
out.append("")
|
|
1153
|
+
return
|
|
1154
|
+
if _is_heading_style(style) or numbered:
|
|
1155
|
+
title = _docx_heading_title(line)
|
|
1156
|
+
if title is not None:
|
|
1157
|
+
out.append(f"## {title}")
|
|
1158
|
+
if len(title) < len(line):
|
|
1159
|
+
out.append(line[len(title):].lstrip(" .:\t"))
|
|
1160
|
+
return
|
|
1161
|
+
out.append(f"**{line}**" if all_bold else line)
|
|
1162
|
+
|
|
1163
|
+
|
|
1133
1164
|
def _read_docx_stdlib(raw: bytes) -> str:
|
|
1134
1165
|
import io
|
|
1135
1166
|
import zipfile
|
|
@@ -1153,39 +1184,23 @@ def _read_docx_stdlib(raw: bytes) -> str:
|
|
|
1153
1184
|
style = _docx_paragraph_style(ppr, w)
|
|
1154
1185
|
numbered = ppr is not None and ppr.find(w + "numPr") is not None
|
|
1155
1186
|
run_texts: List[str] = []
|
|
1156
|
-
any_text = False
|
|
1157
1187
|
all_bold = True
|
|
1158
1188
|
for r in p.iter(w + "r"):
|
|
1159
1189
|
rpr = r.find(w + "rPr")
|
|
1160
1190
|
bold = rpr is not None and rpr.find(w + "b") is not None
|
|
1161
1191
|
txt = "".join(t.text or "" for t in r.iter(w + "t"))
|
|
1162
1192
|
if txt:
|
|
1163
|
-
any_text = True
|
|
1164
1193
|
if not bold:
|
|
1165
1194
|
all_bold = False
|
|
1166
1195
|
run_texts.append(txt)
|
|
1167
1196
|
line = "".join(run_texts).strip()
|
|
1168
|
-
if not line:
|
|
1169
|
-
paras.append("")
|
|
1170
|
-
continue
|
|
1171
1197
|
# Clause structure in real Word contracts lives in heading STYLES
|
|
1172
1198
|
# (Heading1-9/Title) or auto-NUMBERED paragraphs (w:numPr) -- in both the
|
|
1173
|
-
# visible number is auto-generated and absent from the text.
|
|
1174
|
-
#
|
|
1175
|
-
#
|
|
1176
|
-
#
|
|
1177
|
-
|
|
1178
|
-
if _is_heading_style(style) or numbered:
|
|
1179
|
-
title = _docx_heading_title(line)
|
|
1180
|
-
if title is not None:
|
|
1181
|
-
paras.append(f"## {title}")
|
|
1182
|
-
if len(title) < len(line):
|
|
1183
|
-
paras.append(line[len(title):].lstrip(" .:\t"))
|
|
1184
|
-
continue
|
|
1185
|
-
# Not heading-like -> treat as ordinary body text.
|
|
1186
|
-
if any_text and all_bold:
|
|
1187
|
-
line = f"**{line}**"
|
|
1188
|
-
paras.append(line)
|
|
1199
|
+
# visible number is auto-generated and absent from the text. The shared
|
|
1200
|
+
# emitter turns those into `## headings` (run-in body split off), bolds
|
|
1201
|
+
# fully-bold lines, and keeps the rest plain. _docx_heading_title rejects
|
|
1202
|
+
# full-sentence body items, so this stays conservative.
|
|
1203
|
+
_emit_docx_paragraph(paras, line, style, numbered, all_bold)
|
|
1189
1204
|
return "\n\n".join(paras)
|
|
1190
1205
|
|
|
1191
1206
|
|
|
@@ -1209,7 +1224,7 @@ def _read_pdf(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str
|
|
|
1209
1224
|
warnings.append(f"pypdf read failed ({e}); falling back to stdlib reader")
|
|
1210
1225
|
try:
|
|
1211
1226
|
text = _read_pdf_stdlib(raw)
|
|
1212
|
-
except Exception as e:
|
|
1227
|
+
except Exception as e: # pragma: no cover - defensive; stdlib reader is bomb-guarded
|
|
1213
1228
|
warnings.append(f"could not parse .pdf ({e}); treating as empty")
|
|
1214
1229
|
return "", warnings
|
|
1215
1230
|
return text, warnings
|
|
@@ -1342,7 +1357,7 @@ def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, s
|
|
|
1342
1357
|
raise ExtractError(f"path is a directory, not a file: {path}")
|
|
1343
1358
|
try:
|
|
1344
1359
|
size = path.stat().st_size
|
|
1345
|
-
except OSError:
|
|
1360
|
+
except OSError: # pragma: no cover - defensive; path.exists() already passed
|
|
1346
1361
|
size = 0
|
|
1347
1362
|
if size > MAX_INPUT_BYTES:
|
|
1348
1363
|
raise ExtractError(
|
|
@@ -2315,7 +2330,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|
|
2315
2330
|
if hasattr(_stream, "reconfigure"):
|
|
2316
2331
|
try:
|
|
2317
2332
|
_stream.reconfigure(encoding="utf-8", errors="replace")
|
|
2318
|
-
except Exception:
|
|
2333
|
+
except Exception: # pragma: no cover - defensive
|
|
2319
2334
|
pass
|
|
2320
2335
|
|
|
2321
2336
|
argv = sys.argv[1:] if argv is None else argv
|
|
@@ -2358,7 +2373,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|
|
2358
2373
|
if first in known:
|
|
2359
2374
|
parser = build_parser()
|
|
2360
2375
|
args = parser.parse_args(argv)
|
|
2361
|
-
if not getattr(args, "func", None):
|
|
2376
|
+
if not getattr(args, "func", None): # pragma: no cover - argparse always sets func
|
|
2362
2377
|
parser.print_help()
|
|
2363
2378
|
return 0
|
|
2364
2379
|
else:
|
|
@@ -2370,7 +2385,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|
|
2370
2385
|
except BrokenPipeError: # e.g. `extract foo.md | head`
|
|
2371
2386
|
try:
|
|
2372
2387
|
sys.stdout.close()
|
|
2373
|
-
except Exception:
|
|
2388
|
+
except Exception: # pragma: no cover - defensive
|
|
2374
2389
|
pass
|
|
2375
2390
|
return 0
|
|
2376
2391
|
except KeyboardInterrupt: # pragma: no cover
|
|
@@ -2378,5 +2393,5 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|
|
2378
2393
|
return 130
|
|
2379
2394
|
|
|
2380
2395
|
|
|
2381
|
-
if __name__ == "__main__":
|
|
2396
|
+
if __name__ == "__main__": # pragma: no cover
|
|
2382
2397
|
sys.exit(main())
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.9"
|
|
7
|
+
version = "0.1.10"
|
|
8
8
|
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
"""Targeted tests that exercise the remaining reachable branches, to keep line
|
|
2
|
+
coverage at its practical maximum. (Genuinely-unreachable defensive lines and
|
|
3
|
+
[docx]/[pdf]-extra fidelity branches are marked `# pragma: no cover` in the
|
|
4
|
+
source.)"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import argparse
|
|
8
|
+
import io
|
|
9
|
+
import json
|
|
10
|
+
import sys as _sys
|
|
11
|
+
import zipfile
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
import extract_cli as ex
|
|
17
|
+
from tests.conftest import FIXTURES
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _ns(**kw: object) -> argparse.Namespace:
|
|
21
|
+
base = {"silent": False, "why": False}
|
|
22
|
+
base.update(kw)
|
|
23
|
+
return argparse.Namespace(**base)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# --- color + warn -----------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
def test_color_force_on_and_isatty_exception(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
29
|
+
monkeypatch.delenv("NO_COLOR", raising=False)
|
|
30
|
+
monkeypatch.setenv("FORCE_COLOR", "1")
|
|
31
|
+
assert ex._color_enabled() is True
|
|
32
|
+
assert ex._c("x", "32") == "\033[32mx\033[0m"
|
|
33
|
+
monkeypatch.delenv("FORCE_COLOR", raising=False)
|
|
34
|
+
|
|
35
|
+
class _Bad:
|
|
36
|
+
def isatty(self) -> bool:
|
|
37
|
+
raise ValueError("boom")
|
|
38
|
+
assert ex._color_enabled(_Bad()) is False
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_warn_silent_is_suppressed(capsys: pytest.CaptureFixture[str]) -> None:
|
|
42
|
+
ex._warn(_ns(silent=True), "hush")
|
|
43
|
+
assert capsys.readouterr().err == ""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# --- small helpers ----------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
def test_titlecase_edges() -> None:
|
|
49
|
+
assert ex._titlecase(" ") == ""
|
|
50
|
+
assert ex._titlecase("IP Rights") == "IP Rights" # acronym preserved in mixed case
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_word_to_int_digit_and_unknown() -> None:
|
|
54
|
+
assert ex._word_to_int("30") == 30
|
|
55
|
+
assert ex._word_to_int("zzz") is None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_date_parse_none_and_unparseable_raw() -> None:
|
|
59
|
+
assert ex._parse_date_to_iso("not a date") is None
|
|
60
|
+
f = ex._date_field_from_str("13/13/2024", 0.85) # matches shape, invalid month
|
|
61
|
+
assert f["source"] == "deterministic" and f["confidence"] < 0.85
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_canonicalize_empty_key() -> None:
|
|
65
|
+
assert ex._canonicalize_clause(" ") == (None, False)
|
|
66
|
+
assert ex._canonicalize_clause("1.") == (None, False)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def test_governing_law_and_title_none() -> None:
|
|
70
|
+
assert ex.extract_governing_law("no law clause here")["source"] == "none"
|
|
71
|
+
assert ex.extract_title("", None, "text") is None
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_defined_terms_long_and_capped() -> None:
|
|
75
|
+
long_phrase = '"This Is A Very Long Quoted Heading Phrase Indeed"' # > 6 words
|
|
76
|
+
many = " ".join(f'"Term {i}"' for i in range(60))
|
|
77
|
+
terms = [t["term"] for t in ex.extract_defined_terms(long_phrase + " " + many)]
|
|
78
|
+
assert not any("Very Long" in t for t in terms)
|
|
79
|
+
assert len(terms) <= 50
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def test_noise_placeholder_midstring() -> None:
|
|
83
|
+
# Placeholder not at the start -> the mid-string regex branch.
|
|
84
|
+
assert ex._is_noise_clause_title("Fee [ # ]% Cap")
|
|
85
|
+
assert ex._is_noise_clause_title("{placeholder}")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# --- format / readers -------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
def test_detect_format_by_magic_bytes(tmp_path: Any) -> None:
|
|
91
|
+
p = tmp_path / "x.dat"
|
|
92
|
+
p.write_bytes(b"%PDF-1.4\nrest")
|
|
93
|
+
assert ex._detect_format(p, p.read_bytes()) == "pdf"
|
|
94
|
+
q = tmp_path / "y.dat"
|
|
95
|
+
q.write_bytes(b"PK\x03\x04rest")
|
|
96
|
+
assert ex._detect_format(q, q.read_bytes()) == "docx"
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def test_pdf_stream_without_endstream() -> None:
|
|
100
|
+
assert ex._read_pdf_stdlib(b"%PDF\nstream\n(text) Tj") == ""
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def test_pdf_decompression_budget_break(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
104
|
+
import zlib
|
|
105
|
+
monkeypatch.setattr(ex, "MAX_DECOMPRESSED_BYTES", 10)
|
|
106
|
+
blob = b"%PDF\nstream\n" + zlib.compress(b"(Hello World) Tj " * 10) + b"\nendstream"
|
|
107
|
+
assert ex._read_pdf_stdlib(blob) == "" # exceeds the tiny budget -> bail, no text
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def test_html_malformed_falls_back(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
111
|
+
def boom(self: object, data: object) -> None:
|
|
112
|
+
raise ValueError("bad markup")
|
|
113
|
+
monkeypatch.setattr(ex._HTMLTextExtractor, "feed", boom)
|
|
114
|
+
out = ex._read_html("<p>hello <b>world</b></p>")
|
|
115
|
+
assert "hello" in out and "<" not in out # crude tag-strip fallback
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def test_docx_empty_paragraph_stdlib() -> None:
|
|
119
|
+
w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
120
|
+
body = '<w:p/><w:p><w:r><w:t>Hello</w:t></w:r></w:p>'
|
|
121
|
+
doc = f'<?xml version="1.0"?><w:document xmlns:w="{w}"><w:body>{body}</w:body></w:document>'
|
|
122
|
+
buf = io.BytesIO()
|
|
123
|
+
with zipfile.ZipFile(buf, "w") as z:
|
|
124
|
+
z.writestr("[Content_Types].xml", "<Types/>")
|
|
125
|
+
z.writestr("word/document.xml", doc)
|
|
126
|
+
assert "Hello" in ex._read_docx_stdlib(buf.getvalue())
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# --- clause detection edges -------------------------------------------------
|
|
130
|
+
|
|
131
|
+
def test_clause_heading_on_last_line() -> None:
|
|
132
|
+
clauses = ex.detect_clauses("## First\n\nbody text\n\n## Last") # no trailing newline
|
|
133
|
+
assert clauses[-1]["title"] == "Last"
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def test_two_line_article_skips_non_heading_next_line() -> None:
|
|
137
|
+
text = ("ARTICLE I\n\nThis whole next line is a long running sentence, not a heading at all.\n\n"
|
|
138
|
+
"ARTICLE II\n\nCONFIDENTIALITY\n\nbody\n\nARTICLE III\n\nGOVERNING LAW\n\nbody")
|
|
139
|
+
titles = [c["title"] for c in ex.detect_clauses(text)]
|
|
140
|
+
assert "CONFIDENTIALITY" in titles and "GOVERNING LAW" in titles
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def test_is_low_signal_each_branch() -> None:
|
|
144
|
+
def base() -> dict:
|
|
145
|
+
return {"parties": [], "clauses": [],
|
|
146
|
+
"dates": {"effective": ex._none_field(), "expiration": ex._none_field()},
|
|
147
|
+
"governing_law": ex._none_field(), "defined_terms": []}
|
|
148
|
+
r = base(); r["clauses"] = [{}]; assert ex._is_low_signal(r) is False
|
|
149
|
+
r = base(); r["dates"]["effective"] = ex._field("2024-01-01", 0.85); assert ex._is_low_signal(r) is False
|
|
150
|
+
r = base(); r["governing_law"] = ex._field("X", 0.8); assert ex._is_low_signal(r) is False
|
|
151
|
+
r = base(); r["defined_terms"] = [{"term": "X"}]; assert ex._is_low_signal(r) is False
|
|
152
|
+
assert ex._is_low_signal(base()) is True
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# --- LLM internals (mocked transport) ---------------------------------------
|
|
156
|
+
|
|
157
|
+
class _Resp:
|
|
158
|
+
def __init__(self, body: bytes) -> None:
|
|
159
|
+
self._b = body
|
|
160
|
+
|
|
161
|
+
def read(self) -> bytes:
|
|
162
|
+
return self._b
|
|
163
|
+
|
|
164
|
+
def __enter__(self) -> "_Resp":
|
|
165
|
+
return self
|
|
166
|
+
|
|
167
|
+
def __exit__(self, *a: object) -> bool:
|
|
168
|
+
return False
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def test_llm_request_openai_no_choices(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
172
|
+
monkeypatch.setattr(ex.urllib.request, "urlopen",
|
|
173
|
+
lambda req, timeout=30.0: _Resp(json.dumps({"choices": []}).encode()))
|
|
174
|
+
assert ex._llm_request({"provider": "openai", "api_key": "k"}, "p") is None
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def test_extract_json_object_invalid() -> None:
|
|
178
|
+
assert ex._extract_json_object("prefix {not valid json} suffix") is None
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def test_llm_clause_map_skips() -> None:
|
|
182
|
+
cm = ex._llm_clause_map(
|
|
183
|
+
[{"title": ""}, 123, {"title": "Recitals"}, {"title": "Confidentiality"},
|
|
184
|
+
{"title": "Confidentiality"}], "Confidentiality body")
|
|
185
|
+
assert [c["canonical_title"] for c in cm] == ["Confidentiality"]
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def test_load_llm_config_malformed(monkeypatch: pytest.MonkeyPatch, tmp_path: Any) -> None:
|
|
189
|
+
bad = tmp_path / "llm.json"
|
|
190
|
+
bad.write_text("{not json")
|
|
191
|
+
monkeypatch.setattr(ex, "LLM_CONFIG_PATHS", (bad,))
|
|
192
|
+
assert ex.load_llm_config() is None
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def test_llm_enrich_empty_and_unparseable(monkeypatch: pytest.MonkeyPatch,
|
|
196
|
+
capsys: pytest.CaptureFixture[str]) -> None:
|
|
197
|
+
monkeypatch.setattr(ex, "load_llm_config", lambda: {"provider": "anthropic", "api_key": "k"})
|
|
198
|
+
text = "x"
|
|
199
|
+
monkeypatch.setattr(ex, "_llm_request", lambda c, p, timeout=30.0: "")
|
|
200
|
+
ex.llm_enrich(ex.build_extraction(text, text.encode(), "text", "x.txt"), text, _ns())
|
|
201
|
+
assert "no content" in capsys.readouterr().err
|
|
202
|
+
monkeypatch.setattr(ex, "_llm_request", lambda c, p, timeout=30.0: "not json at all")
|
|
203
|
+
ex.llm_enrich(ex.build_extraction(text, text.encode(), "text", "x.txt"), text, _ns())
|
|
204
|
+
assert "could not parse" in capsys.readouterr().err
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
# --- rendering / CLI edges --------------------------------------------------
|
|
208
|
+
|
|
209
|
+
def test_render_table_unmapped_legend() -> None:
|
|
210
|
+
r = ex.build_extraction("## Zorblax Provisions\n\nbody", b"x", "markdown", "x.md")
|
|
211
|
+
assert "* = not mapped" in ex.render_table(r, no_confidence=False)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def test_cli_silent_table_suppresses_human_view(capsys: pytest.CaptureFixture[str]) -> None:
|
|
215
|
+
assert ex.main([str(FIXTURES / "nda_h2.md"), "--silent", "--format", "table"]) == 0
|
|
216
|
+
assert "Clause map" not in capsys.readouterr().out
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def test_main_no_args_prints_help(capsys: pytest.CaptureFixture[str]) -> None:
|
|
220
|
+
assert ex.main([]) == 0
|
|
221
|
+
assert "usage" in capsys.readouterr().out.lower()
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
# --- last reachable edges ---------------------------------------------------
|
|
225
|
+
|
|
226
|
+
def test_parties_skips_empty_capture() -> None:
|
|
227
|
+
# The second "party" is just a parenthetical role -> cleans to an empty
|
|
228
|
+
# name and is skipped; the first is kept.
|
|
229
|
+
parties = ex.extract_parties('between Acme Corp and ("Receiving Party")')
|
|
230
|
+
assert [p["name"] for p in parties] == ["Acme Corp"]
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def test_signatories_skips_dupes_short_and_reserved() -> None:
|
|
234
|
+
text = "By: Jane Doe\nName: Jane Doe\nName: a\nName: the\n"
|
|
235
|
+
s = ex.extract_signatories(text)
|
|
236
|
+
assert [x["name"] for x in s] == ["Jane Doe"]
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def test_pdf_text_tj_array_branch() -> None:
|
|
240
|
+
# A TJ array of strings inside a text object.
|
|
241
|
+
assert ex._pdf_text_from_content(b"BT [(Hello) (World)] TJ ET") == "HelloWorld"
|
|
@@ -152,6 +152,46 @@ def test_docx_heading_style_helpers() -> None:
|
|
|
152
152
|
# Run-in heading: title is the lead before the sentence body.
|
|
153
153
|
assert ex._docx_heading_title("Payment. Customer will pay the fees.") == "Payment"
|
|
154
154
|
assert ex._docx_heading_title("Governing Law") == "Governing Law"
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_emit_docx_paragraph() -> None:
|
|
158
|
+
"""The shared emitter both .docx readers use: heading styles / numbered
|
|
159
|
+
paragraphs become `## headings`, fully-bold lines become `**...**`."""
|
|
160
|
+
out: list[str] = []
|
|
161
|
+
ex._emit_docx_paragraph(out, "Confidentiality", "Heading2", False, False) # heading style
|
|
162
|
+
ex._emit_docx_paragraph(out, "Term", None, True, False) # auto-numbered
|
|
163
|
+
ex._emit_docx_paragraph(out, "Important Notice", None, False, True) # fully bold
|
|
164
|
+
ex._emit_docx_paragraph(out, "Just some body text.", None, False, False) # plain
|
|
165
|
+
ex._emit_docx_paragraph(out, "", None, False, False) # blank
|
|
166
|
+
ex._emit_docx_paragraph(out, "Payment. Customer will pay.", "Heading1", False, False) # run-in
|
|
167
|
+
assert out == [
|
|
168
|
+
"## Confidentiality",
|
|
169
|
+
"## Term",
|
|
170
|
+
"**Important Notice**",
|
|
171
|
+
"Just some body text.",
|
|
172
|
+
"",
|
|
173
|
+
"## Payment",
|
|
174
|
+
"Customer will pay.", # run-in body split onto its own line
|
|
175
|
+
]
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def test_docx_readers_agree_on_clause_map() -> None:
|
|
179
|
+
"""Regression: the python-docx reader must surface the same clause map as the
|
|
180
|
+
stdlib reader on a heading-styled .docx. The python-docx path used to flatten
|
|
181
|
+
heading styles and return an empty clause map. Skips without [docx]."""
|
|
182
|
+
pytest.importorskip("docx")
|
|
183
|
+
path = FIXTURES / "heading_docx.docx"
|
|
184
|
+
raw = path.read_bytes()
|
|
185
|
+
|
|
186
|
+
def clause_titles(prefer_optional: bool) -> list[str]:
|
|
187
|
+
_raw, text, fmt, _w = ex.load_source(path, prefer_optional=prefer_optional)
|
|
188
|
+
result = ex.build_extraction(text, raw, fmt, "h.docx")
|
|
189
|
+
return [c["canonical_title"] for c in result["clauses"]]
|
|
190
|
+
|
|
191
|
+
stdlib = clause_titles(False)
|
|
192
|
+
pydocx = clause_titles(True)
|
|
193
|
+
assert stdlib, "stdlib reader should detect the heading-styled clauses"
|
|
194
|
+
assert pydocx == stdlib, "python-docx path must agree with the stdlib reader"
|
|
155
195
|
# A full sentence carrying a heading style is rejected (not a clause title).
|
|
156
196
|
assert ex._docx_heading_title(
|
|
157
197
|
"Either party may terminate this Agreement upon material breach that "
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|