extract-cli 0.1.9__tar.gz → 0.1.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {extract_cli-0.1.9 → extract_cli-0.1.10}/CHANGELOG.md +26 -0
  2. {extract_cli-0.1.9 → extract_cli-0.1.10}/Makefile +4 -1
  3. {extract_cli-0.1.9 → extract_cli-0.1.10}/PKG-INFO +1 -1
  4. {extract_cli-0.1.9 → extract_cli-0.1.10}/extract_cli.py +50 -35
  5. {extract_cli-0.1.9 → extract_cli-0.1.10}/pyproject.toml +1 -1
  6. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/employment_docx.docx.expected.json +1 -1
  7. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/heading_docx.docx.expected.json +1 -1
  8. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  9. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  10. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/nda_h2.md.expected.json +1 -1
  11. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
  12. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/scanned.pdf.expected.json +1 -1
  13. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/services_bold.txt.expected.json +1 -1
  14. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/services_html.html.expected.json +1 -1
  15. extract_cli-0.1.10/tests/test_coverage.py +241 -0
  16. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_misc.py +40 -0
  17. {extract_cli-0.1.9 → extract_cli-0.1.10}/.gitignore +0 -0
  18. {extract_cli-0.1.9 → extract_cli-0.1.10}/AGENTS.md +0 -0
  19. {extract_cli-0.1.9 → extract_cli-0.1.10}/ARCHITECTURE.md +0 -0
  20. {extract_cli-0.1.9 → extract_cli-0.1.10}/CONTRIBUTING.md +0 -0
  21. {extract_cli-0.1.9 → extract_cli-0.1.10}/LICENSE +0 -0
  22. {extract_cli-0.1.9 → extract_cli-0.1.10}/README.md +0 -0
  23. {extract_cli-0.1.9 → extract_cli-0.1.10}/config/llm.json.example +0 -0
  24. {extract_cli-0.1.9 → extract_cli-0.1.10}/docs/INTEROP.md +0 -0
  25. {extract_cli-0.1.9 → extract_cli-0.1.10}/docs/spec/extract-output.schema.json +0 -0
  26. {extract_cli-0.1.9 → extract_cli-0.1.10}/llms.txt +0 -0
  27. {extract_cli-0.1.9 → extract_cli-0.1.10}/scripts/release.py +0 -0
  28. {extract_cli-0.1.9 → extract_cli-0.1.10}/scripts/validate_against_spec.py +0 -0
  29. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/_fixtures_build.py +0 -0
  30. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/_make_goldens.py +0 -0
  31. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/_schema_validator.py +0 -0
  32. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/conftest.py +0 -0
  33. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/employment_docx.docx +0 -0
  34. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/heading_docx.docx +0 -0
  35. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/lease_allcaps.txt +0 -0
  36. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/license_pdf.pdf +0 -0
  37. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/nda_h2.md +0 -0
  38. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/numbered_docx.docx +0 -0
  39. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/scanned.pdf +0 -0
  40. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/services_bold.txt +0 -0
  41. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/services_html.html +0 -0
  42. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_clause_map.py +0 -0
  43. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_cli.py +0 -0
  44. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_deterministic.py +0 -0
  45. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_llm.py +0 -0
  46. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_property.py +0 -0
  47. {extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_schema_conformance.py +0 -0
@@ -6,6 +6,31 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.10] - 2026-05-22
10
+
11
+ ### Fixed
12
+ - **The `[docx]` (python-docx) reader now honors Word heading styles**, matching
13
+ the stdlib reader. Previously the python-docx path concatenated paragraph text
14
+ and dropped `Heading1-9`/`Title` styles and `w:numPr` numbering, so installing
15
+ the `[docx]` extra produced an **empty clause map** on heading-styled Word
16
+ contracts (worse than the no-extra stdlib reader). Both readers now share one
17
+ emitter (`_emit_docx_paragraph`) that turns heading-styled / auto-numbered
18
+ paragraphs into `## headings`, so the two paths agree. New tests:
19
+ `test_emit_docx_paragraph` and `test_docx_readers_agree_on_clause_map` (the
20
+ latter asserts the python-docx and stdlib readers produce the same clause map).
21
+ No output-schema change.
22
+
23
+ ### Tests / quality
24
+ - **Line coverage raised to 100%** (was 92%/94%). Added a targeted test battery
25
+ for the remaining reachable branches (color/`FORCE_COLOR`, `_warn` silent,
26
+ date/jurisdiction/title/clause edge returns, LLM request/parse/clause-map
27
+ branches, PDF `TJ`-array + stream/budget edges, HTML malformed fallback, DOCX
28
+ empty paragraph, `_is_low_signal` branches, CLI silent/help paths). Genuinely
29
+ unreachable defensive lines and `[docx]`/`[pdf]`-extra fidelity branches are
30
+ marked `# pragma: no cover`. `make coverage` now installs the extras and
31
+ enforces `--fail-under=100`; a CI `coverage` job gates it. No code-behavior or
32
+ schema change.
33
+
9
34
  ## [0.1.9] - 2026-05-22
10
35
 
11
36
  ### Security / robustness
@@ -271,6 +296,7 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
271
296
  intentionally *not* governed by the output schema (the schema describes the
272
297
  full default output).
273
298
 
299
+ [0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
274
300
  [0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
275
301
  [0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
276
302
  [0.1.7]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.7
@@ -31,8 +31,11 @@ test-quick:
31
31
  $(PYTHON) -m pytest -x -q -k "not property"
32
32
 
33
33
  coverage:
34
+ # Install the [docx]/[pdf] extras so the fidelity-reader paths execute too;
35
+ # without them two extras-only branches stay uncovered (98% vs 100%).
36
+ $(PIP) install -q -e ".[dev,docx,pdf]"
34
37
  $(PYTHON) -m coverage run --source=extract_cli -m pytest -q
35
- $(PYTHON) -m coverage report -m
38
+ $(PYTHON) -m coverage report -m --fail-under=100
36
39
 
37
40
  typecheck:
38
41
  $(PYTHON) -m mypy --strict extract_cli.py
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.9
3
+ Version: 0.1.10
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -43,11 +43,11 @@ import urllib.request
43
43
  from pathlib import Path
44
44
  from typing import Any, Dict, List, Optional, Tuple
45
45
 
46
- __version__ = "0.1.9"
46
+ __version__ = "0.1.10"
47
47
 
48
48
  # Bumped independently of the package version when the *extraction logic*
49
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
50
- EXTRACTOR_VERSION = "0.1.9"
50
+ EXTRACTOR_VERSION = "0.1.10"
51
51
 
52
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
53
53
  SCHEMA_VERSION = 1
@@ -759,7 +759,7 @@ def extract_governing_law(text: str) -> JSON:
759
759
  if not m:
760
760
  return _none_field()
761
761
  juris = re.sub(r"\s+", " ", m.group(1).strip().rstrip(".,")).strip()
762
- if not juris:
762
+ if not juris: # pragma: no cover - the capture group requires a leading letter
763
763
  return _none_field()
764
764
  return _field(juris, 0.85)
765
765
 
@@ -880,7 +880,7 @@ def extract_defined_terms(text: str) -> List[JSON]:
880
880
  # Reject sentence-like or lowercase-y captures.
881
881
  if len(term) < 2 or len(term.split()) > 6:
882
882
  continue
883
- if not term[0].isupper():
883
+ if not term[0].isupper(): # pragma: no cover - the regexes require an uppercase lead
884
884
  continue
885
885
  seen.setdefault(term, None)
886
886
  if len(seen) >= 50:
@@ -1075,13 +1075,20 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
1075
1075
  mod = importlib.import_module("docx")
1076
1076
  document_cls = getattr(mod, "Document")
1077
1077
  doc = document_cls(str(path))
1078
+ w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
1078
1079
  lines: List[str] = []
1079
1080
  for para in doc.paragraphs:
1080
1081
  line = (para.text or "").strip()
1081
- if line and para.runs and all(getattr(r, "bold", False) for r in para.runs if (r.text or "").strip()):
1082
- line = f"**{line}**"
1083
- lines.append(line)
1084
- for table in getattr(doc, "tables", []):
1082
+ # Read the style + numbering off the underlying element so the
1083
+ # cascade sees clause headings (the same logic the stdlib reader
1084
+ # applies); python-docx alone exposes neither as a heading.
1085
+ ppr = para._p.find(w + "pPr")
1086
+ style = _docx_paragraph_style(ppr, w)
1087
+ numbered = bool(ppr is not None and ppr.find(w + "numPr") is not None)
1088
+ all_bold = bool(para.runs) and all(
1089
+ getattr(r, "bold", False) for r in para.runs if (r.text or "").strip())
1090
+ _emit_docx_paragraph(lines, line, style, numbered, all_bold)
1091
+ for table in getattr(doc, "tables", []): # pragma: no cover - [docx] fidelity
1085
1092
  for row in table.rows:
1086
1093
  for cell in row.cells:
1087
1094
  ct = (cell.text or "").strip()
@@ -1130,6 +1137,30 @@ def _docx_heading_title(text: str) -> Optional[str]:
1130
1137
  return title
1131
1138
 
1132
1139
 
1140
+ def _emit_docx_paragraph(out: List[str], line: str, style: Optional[str],
1141
+ numbered: bool, all_bold: bool) -> None:
1142
+ """Append one .docx paragraph to `out` the way the clause cascade expects.
1143
+
1144
+ Heading-styled (Heading1-9/Title) or auto-numbered (`w:numPr`) paragraphs --
1145
+ whose visible number is auto-generated and absent from the text -- become a
1146
+ `## <title>` heading (with any run-in body split onto the next line) when the
1147
+ lead looks like a heading; a fully-bold paragraph becomes `**...**`; anything
1148
+ else stays plain. Shared by BOTH the python-docx and stdlib readers so the
1149
+ two paths agree on structure (the python-docx path used to flatten headings,
1150
+ losing the clause map on heading-styled Word docs)."""
1151
+ if not line:
1152
+ out.append("")
1153
+ return
1154
+ if _is_heading_style(style) or numbered:
1155
+ title = _docx_heading_title(line)
1156
+ if title is not None:
1157
+ out.append(f"## {title}")
1158
+ if len(title) < len(line):
1159
+ out.append(line[len(title):].lstrip(" .:\t"))
1160
+ return
1161
+ out.append(f"**{line}**" if all_bold else line)
1162
+
1163
+
1133
1164
  def _read_docx_stdlib(raw: bytes) -> str:
1134
1165
  import io
1135
1166
  import zipfile
@@ -1153,39 +1184,23 @@ def _read_docx_stdlib(raw: bytes) -> str:
1153
1184
  style = _docx_paragraph_style(ppr, w)
1154
1185
  numbered = ppr is not None and ppr.find(w + "numPr") is not None
1155
1186
  run_texts: List[str] = []
1156
- any_text = False
1157
1187
  all_bold = True
1158
1188
  for r in p.iter(w + "r"):
1159
1189
  rpr = r.find(w + "rPr")
1160
1190
  bold = rpr is not None and rpr.find(w + "b") is not None
1161
1191
  txt = "".join(t.text or "" for t in r.iter(w + "t"))
1162
1192
  if txt:
1163
- any_text = True
1164
1193
  if not bold:
1165
1194
  all_bold = False
1166
1195
  run_texts.append(txt)
1167
1196
  line = "".join(run_texts).strip()
1168
- if not line:
1169
- paras.append("")
1170
- continue
1171
1197
  # Clause structure in real Word contracts lives in heading STYLES
1172
1198
  # (Heading1-9/Title) or auto-NUMBERED paragraphs (w:numPr) -- in both the
1173
- # visible number is auto-generated and absent from the text. Emit such a
1174
- # paragraph as an H2 heading (strongest cascade tier) when its lead looks
1175
- # like a heading; _docx_heading_title rejects full-sentence body items
1176
- # (e.g. deep numbered sub-points), so this stays conservative. Keep any
1177
- # run-in body as a following paragraph.
1178
- if _is_heading_style(style) or numbered:
1179
- title = _docx_heading_title(line)
1180
- if title is not None:
1181
- paras.append(f"## {title}")
1182
- if len(title) < len(line):
1183
- paras.append(line[len(title):].lstrip(" .:\t"))
1184
- continue
1185
- # Not heading-like -> treat as ordinary body text.
1186
- if any_text and all_bold:
1187
- line = f"**{line}**"
1188
- paras.append(line)
1199
+ # visible number is auto-generated and absent from the text. The shared
1200
+ # emitter turns those into `## headings` (run-in body split off), bolds
1201
+ # fully-bold lines, and keeps the rest plain. _docx_heading_title rejects
1202
+ # full-sentence body items, so this stays conservative.
1203
+ _emit_docx_paragraph(paras, line, style, numbered, all_bold)
1189
1204
  return "\n\n".join(paras)
1190
1205
 
1191
1206
 
@@ -1209,7 +1224,7 @@ def _read_pdf(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str
1209
1224
  warnings.append(f"pypdf read failed ({e}); falling back to stdlib reader")
1210
1225
  try:
1211
1226
  text = _read_pdf_stdlib(raw)
1212
- except Exception as e:
1227
+ except Exception as e: # pragma: no cover - defensive; stdlib reader is bomb-guarded
1213
1228
  warnings.append(f"could not parse .pdf ({e}); treating as empty")
1214
1229
  return "", warnings
1215
1230
  return text, warnings
@@ -1342,7 +1357,7 @@ def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, s
1342
1357
  raise ExtractError(f"path is a directory, not a file: {path}")
1343
1358
  try:
1344
1359
  size = path.stat().st_size
1345
- except OSError:
1360
+ except OSError: # pragma: no cover - defensive; path.exists() already passed
1346
1361
  size = 0
1347
1362
  if size > MAX_INPUT_BYTES:
1348
1363
  raise ExtractError(
@@ -2315,7 +2330,7 @@ def main(argv: Optional[List[str]] = None) -> int:
2315
2330
  if hasattr(_stream, "reconfigure"):
2316
2331
  try:
2317
2332
  _stream.reconfigure(encoding="utf-8", errors="replace")
2318
- except Exception:
2333
+ except Exception: # pragma: no cover - defensive
2319
2334
  pass
2320
2335
 
2321
2336
  argv = sys.argv[1:] if argv is None else argv
@@ -2358,7 +2373,7 @@ def main(argv: Optional[List[str]] = None) -> int:
2358
2373
  if first in known:
2359
2374
  parser = build_parser()
2360
2375
  args = parser.parse_args(argv)
2361
- if not getattr(args, "func", None):
2376
+ if not getattr(args, "func", None): # pragma: no cover - argparse always sets func
2362
2377
  parser.print_help()
2363
2378
  return 0
2364
2379
  else:
@@ -2370,7 +2385,7 @@ def main(argv: Optional[List[str]] = None) -> int:
2370
2385
  except BrokenPipeError: # e.g. `extract foo.md | head`
2371
2386
  try:
2372
2387
  sys.stdout.close()
2373
- except Exception:
2388
+ except Exception: # pragma: no cover - defensive
2374
2389
  pass
2375
2390
  return 0
2376
2391
  except KeyboardInterrupt: # pragma: no cover
@@ -2378,5 +2393,5 @@ def main(argv: Optional[List[str]] = None) -> int:
2378
2393
  return 130
2379
2394
 
2380
2395
 
2381
- if __name__ == "__main__":
2396
+ if __name__ == "__main__": # pragma: no cover
2382
2397
  sys.exit(main())
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.9"
7
+ version = "0.1.10"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -151,7 +151,7 @@
151
151
  ],
152
152
  "signatories": [],
153
153
  "_meta": {
154
- "extractor_version": "0.1.9",
154
+ "extractor_version": "0.1.10",
155
155
  "tiers_used": [
156
156
  "deterministic"
157
157
  ],
@@ -140,7 +140,7 @@
140
140
  "amounts": [],
141
141
  "signatories": [],
142
142
  "_meta": {
143
- "extractor_version": "0.1.9",
143
+ "extractor_version": "0.1.10",
144
144
  "tiers_used": [
145
145
  "deterministic"
146
146
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.9",
149
+ "extractor_version": "0.1.10",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.9",
149
+ "extractor_version": "0.1.10",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -150,7 +150,7 @@
150
150
  "amounts": [],
151
151
  "signatories": [],
152
152
  "_meta": {
153
- "extractor_version": "0.1.9",
153
+ "extractor_version": "0.1.10",
154
154
  "tiers_used": [
155
155
  "deterministic"
156
156
  ],
@@ -140,7 +140,7 @@
140
140
  "amounts": [],
141
141
  "signatories": [],
142
142
  "_meta": {
143
- "extractor_version": "0.1.9",
143
+ "extractor_version": "0.1.10",
144
144
  "tiers_used": [
145
145
  "deterministic"
146
146
  ],
@@ -55,7 +55,7 @@
55
55
  "amounts": [],
56
56
  "signatories": [],
57
57
  "_meta": {
58
- "extractor_version": "0.1.9",
58
+ "extractor_version": "0.1.10",
59
59
  "tiers_used": [
60
60
  "deterministic"
61
61
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.9",
149
+ "extractor_version": "0.1.10",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -161,7 +161,7 @@
161
161
  ],
162
162
  "signatories": [],
163
163
  "_meta": {
164
- "extractor_version": "0.1.9",
164
+ "extractor_version": "0.1.10",
165
165
  "tiers_used": [
166
166
  "deterministic"
167
167
  ],
@@ -0,0 +1,241 @@
1
+ """Targeted tests that exercise the remaining reachable branches, to keep line
2
+ coverage at its practical maximum. (Genuinely-unreachable defensive lines and
3
+ [docx]/[pdf]-extra fidelity branches are marked `# pragma: no cover` in the
4
+ source.)"""
5
+ from __future__ import annotations
6
+
7
+ import argparse
8
+ import io
9
+ import json
10
+ import sys as _sys
11
+ import zipfile
12
+ from typing import Any
13
+
14
+ import pytest
15
+
16
+ import extract_cli as ex
17
+ from tests.conftest import FIXTURES
18
+
19
+
20
+ def _ns(**kw: object) -> argparse.Namespace:
21
+ base = {"silent": False, "why": False}
22
+ base.update(kw)
23
+ return argparse.Namespace(**base)
24
+
25
+
26
+ # --- color + warn -----------------------------------------------------------
27
+
28
+ def test_color_force_on_and_isatty_exception(monkeypatch: pytest.MonkeyPatch) -> None:
29
+ monkeypatch.delenv("NO_COLOR", raising=False)
30
+ monkeypatch.setenv("FORCE_COLOR", "1")
31
+ assert ex._color_enabled() is True
32
+ assert ex._c("x", "32") == "\033[32mx\033[0m"
33
+ monkeypatch.delenv("FORCE_COLOR", raising=False)
34
+
35
+ class _Bad:
36
+ def isatty(self) -> bool:
37
+ raise ValueError("boom")
38
+ assert ex._color_enabled(_Bad()) is False
39
+
40
+
41
+ def test_warn_silent_is_suppressed(capsys: pytest.CaptureFixture[str]) -> None:
42
+ ex._warn(_ns(silent=True), "hush")
43
+ assert capsys.readouterr().err == ""
44
+
45
+
46
+ # --- small helpers ----------------------------------------------------------
47
+
48
+ def test_titlecase_edges() -> None:
49
+ assert ex._titlecase(" ") == ""
50
+ assert ex._titlecase("IP Rights") == "IP Rights" # acronym preserved in mixed case
51
+
52
+
53
+ def test_word_to_int_digit_and_unknown() -> None:
54
+ assert ex._word_to_int("30") == 30
55
+ assert ex._word_to_int("zzz") is None
56
+
57
+
58
+ def test_date_parse_none_and_unparseable_raw() -> None:
59
+ assert ex._parse_date_to_iso("not a date") is None
60
+ f = ex._date_field_from_str("13/13/2024", 0.85) # matches shape, invalid month
61
+ assert f["source"] == "deterministic" and f["confidence"] < 0.85
62
+
63
+
64
+ def test_canonicalize_empty_key() -> None:
65
+ assert ex._canonicalize_clause(" ") == (None, False)
66
+ assert ex._canonicalize_clause("1.") == (None, False)
67
+
68
+
69
+ def test_governing_law_and_title_none() -> None:
70
+ assert ex.extract_governing_law("no law clause here")["source"] == "none"
71
+ assert ex.extract_title("", None, "text") is None
72
+
73
+
74
+ def test_defined_terms_long_and_capped() -> None:
75
+ long_phrase = '"This Is A Very Long Quoted Heading Phrase Indeed"' # > 6 words
76
+ many = " ".join(f'"Term {i}"' for i in range(60))
77
+ terms = [t["term"] for t in ex.extract_defined_terms(long_phrase + " " + many)]
78
+ assert not any("Very Long" in t for t in terms)
79
+ assert len(terms) <= 50
80
+
81
+
82
+ def test_noise_placeholder_midstring() -> None:
83
+ # Placeholder not at the start -> the mid-string regex branch.
84
+ assert ex._is_noise_clause_title("Fee [ # ]% Cap")
85
+ assert ex._is_noise_clause_title("{placeholder}")
86
+
87
+
88
+ # --- format / readers -------------------------------------------------------
89
+
90
+ def test_detect_format_by_magic_bytes(tmp_path: Any) -> None:
91
+ p = tmp_path / "x.dat"
92
+ p.write_bytes(b"%PDF-1.4\nrest")
93
+ assert ex._detect_format(p, p.read_bytes()) == "pdf"
94
+ q = tmp_path / "y.dat"
95
+ q.write_bytes(b"PK\x03\x04rest")
96
+ assert ex._detect_format(q, q.read_bytes()) == "docx"
97
+
98
+
99
+ def test_pdf_stream_without_endstream() -> None:
100
+ assert ex._read_pdf_stdlib(b"%PDF\nstream\n(text) Tj") == ""
101
+
102
+
103
+ def test_pdf_decompression_budget_break(monkeypatch: pytest.MonkeyPatch) -> None:
104
+ import zlib
105
+ monkeypatch.setattr(ex, "MAX_DECOMPRESSED_BYTES", 10)
106
+ blob = b"%PDF\nstream\n" + zlib.compress(b"(Hello World) Tj " * 10) + b"\nendstream"
107
+ assert ex._read_pdf_stdlib(blob) == "" # exceeds the tiny budget -> bail, no text
108
+
109
+
110
+ def test_html_malformed_falls_back(monkeypatch: pytest.MonkeyPatch) -> None:
111
+ def boom(self: object, data: object) -> None:
112
+ raise ValueError("bad markup")
113
+ monkeypatch.setattr(ex._HTMLTextExtractor, "feed", boom)
114
+ out = ex._read_html("<p>hello <b>world</b></p>")
115
+ assert "hello" in out and "<" not in out # crude tag-strip fallback
116
+
117
+
118
+ def test_docx_empty_paragraph_stdlib() -> None:
119
+ w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
120
+ body = '<w:p/><w:p><w:r><w:t>Hello</w:t></w:r></w:p>'
121
+ doc = f'<?xml version="1.0"?><w:document xmlns:w="{w}"><w:body>{body}</w:body></w:document>'
122
+ buf = io.BytesIO()
123
+ with zipfile.ZipFile(buf, "w") as z:
124
+ z.writestr("[Content_Types].xml", "<Types/>")
125
+ z.writestr("word/document.xml", doc)
126
+ assert "Hello" in ex._read_docx_stdlib(buf.getvalue())
127
+
128
+
129
+ # --- clause detection edges -------------------------------------------------
130
+
131
+ def test_clause_heading_on_last_line() -> None:
132
+ clauses = ex.detect_clauses("## First\n\nbody text\n\n## Last") # no trailing newline
133
+ assert clauses[-1]["title"] == "Last"
134
+
135
+
136
+ def test_two_line_article_skips_non_heading_next_line() -> None:
137
+ text = ("ARTICLE I\n\nThis whole next line is a long running sentence, not a heading at all.\n\n"
138
+ "ARTICLE II\n\nCONFIDENTIALITY\n\nbody\n\nARTICLE III\n\nGOVERNING LAW\n\nbody")
139
+ titles = [c["title"] for c in ex.detect_clauses(text)]
140
+ assert "CONFIDENTIALITY" in titles and "GOVERNING LAW" in titles
141
+
142
+
143
+ def test_is_low_signal_each_branch() -> None:
144
+ def base() -> dict:
145
+ return {"parties": [], "clauses": [],
146
+ "dates": {"effective": ex._none_field(), "expiration": ex._none_field()},
147
+ "governing_law": ex._none_field(), "defined_terms": []}
148
+ r = base(); r["clauses"] = [{}]; assert ex._is_low_signal(r) is False
149
+ r = base(); r["dates"]["effective"] = ex._field("2024-01-01", 0.85); assert ex._is_low_signal(r) is False
150
+ r = base(); r["governing_law"] = ex._field("X", 0.8); assert ex._is_low_signal(r) is False
151
+ r = base(); r["defined_terms"] = [{"term": "X"}]; assert ex._is_low_signal(r) is False
152
+ assert ex._is_low_signal(base()) is True
153
+
154
+
155
+ # --- LLM internals (mocked transport) ---------------------------------------
156
+
157
+ class _Resp:
158
+ def __init__(self, body: bytes) -> None:
159
+ self._b = body
160
+
161
+ def read(self) -> bytes:
162
+ return self._b
163
+
164
+ def __enter__(self) -> "_Resp":
165
+ return self
166
+
167
+ def __exit__(self, *a: object) -> bool:
168
+ return False
169
+
170
+
171
+ def test_llm_request_openai_no_choices(monkeypatch: pytest.MonkeyPatch) -> None:
172
+ monkeypatch.setattr(ex.urllib.request, "urlopen",
173
+ lambda req, timeout=30.0: _Resp(json.dumps({"choices": []}).encode()))
174
+ assert ex._llm_request({"provider": "openai", "api_key": "k"}, "p") is None
175
+
176
+
177
+ def test_extract_json_object_invalid() -> None:
178
+ assert ex._extract_json_object("prefix {not valid json} suffix") is None
179
+
180
+
181
+ def test_llm_clause_map_skips() -> None:
182
+ cm = ex._llm_clause_map(
183
+ [{"title": ""}, 123, {"title": "Recitals"}, {"title": "Confidentiality"},
184
+ {"title": "Confidentiality"}], "Confidentiality body")
185
+ assert [c["canonical_title"] for c in cm] == ["Confidentiality"]
186
+
187
+
188
+ def test_load_llm_config_malformed(monkeypatch: pytest.MonkeyPatch, tmp_path: Any) -> None:
189
+ bad = tmp_path / "llm.json"
190
+ bad.write_text("{not json")
191
+ monkeypatch.setattr(ex, "LLM_CONFIG_PATHS", (bad,))
192
+ assert ex.load_llm_config() is None
193
+
194
+
195
+ def test_llm_enrich_empty_and_unparseable(monkeypatch: pytest.MonkeyPatch,
196
+ capsys: pytest.CaptureFixture[str]) -> None:
197
+ monkeypatch.setattr(ex, "load_llm_config", lambda: {"provider": "anthropic", "api_key": "k"})
198
+ text = "x"
199
+ monkeypatch.setattr(ex, "_llm_request", lambda c, p, timeout=30.0: "")
200
+ ex.llm_enrich(ex.build_extraction(text, text.encode(), "text", "x.txt"), text, _ns())
201
+ assert "no content" in capsys.readouterr().err
202
+ monkeypatch.setattr(ex, "_llm_request", lambda c, p, timeout=30.0: "not json at all")
203
+ ex.llm_enrich(ex.build_extraction(text, text.encode(), "text", "x.txt"), text, _ns())
204
+ assert "could not parse" in capsys.readouterr().err
205
+
206
+
207
+ # --- rendering / CLI edges --------------------------------------------------
208
+
209
+ def test_render_table_unmapped_legend() -> None:
210
+ r = ex.build_extraction("## Zorblax Provisions\n\nbody", b"x", "markdown", "x.md")
211
+ assert "* = not mapped" in ex.render_table(r, no_confidence=False)
212
+
213
+
214
+ def test_cli_silent_table_suppresses_human_view(capsys: pytest.CaptureFixture[str]) -> None:
215
+ assert ex.main([str(FIXTURES / "nda_h2.md"), "--silent", "--format", "table"]) == 0
216
+ assert "Clause map" not in capsys.readouterr().out
217
+
218
+
219
+ def test_main_no_args_prints_help(capsys: pytest.CaptureFixture[str]) -> None:
220
+ assert ex.main([]) == 0
221
+ assert "usage" in capsys.readouterr().out.lower()
222
+
223
+
224
+ # --- last reachable edges ---------------------------------------------------
225
+
226
+ def test_parties_skips_empty_capture() -> None:
227
+ # The second "party" is just a parenthetical role -> cleans to an empty
228
+ # name and is skipped; the first is kept.
229
+ parties = ex.extract_parties('between Acme Corp and ("Receiving Party")')
230
+ assert [p["name"] for p in parties] == ["Acme Corp"]
231
+
232
+
233
+ def test_signatories_skips_dupes_short_and_reserved() -> None:
234
+ text = "By: Jane Doe\nName: Jane Doe\nName: a\nName: the\n"
235
+ s = ex.extract_signatories(text)
236
+ assert [x["name"] for x in s] == ["Jane Doe"]
237
+
238
+
239
+ def test_pdf_text_tj_array_branch() -> None:
240
+ # A TJ array of strings inside a text object.
241
+ assert ex._pdf_text_from_content(b"BT [(Hello) (World)] TJ ET") == "HelloWorld"
@@ -152,6 +152,46 @@ def test_docx_heading_style_helpers() -> None:
152
152
  # Run-in heading: title is the lead before the sentence body.
153
153
  assert ex._docx_heading_title("Payment. Customer will pay the fees.") == "Payment"
154
154
  assert ex._docx_heading_title("Governing Law") == "Governing Law"
155
+
156
+
157
+ def test_emit_docx_paragraph() -> None:
158
+ """The shared emitter both .docx readers use: heading styles / numbered
159
+ paragraphs become `## headings`, fully-bold lines become `**...**`."""
160
+ out: list[str] = []
161
+ ex._emit_docx_paragraph(out, "Confidentiality", "Heading2", False, False) # heading style
162
+ ex._emit_docx_paragraph(out, "Term", None, True, False) # auto-numbered
163
+ ex._emit_docx_paragraph(out, "Important Notice", None, False, True) # fully bold
164
+ ex._emit_docx_paragraph(out, "Just some body text.", None, False, False) # plain
165
+ ex._emit_docx_paragraph(out, "", None, False, False) # blank
166
+ ex._emit_docx_paragraph(out, "Payment. Customer will pay.", "Heading1", False, False) # run-in
167
+ assert out == [
168
+ "## Confidentiality",
169
+ "## Term",
170
+ "**Important Notice**",
171
+ "Just some body text.",
172
+ "",
173
+ "## Payment",
174
+ "Customer will pay.", # run-in body split onto its own line
175
+ ]
176
+
177
+
178
+ def test_docx_readers_agree_on_clause_map() -> None:
179
+ """Regression: the python-docx reader must surface the same clause map as the
180
+ stdlib reader on a heading-styled .docx. The python-docx path used to flatten
181
+ heading styles and return an empty clause map. Skips without [docx]."""
182
+ pytest.importorskip("docx")
183
+ path = FIXTURES / "heading_docx.docx"
184
+ raw = path.read_bytes()
185
+
186
+ def clause_titles(prefer_optional: bool) -> list[str]:
187
+ _raw, text, fmt, _w = ex.load_source(path, prefer_optional=prefer_optional)
188
+ result = ex.build_extraction(text, raw, fmt, "h.docx")
189
+ return [c["canonical_title"] for c in result["clauses"]]
190
+
191
+ stdlib = clause_titles(False)
192
+ pydocx = clause_titles(True)
193
+ assert stdlib, "stdlib reader should detect the heading-styled clauses"
194
+ assert pydocx == stdlib, "python-docx path must agree with the stdlib reader"
155
195
  # A full sentence carrying a heading style is rejected (not a clause title).
156
196
  assert ex._docx_heading_title(
157
197
  "Either party may terminate this Agreement upon material breach that "
File without changes
File without changes
File without changes
File without changes
File without changes