extract-cli 0.1.9__tar.gz → 0.1.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {extract_cli-0.1.9 → extract_cli-0.1.11}/ARCHITECTURE.md +12 -5
  2. {extract_cli-0.1.9 → extract_cli-0.1.11}/CHANGELOG.md +52 -0
  3. {extract_cli-0.1.9 → extract_cli-0.1.11}/Makefile +4 -1
  4. {extract_cli-0.1.9 → extract_cli-0.1.11}/PKG-INFO +6 -5
  5. {extract_cli-0.1.9 → extract_cli-0.1.11}/README.md +5 -4
  6. {extract_cli-0.1.9 → extract_cli-0.1.11}/extract_cli.py +137 -61
  7. {extract_cli-0.1.9 → extract_cli-0.1.11}/pyproject.toml +1 -1
  8. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/employment_docx.docx.expected.json +2 -2
  9. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/heading_docx.docx.expected.json +2 -2
  10. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  11. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  12. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/nda_h2.md.expected.json +2 -2
  13. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
  14. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/scanned.pdf.expected.json +1 -1
  15. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/services_bold.txt.expected.json +1 -1
  16. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/services_html.html.expected.json +2 -2
  17. extract_cli-0.1.11/tests/test_coverage.py +254 -0
  18. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_deterministic.py +7 -0
  19. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_misc.py +40 -0
  20. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_schema_conformance.py +14 -0
  21. {extract_cli-0.1.9 → extract_cli-0.1.11}/.gitignore +0 -0
  22. {extract_cli-0.1.9 → extract_cli-0.1.11}/AGENTS.md +0 -0
  23. {extract_cli-0.1.9 → extract_cli-0.1.11}/CONTRIBUTING.md +0 -0
  24. {extract_cli-0.1.9 → extract_cli-0.1.11}/LICENSE +0 -0
  25. {extract_cli-0.1.9 → extract_cli-0.1.11}/config/llm.json.example +0 -0
  26. {extract_cli-0.1.9 → extract_cli-0.1.11}/docs/INTEROP.md +0 -0
  27. {extract_cli-0.1.9 → extract_cli-0.1.11}/docs/spec/extract-output.schema.json +0 -0
  28. {extract_cli-0.1.9 → extract_cli-0.1.11}/llms.txt +0 -0
  29. {extract_cli-0.1.9 → extract_cli-0.1.11}/scripts/release.py +0 -0
  30. {extract_cli-0.1.9 → extract_cli-0.1.11}/scripts/validate_against_spec.py +0 -0
  31. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/_fixtures_build.py +0 -0
  32. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/_make_goldens.py +0 -0
  33. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/_schema_validator.py +0 -0
  34. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/conftest.py +0 -0
  35. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/employment_docx.docx +0 -0
  36. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/heading_docx.docx +0 -0
  37. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/lease_allcaps.txt +0 -0
  38. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/license_pdf.pdf +0 -0
  39. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/nda_h2.md +0 -0
  40. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/numbered_docx.docx +0 -0
  41. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/scanned.pdf +0 -0
  42. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/services_bold.txt +0 -0
  43. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/services_html.html +0 -0
  44. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_clause_map.py +0 -0
  45. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_cli.py +0 -0
  46. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_llm.py +0 -0
  47. {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_property.py +0 -0
@@ -40,16 +40,23 @@ the "verify, not trust" contract downstream tools consume.
40
40
 
41
41
  ## The clause map
42
42
 
43
- `detect_clauses(text)` is a faithful port of template-vault-cli's three-tier
44
- cascade; the first tier that fires wins so fallbacks never shadow real
45
- structure:
43
+ `detect_clauses(text)` extends template-vault-cli's clause cascade; the first
44
+ tier that fires wins so fallbacks never shadow real structure:
46
45
 
47
- 1. **`h2`** — `## Heading` (Markdown-native). Needs 1 match.
46
+ 1. **`h2`** — `## Heading` (Markdown-native; also what the DOCX reader emits for
47
+ Word heading styles / `w:numPr` paragraphs). Needs ≥ 1 match.
48
48
  2. **`bold-numbered`** — `**1. Purpose**`, `**Section 4. Term**` (typical of
49
49
  DOCX → text). Needs ≥ 2 matches.
50
- 3. **`all-caps`** — blank-line-framed `CONFIDENTIALITY` lines (typical of legal
50
+ 3. **`numbered`** — plain `1. Term`, `Section 3. Payment`, and two-line
51
+ `ARTICLE N` + title (the dominant format in foreign paper), gated by a
52
+ title-case heuristic. Needs ≥ 2 matches.
53
+ 4. **`all-caps`** — blank-line-framed `CONFIDENTIALITY` lines (typical of legal
51
54
  PDFs), with the single-token-≥-4-letters rule. Needs ≥ 2 matches.
52
55
 
56
+ (Plus an opt-in **`llm`** clause-map fallback under `--llm` when none of the
57
+ above fire — see the LLM tier below.) After detection, running headers/footers
58
+ and front/back-matter are filtered (`_is_noise_clause_title` + repeat dedup).
59
+
53
60
  `_strip_clause_number` removes leading numbering, including Roman numerals
54
61
  1–39 (`_ROMAN_RE` lists longer alternatives first so the engine doesn't
55
62
  short-circuit on a prefix — bare `V`/`X` match).
@@ -6,6 +6,56 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.11] - 2026-05-22
10
+
11
+ Polish pass.
12
+
13
+ ### Fixed
14
+ - **Signature blocks no longer capture the next column's label.** A two-column
15
+ unsigned block (`By: By:` / `Name: Name:`) used to yield garbage
16
+ signatories like `{"name": "By:", "title": "Title:"}`; such captures (and
17
+ blank fill lines) are now rejected, so an unsigned template correctly returns
18
+ no signatories.
19
+
20
+ ### Changed
21
+ - **`extract fields` and `--format table` now surface `jurisdiction`,
22
+ `amounts`, and `signatories`** — they were extracted and in the JSON but not
23
+ discoverable via the catalog or the human table view. A drift-guard test now
24
+ asserts `extract fields` can't diverge from the output schema.
25
+ - **Confidence values centralized into a documented scale** (named `CONF_*`
26
+ constants with a single descending ladder, replacing scattered magic numbers)
27
+ so downstream "verify, not trust" thresholds are principled. The only value
28
+ change: an affirmative auto-renewal is now `0.70` (was `0.65`), matching the
29
+ other best-effort term fields.
30
+ - Docs sweep: refreshed the clause-cascade description (h2 → bold-numbered →
31
+ numbered → all-caps, + the `--llm` fallback) across README/ARCHITECTURE and
32
+ the output-shape example. Line coverage held at 100% (CI-gated).
33
+
34
+ ## [0.1.10] - 2026-05-22
35
+
36
+ ### Fixed
37
+ - **The `[docx]` (python-docx) reader now honors Word heading styles**, matching
38
+ the stdlib reader. Previously the python-docx path concatenated paragraph text
39
+ and dropped `Heading1-9`/`Title` styles and `w:numPr` numbering, so installing
40
+ the `[docx]` extra produced an **empty clause map** on heading-styled Word
41
+ contracts (worse than the no-extra stdlib reader). Both readers now share one
42
+ emitter (`_emit_docx_paragraph`) that turns heading-styled / auto-numbered
43
+ paragraphs into `## headings`, so the two paths agree. New tests:
44
+ `test_emit_docx_paragraph` and `test_docx_readers_agree_on_clause_map` (the
45
+ latter asserts the python-docx and stdlib readers produce the same clause map).
46
+ No output-schema change.
47
+
48
+ ### Tests / quality
49
+ - **Line coverage raised to 100%** (was 92%/94%). Added a targeted test battery
50
+ for the remaining reachable branches (color/`FORCE_COLOR`, `_warn` silent,
51
+ date/jurisdiction/title/clause edge returns, LLM request/parse/clause-map
52
+ branches, PDF `TJ`-array + stream/budget edges, HTML malformed fallback, DOCX
53
+ empty paragraph, `_is_low_signal` branches, CLI silent/help paths). Genuinely
54
+ unreachable defensive lines and `[docx]`/`[pdf]`-extra fidelity branches are
55
+ marked `# pragma: no cover`. `make coverage` now installs the extras and
56
+ enforces `--fail-under=100`; a CI `coverage` job gates it. No code-behavior or
57
+ schema change.
58
+
9
59
  ## [0.1.9] - 2026-05-22
10
60
 
11
61
  ### Security / robustness
@@ -271,6 +321,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
271
321
  intentionally *not* governed by the output schema (the schema describes the
272
322
  full default output).
273
323
 
324
+ [0.1.11]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.11
325
+ [0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
274
326
  [0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
275
327
  [0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
276
328
  [0.1.7]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.7
@@ -31,8 +31,11 @@ test-quick:
31
31
  $(PYTHON) -m pytest -x -q -k "not property"
32
32
 
33
33
  coverage:
34
+ # Install the [docx]/[pdf] extras so the fidelity-reader paths execute too;
35
+ # without them two extras-only branches stay uncovered (98% vs 100%).
36
+ $(PIP) install -q -e ".[dev,docx,pdf]"
34
37
  $(PYTHON) -m coverage run --source=extract_cli -m pytest -q
35
- $(PYTHON) -m coverage report -m
38
+ $(PYTHON) -m coverage report -m --fail-under=100
36
39
 
37
40
  typecheck:
38
41
  $(PYTHON) -m mypy --strict extract_cli.py
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.9
3
+ Version: 0.1.11
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -180,16 +180,17 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
180
180
  "value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
181
181
  "amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
182
182
  "signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
183
- "_meta": { "extractor_version": "0.1.9", "tiers_used": ["deterministic"], "llm_used": false }
183
+ "_meta": { "extractor_version": "0.1.11", "tiers_used": ["deterministic"], "llm_used": false }
184
184
  }
185
185
  ```
186
186
 
187
187
  ## The clause map (the differentiator)
188
188
 
189
189
  A counterparty's "SECTION 7. NON-DISCLOSURE" and your template's
190
- "## Confidentiality" are the same clause. `extract-cli` reuses
191
- template-vault-cli's **clause-detection cascade** (Tier 1 `## H2` headings →
192
- Tier 2 bold-numbered `**1. …**` → Tier 3 ALL-CAPS lines) and a built-in
190
+ "## Confidentiality" are the same clause. `extract-cli` extends
191
+ template-vault-cli's **clause-detection cascade** `## H2` headings →
192
+ bold-numbered `**1. …**` → plain numbered (`1. Term`, `Section 3. …`, two-line
193
+ `ARTICLE N`) → ALL-CAPS lines (and an opt-in `--llm` fallback) — plus a built-in
193
194
  **canonical alias vocabulary** to normalize foreign clause titles onto the
194
195
  names the rest of the suite already speaks. Clauses it can't map are kept with
195
196
  `mapped: false` (and a `*` in the table view) so nothing is silently dropped.
@@ -142,16 +142,17 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
142
142
  "value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
143
143
  "amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
144
144
  "signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
145
- "_meta": { "extractor_version": "0.1.9", "tiers_used": ["deterministic"], "llm_used": false }
145
+ "_meta": { "extractor_version": "0.1.11", "tiers_used": ["deterministic"], "llm_used": false }
146
146
  }
147
147
  ```
148
148
 
149
149
  ## The clause map (the differentiator)
150
150
 
151
151
  A counterparty's "SECTION 7. NON-DISCLOSURE" and your template's
152
- "## Confidentiality" are the same clause. `extract-cli` reuses
153
- template-vault-cli's **clause-detection cascade** (Tier 1 `## H2` headings →
154
- Tier 2 bold-numbered `**1. …**` → Tier 3 ALL-CAPS lines) and a built-in
152
+ "## Confidentiality" are the same clause. `extract-cli` extends
153
+ template-vault-cli's **clause-detection cascade** `## H2` headings →
154
+ bold-numbered `**1. …**` → plain numbered (`1. Term`, `Section 3. …`, two-line
155
+ `ARTICLE N`) → ALL-CAPS lines (and an opt-in `--llm` fallback) — plus a built-in
155
156
  **canonical alias vocabulary** to normalize foreign clause titles onto the
156
157
  names the rest of the suite already speaks. Clauses it can't map are kept with
157
158
  `mapped: false` (and a `*` in the table view) so nothing is silently dropped.
@@ -43,11 +43,11 @@ import urllib.request
43
43
  from pathlib import Path
44
44
  from typing import Any, Dict, List, Optional, Tuple
45
45
 
46
- __version__ = "0.1.9"
46
+ __version__ = "0.1.11"
47
47
 
48
48
  # Bumped independently of the package version when the *extraction logic*
49
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
50
- EXTRACTOR_VERSION = "0.1.9"
50
+ EXTRACTOR_VERSION = "0.1.11"
51
51
 
52
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
53
53
  SCHEMA_VERSION = 1
@@ -503,6 +503,42 @@ def _none_field() -> JSON:
503
503
  return {"value": None, "confidence": 0.0, "source": "none"}
504
504
 
505
505
 
506
+ # --- Confidence scale -------------------------------------------------------
507
+ # These confidences are "verify, not trust" hints in [0, 1] -- a ranking of
508
+ # *structural certainty*, not calibrated probabilities. Higher means the
509
+ # extraction rests on more unambiguous structure; lower means a looser heuristic
510
+ # or an LLM guess. Downstream tools threshold on them, so they are centralized
511
+ # here and ordered into a single descending ladder rather than scattered as
512
+ # magic numbers:
513
+ #
514
+ # .95 explicit Markdown H2 heading
515
+ # .90 strong unambiguous pattern (parties "between X and Y"; labeled date)
516
+ # .85 clear keyword/structure (governing law; ISO date; bold-numbered heading)
517
+ # .80 keyworded but looser (plain numbered/ARTICLE heading; jurisdiction code)
518
+ # .75 structural-only heading (ALL-CAPS)
519
+ # .70 best-effort regex on common phrasing (term length, notice, auto-renew)
520
+ # .60 weak heuristic / LLM-enriched scalar (value, amounts, defined terms)
521
+ # .55 loose match (signature block, LLM obligations, non-ISO raw date)
522
+ # .50 fuzzy (LLM clause-map fallback)
523
+ CONF_H2 = 0.95
524
+ CONF_PARTIES = 0.90
525
+ CONF_DATE_LABELED = 0.90
526
+ CONF_DATE_ISO = 0.85
527
+ CONF_GOVERNING_LAW = 0.85
528
+ CONF_BOLD_HEADING = 0.85
529
+ CONF_NUMBERED_HEADING = 0.80
530
+ CONF_JURISDICTION = 0.80
531
+ CONF_ALLCAPS_HEADING = 0.75
532
+ CONF_TERM = 0.70
533
+ CONF_WEAK = 0.60
534
+ CONF_LLM = 0.60
535
+ CONF_DATE_RAW = 0.55
536
+ CONF_LLM_LIST = 0.55
537
+ CONF_SIGNATORY = 0.55
538
+ CONF_LLM_CLAUSE = 0.50
539
+ CONF_UNMAPPED_FACTOR = 0.75 # multiplier applied to a clause that doesn't map to the vocabulary
540
+
541
+
506
542
  def _titlecase(s: str) -> str:
507
543
  s = s.strip()
508
544
  if not s:
@@ -675,7 +711,7 @@ def _date_field_from_str(raw: str, base_conf: float) -> JSON:
675
711
  def _date_field(match: Optional["re.Match[str]"]) -> JSON:
676
712
  if match is None:
677
713
  return _none_field()
678
- return _date_field_from_str(match.group(1), 0.85)
714
+ return _date_field_from_str(match.group(1), CONF_DATE_ISO)
679
715
 
680
716
 
681
717
  # Trailing descriptors that follow a party's actual name and should be dropped
@@ -739,7 +775,7 @@ def extract_parties(text: str) -> List[JSON]:
739
775
  name, role = _split_name_role(raw)
740
776
  if not name or len(name) < 2 or len(name) > 120:
741
777
  continue
742
- entry: JSON = {"name": name, "confidence": 0.9, "source": "deterministic"}
778
+ entry: JSON = {"name": name, "confidence": CONF_PARTIES, "source": "deterministic"}
743
779
  entry["role"] = role
744
780
  out.append(entry)
745
781
  return out
@@ -748,7 +784,7 @@ def extract_parties(text: str) -> List[JSON]:
748
784
  def extract_dates(text: str) -> JSON:
749
785
  label = _EFFDATE_LABEL_RE.search(text)
750
786
  if label is not None:
751
- effective = _date_field_from_str(label.group(1), 0.9)
787
+ effective = _date_field_from_str(label.group(1), CONF_DATE_LABELED)
752
788
  else:
753
789
  effective = _date_field(_EFFECTIVE_RE.search(text))
754
790
  return {"effective": effective, "expiration": _date_field(_EXPIRE_RE.search(text))}
@@ -759,9 +795,9 @@ def extract_governing_law(text: str) -> JSON:
759
795
  if not m:
760
796
  return _none_field()
761
797
  juris = re.sub(r"\s+", " ", m.group(1).strip().rstrip(".,")).strip()
762
- if not juris:
798
+ if not juris: # pragma: no cover - the capture group requires a leading letter
763
799
  return _none_field()
764
- return _field(juris, 0.85)
800
+ return _field(juris, CONF_GOVERNING_LAW)
765
801
 
766
802
 
767
803
  def extract_term(text: str) -> JSON:
@@ -773,20 +809,20 @@ def extract_term(text: str) -> JSON:
773
809
  # Only emit when the captured token is a real number; otherwise the
774
810
  # match was a coincidence ("...consecutive days") -> leave as not-found.
775
811
  if num is not None:
776
- length = _field(f"{num} {unit}{'s' if num != 1 else ''}", 0.7)
812
+ length = _field(f"{num} {unit}{'s' if num != 1 else ''}", CONF_TERM)
777
813
 
778
814
  notice = _none_field()
779
815
  nm = _NOTICE_RE.search(text)
780
816
  if nm:
781
817
  days = _word_to_int(nm.group(1))
782
818
  if days is not None:
783
- notice = _field(days, 0.7)
819
+ notice = _field(days, CONF_TERM)
784
820
 
785
821
  auto = _none_field()
786
822
  if _AUTORENEW_NEG_RE.search(text):
787
- auto = _field(False, 0.7)
823
+ auto = _field(False, CONF_TERM)
788
824
  elif _AUTORENEW_POS_RE.search(text):
789
- auto = _field(True, 0.65)
825
+ auto = _field(True, CONF_TERM)
790
826
 
791
827
  return {"length": length, "auto_renew": auto, "notice_period_days": notice}
792
828
 
@@ -795,7 +831,7 @@ def extract_value(text: str) -> JSON:
795
831
  m = _MONEY_RE.search(text)
796
832
  if not m:
797
833
  return _none_field()
798
- return _field(re.sub(r"\s+", " ", m.group(0).strip()), 0.6)
834
+ return _field(re.sub(r"\s+", " ", m.group(0).strip()), CONF_WEAK)
799
835
 
800
836
 
801
837
  def extract_amounts(text: str) -> List[JSON]:
@@ -807,7 +843,7 @@ def extract_amounts(text: str) -> List[JSON]:
807
843
  seen.setdefault(amt, None)
808
844
  if len(seen) >= 30:
809
845
  break
810
- return [{"value": a, "confidence": 0.6, "source": "deterministic"} for a in seen]
846
+ return [{"value": a, "confidence": CONF_WEAK, "source": "deterministic"} for a in seen]
811
847
 
812
848
 
813
849
  # Signature blocks: "By: <name>", "Name: <name>", "Printed Name: <name>".
@@ -820,20 +856,32 @@ _SIG_TITLE_RE = re.compile(
820
856
  r"(?:^|\n)[ \t]*(?:Title|Its)[ \t]*:[ \t]*([^\n_{}\[\]]{2,60})",
821
857
  re.IGNORECASE,
822
858
  )
859
+ # A captured value is rejected when it's really the next column's label (common
860
+ # in two-column signature blocks: "By: By:") or a blank fill line.
861
+ _SIG_LABEL_RE = re.compile(r"(?:by|name|title|signature|its|date|signed|print)\b", re.IGNORECASE)
862
+
863
+
864
+ def _clean_sig_value(raw: str) -> Optional[str]:
865
+ v = re.sub(r"\s+", " ", raw).strip(" .,:")
866
+ if (len(v) < 2 or v.lower() == "the"
867
+ or not any(c.isalpha() for c in v)
868
+ or _SIG_LABEL_RE.match(v)):
869
+ return None
870
+ return v
823
871
 
824
872
 
825
873
  def extract_signatories(text: str) -> List[JSON]:
826
874
  """Best-effort signature-block names (and titles, when adjacent). Skips
827
875
  unfilled placeholders. Blank on a template; populated on executed paper."""
828
- titles = [re.sub(r"\s+", " ", m.group(1)).strip(" .,") for m in _SIG_TITLE_RE.finditer(text)]
876
+ titles = [_clean_sig_value(m.group(1)) for m in _SIG_TITLE_RE.finditer(text)]
829
877
  out: List[JSON] = []
830
878
  seen: Dict[str, None] = {}
831
879
  for i, m in enumerate(_SIGNATORY_RE.finditer(text)):
832
- name = re.sub(r"\s+", " ", m.group(1)).strip(" .,")
833
- if len(name) < 2 or name.lower() in ("the", "name", "title") or name in seen:
880
+ name = _clean_sig_value(m.group(1))
881
+ if name is None or name in seen:
834
882
  continue
835
883
  seen[name] = None
836
- entry: JSON = {"name": name, "confidence": 0.55, "source": "deterministic"}
884
+ entry: JSON = {"name": name, "confidence": CONF_SIGNATORY, "source": "deterministic"}
837
885
  entry["title"] = titles[i] if i < len(titles) else None
838
886
  out.append(entry)
839
887
  if len(out) >= 12:
@@ -869,7 +917,7 @@ def extract_jurisdiction(governing_law: JSON) -> JSON:
869
917
  if len(name) >= 5 and name in key:
870
918
  code = c
871
919
  break
872
- return _field(code, 0.8, "deterministic") if code else _none_field()
920
+ return _field(code, CONF_JURISDICTION, "deterministic") if code else _none_field()
873
921
 
874
922
 
875
923
  def extract_defined_terms(text: str) -> List[JSON]:
@@ -880,12 +928,12 @@ def extract_defined_terms(text: str) -> List[JSON]:
880
928
  # Reject sentence-like or lowercase-y captures.
881
929
  if len(term) < 2 or len(term.split()) > 6:
882
930
  continue
883
- if not term[0].isupper():
931
+ if not term[0].isupper(): # pragma: no cover - the regexes require an uppercase lead
884
932
  continue
885
933
  seen.setdefault(term, None)
886
934
  if len(seen) >= 50:
887
935
  break
888
- return [{"term": t, "confidence": 0.6, "source": "deterministic"} for t in seen]
936
+ return [{"term": t, "confidence": CONF_WEAK, "source": "deterministic"} for t in seen]
889
937
 
890
938
 
891
939
  # Detected-heading titles that are almost never real clauses: front/back-matter,
@@ -936,9 +984,9 @@ def extract_clauses(text: str) -> List[JSON]:
936
984
  continue
937
985
  canonical, mapped = _canonicalize_clause(c["title"])
938
986
  tier = c["tier"]
939
- base = {"h2": 0.95, "bold-numbered": 0.85, "numbered": 0.8,
940
- "all-caps": 0.75, "explicit": 0.95}.get(tier, 0.7)
941
- conf = round(base * (1.0 if mapped else 0.75), 2)
987
+ base = {"h2": CONF_H2, "bold-numbered": CONF_BOLD_HEADING, "numbered": CONF_NUMBERED_HEADING,
988
+ "all-caps": CONF_ALLCAPS_HEADING, "explicit": CONF_H2}.get(tier, CONF_TERM)
989
+ conf = round(base * (1.0 if mapped else CONF_UNMAPPED_FACTOR), 2)
942
990
  out.append({
943
991
  "canonical_title": canonical,
944
992
  "detected_title": c["detected"],
@@ -1075,13 +1123,20 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
1075
1123
  mod = importlib.import_module("docx")
1076
1124
  document_cls = getattr(mod, "Document")
1077
1125
  doc = document_cls(str(path))
1126
+ w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
1078
1127
  lines: List[str] = []
1079
1128
  for para in doc.paragraphs:
1080
1129
  line = (para.text or "").strip()
1081
- if line and para.runs and all(getattr(r, "bold", False) for r in para.runs if (r.text or "").strip()):
1082
- line = f"**{line}**"
1083
- lines.append(line)
1084
- for table in getattr(doc, "tables", []):
1130
+ # Read the style + numbering off the underlying element so the
1131
+ # cascade sees clause headings (the same logic the stdlib reader
1132
+ # applies); python-docx alone exposes neither as a heading.
1133
+ ppr = para._p.find(w + "pPr")
1134
+ style = _docx_paragraph_style(ppr, w)
1135
+ numbered = bool(ppr is not None and ppr.find(w + "numPr") is not None)
1136
+ all_bold = bool(para.runs) and all(
1137
+ getattr(r, "bold", False) for r in para.runs if (r.text or "").strip())
1138
+ _emit_docx_paragraph(lines, line, style, numbered, all_bold)
1139
+ for table in getattr(doc, "tables", []): # pragma: no cover - [docx] fidelity
1085
1140
  for row in table.rows:
1086
1141
  for cell in row.cells:
1087
1142
  ct = (cell.text or "").strip()
@@ -1130,6 +1185,30 @@ def _docx_heading_title(text: str) -> Optional[str]:
1130
1185
  return title
1131
1186
 
1132
1187
 
1188
+ def _emit_docx_paragraph(out: List[str], line: str, style: Optional[str],
1189
+ numbered: bool, all_bold: bool) -> None:
1190
+ """Append one .docx paragraph to `out` the way the clause cascade expects.
1191
+
1192
+ Heading-styled (Heading1-9/Title) or auto-numbered (`w:numPr`) paragraphs --
1193
+ whose visible number is auto-generated and absent from the text -- become a
1194
+ `## <title>` heading (with any run-in body split onto the next line) when the
1195
+ lead looks like a heading; a fully-bold paragraph becomes `**...**`; anything
1196
+ else stays plain. Shared by BOTH the python-docx and stdlib readers so the
1197
+ two paths agree on structure (the python-docx path used to flatten headings,
1198
+ losing the clause map on heading-styled Word docs)."""
1199
+ if not line:
1200
+ out.append("")
1201
+ return
1202
+ if _is_heading_style(style) or numbered:
1203
+ title = _docx_heading_title(line)
1204
+ if title is not None:
1205
+ out.append(f"## {title}")
1206
+ if len(title) < len(line):
1207
+ out.append(line[len(title):].lstrip(" .:\t"))
1208
+ return
1209
+ out.append(f"**{line}**" if all_bold else line)
1210
+
1211
+
1133
1212
  def _read_docx_stdlib(raw: bytes) -> str:
1134
1213
  import io
1135
1214
  import zipfile
@@ -1153,39 +1232,23 @@ def _read_docx_stdlib(raw: bytes) -> str:
1153
1232
  style = _docx_paragraph_style(ppr, w)
1154
1233
  numbered = ppr is not None and ppr.find(w + "numPr") is not None
1155
1234
  run_texts: List[str] = []
1156
- any_text = False
1157
1235
  all_bold = True
1158
1236
  for r in p.iter(w + "r"):
1159
1237
  rpr = r.find(w + "rPr")
1160
1238
  bold = rpr is not None and rpr.find(w + "b") is not None
1161
1239
  txt = "".join(t.text or "" for t in r.iter(w + "t"))
1162
1240
  if txt:
1163
- any_text = True
1164
1241
  if not bold:
1165
1242
  all_bold = False
1166
1243
  run_texts.append(txt)
1167
1244
  line = "".join(run_texts).strip()
1168
- if not line:
1169
- paras.append("")
1170
- continue
1171
1245
  # Clause structure in real Word contracts lives in heading STYLES
1172
1246
  # (Heading1-9/Title) or auto-NUMBERED paragraphs (w:numPr) -- in both the
1173
- # visible number is auto-generated and absent from the text. Emit such a
1174
- # paragraph as an H2 heading (strongest cascade tier) when its lead looks
1175
- # like a heading; _docx_heading_title rejects full-sentence body items
1176
- # (e.g. deep numbered sub-points), so this stays conservative. Keep any
1177
- # run-in body as a following paragraph.
1178
- if _is_heading_style(style) or numbered:
1179
- title = _docx_heading_title(line)
1180
- if title is not None:
1181
- paras.append(f"## {title}")
1182
- if len(title) < len(line):
1183
- paras.append(line[len(title):].lstrip(" .:\t"))
1184
- continue
1185
- # Not heading-like -> treat as ordinary body text.
1186
- if any_text and all_bold:
1187
- line = f"**{line}**"
1188
- paras.append(line)
1247
+ # visible number is auto-generated and absent from the text. The shared
1248
+ # emitter turns those into `## headings` (run-in body split off), bolds
1249
+ # fully-bold lines, and keeps the rest plain. _docx_heading_title rejects
1250
+ # full-sentence body items, so this stays conservative.
1251
+ _emit_docx_paragraph(paras, line, style, numbered, all_bold)
1189
1252
  return "\n\n".join(paras)
1190
1253
 
1191
1254
 
@@ -1209,7 +1272,7 @@ def _read_pdf(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str
1209
1272
  warnings.append(f"pypdf read failed ({e}); falling back to stdlib reader")
1210
1273
  try:
1211
1274
  text = _read_pdf_stdlib(raw)
1212
- except Exception as e:
1275
+ except Exception as e: # pragma: no cover - defensive; stdlib reader is bomb-guarded
1213
1276
  warnings.append(f"could not parse .pdf ({e}); treating as empty")
1214
1277
  return "", warnings
1215
1278
  return text, warnings
@@ -1342,7 +1405,7 @@ def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, s
1342
1405
  raise ExtractError(f"path is a directory, not a file: {path}")
1343
1406
  try:
1344
1407
  size = path.stat().st_size
1345
- except OSError:
1408
+ except OSError: # pragma: no cover - defensive; path.exists() already passed
1346
1409
  size = 0
1347
1410
  if size > MAX_INPUT_BYTES:
1348
1411
  raise ExtractError(
@@ -1557,7 +1620,7 @@ def _llm_clause_map(raw: Any, text: str) -> List[JSON]:
1557
1620
  "detected_title": title,
1558
1621
  "tier": "llm",
1559
1622
  "span": span,
1560
- "confidence": 0.5,
1623
+ "confidence": CONF_LLM_CLAUSE,
1561
1624
  "source": "llm",
1562
1625
  "mapped": mapped,
1563
1626
  })
@@ -1592,18 +1655,18 @@ def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
1592
1655
  enriched = False
1593
1656
  rm = obj.get("renewal_mechanics")
1594
1657
  if isinstance(rm, str) and rm.strip():
1595
- result["term"]["renewal_mechanics"] = _field(rm.strip(), 0.6, "llm")
1658
+ result["term"]["renewal_mechanics"] = _field(rm.strip(), CONF_LLM, "llm")
1596
1659
  enriched = True
1597
1660
  obligations = obj.get("obligations")
1598
1661
  if isinstance(obligations, list) and obligations:
1599
1662
  result["obligations"] = [
1600
- {"text": str(o).strip(), "confidence": 0.55, "source": "llm"}
1663
+ {"text": str(o).strip(), "confidence": CONF_LLM_LIST, "source": "llm"}
1601
1664
  for o in obligations[:5] if str(o).strip()
1602
1665
  ]
1603
1666
  enriched = True
1604
1667
  gl = obj.get("governing_law")
1605
1668
  if isinstance(gl, str) and gl.strip() and result["governing_law"]["source"] == "none":
1606
- result["governing_law"] = _field(gl.strip(), 0.6, "llm")
1669
+ result["governing_law"] = _field(gl.strip(), CONF_LLM, "llm")
1607
1670
  enriched = True
1608
1671
  if want_clauses:
1609
1672
  cmap = _llm_clause_map(obj.get("clauses"), text)
@@ -1692,10 +1755,20 @@ def render_table(result: JSON, no_confidence: bool) -> str:
1692
1755
  lines.append(f" renewal : {_fv(term['renewal_mechanics'])} {_dim('[llm]')}")
1693
1756
  if "governing_law" in result:
1694
1757
  lines.append(_bold("Governing law"))
1695
- lines.append(f" {_fv(result['governing_law'])}")
1758
+ juris = result.get("jurisdiction", {}).get("value")
1759
+ suffix = _dim(f" [{juris}]") if juris else ""
1760
+ lines.append(f" {_fv(result['governing_law'])}{suffix}")
1696
1761
  if "value" in result:
1762
+ amts = result.get("amounts") or []
1763
+ extra = _dim(f" (+{len(amts) - 1} more)") if len(amts) > 1 else ""
1697
1764
  lines.append(_bold("Value"))
1698
- lines.append(f" {_fv(result['value'])}")
1765
+ lines.append(f" {_fv(result['value'])}{extra}")
1766
+ signatories = result.get("signatories")
1767
+ if signatories:
1768
+ lines.append(_bold(f"Signatories ({len(signatories)})"))
1769
+ for s in signatories[:6]:
1770
+ title = f" - {s['title']}" if s.get("title") else ""
1771
+ lines.append(f" {s['name']}{title}")
1699
1772
  clauses = result.get("clauses")
1700
1773
  if clauses is not None:
1701
1774
  lines.append(_bold(f"Clause map ({len(clauses)})"))
@@ -1920,11 +1993,14 @@ FIELD_CATALOG: Tuple[Tuple[str, str, str], ...] = (
1920
1993
  ("term.length", "deterministic", "Term length, best-effort"),
1921
1994
  ("term.notice_period_days", "deterministic", "Notice period in days, best-effort"),
1922
1995
  ("term.auto_renew", "deterministic", "Auto-renewal flag, best-effort"),
1923
- ("governing_law", "deterministic", "Governing law / jurisdiction"),
1996
+ ("governing_law", "deterministic", "Governing law text ('governed by the laws of ...')"),
1997
+ ("jurisdiction", "deterministic", "Governing law normalized to a code (e.g. US-DE)"),
1924
1998
  ("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary "
1925
1999
  "(LLM fallback under --llm when no headings are detected)"),
1926
2000
  ("defined_terms", "deterministic", "Defined-term inventory (quoted / parenthetical)"),
1927
2001
  ("value", "deterministic", "Headline monetary value"),
2002
+ ("amounts", "deterministic", "All distinct monetary amounts"),
2003
+ ("signatories", "deterministic", "Signature-block names/titles (By:/Name:/Title:)"),
1928
2004
  ("term.renewal_mechanics", "llm", "Renewal mechanics (fuzzy; --llm only)"),
1929
2005
  ("obligations", "llm", "Key obligation phrasing (fuzzy; --llm only)"),
1930
2006
  )
@@ -2315,7 +2391,7 @@ def main(argv: Optional[List[str]] = None) -> int:
2315
2391
  if hasattr(_stream, "reconfigure"):
2316
2392
  try:
2317
2393
  _stream.reconfigure(encoding="utf-8", errors="replace")
2318
- except Exception:
2394
+ except Exception: # pragma: no cover - defensive
2319
2395
  pass
2320
2396
 
2321
2397
  argv = sys.argv[1:] if argv is None else argv
@@ -2358,7 +2434,7 @@ def main(argv: Optional[List[str]] = None) -> int:
2358
2434
  if first in known:
2359
2435
  parser = build_parser()
2360
2436
  args = parser.parse_args(argv)
2361
- if not getattr(args, "func", None):
2437
+ if not getattr(args, "func", None): # pragma: no cover - argparse always sets func
2362
2438
  parser.print_help()
2363
2439
  return 0
2364
2440
  else:
@@ -2370,7 +2446,7 @@ def main(argv: Optional[List[str]] = None) -> int:
2370
2446
  except BrokenPipeError: # e.g. `extract foo.md | head`
2371
2447
  try:
2372
2448
  sys.stdout.close()
2373
- except Exception:
2449
+ except Exception: # pragma: no cover - defensive
2374
2450
  pass
2375
2451
  return 0
2376
2452
  except KeyboardInterrupt: # pragma: no cover
@@ -2378,5 +2454,5 @@ def main(argv: Optional[List[str]] = None) -> int:
2378
2454
  return 130
2379
2455
 
2380
2456
 
2381
- if __name__ == "__main__":
2457
+ if __name__ == "__main__": # pragma: no cover
2382
2458
  sys.exit(main())
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.9"
7
+ version = "0.1.11"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -39,7 +39,7 @@
39
39
  },
40
40
  "auto_renew": {
41
41
  "value": true,
42
- "confidence": 0.65,
42
+ "confidence": 0.7,
43
43
  "source": "deterministic"
44
44
  },
45
45
  "notice_period_days": {
@@ -151,7 +151,7 @@
151
151
  ],
152
152
  "signatories": [],
153
153
  "_meta": {
154
- "extractor_version": "0.1.9",
154
+ "extractor_version": "0.1.11",
155
155
  "tiers_used": [
156
156
  "deterministic"
157
157
  ],
@@ -39,7 +39,7 @@
39
39
  },
40
40
  "auto_renew": {
41
41
  "value": true,
42
- "confidence": 0.65,
42
+ "confidence": 0.7,
43
43
  "source": "deterministic"
44
44
  },
45
45
  "notice_period_days": {
@@ -140,7 +140,7 @@
140
140
  "amounts": [],
141
141
  "signatories": [],
142
142
  "_meta": {
143
- "extractor_version": "0.1.9",
143
+ "extractor_version": "0.1.11",
144
144
  "tiers_used": [
145
145
  "deterministic"
146
146
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.9",
149
+ "extractor_version": "0.1.11",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.9",
149
+ "extractor_version": "0.1.11",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -39,7 +39,7 @@
39
39
  },
40
40
  "auto_renew": {
41
41
  "value": true,
42
- "confidence": 0.65,
42
+ "confidence": 0.7,
43
43
  "source": "deterministic"
44
44
  },
45
45
  "notice_period_days": {
@@ -150,7 +150,7 @@
150
150
  "amounts": [],
151
151
  "signatories": [],
152
152
  "_meta": {
153
- "extractor_version": "0.1.9",
153
+ "extractor_version": "0.1.11",
154
154
  "tiers_used": [
155
155
  "deterministic"
156
156
  ],
@@ -140,7 +140,7 @@
140
140
  "amounts": [],
141
141
  "signatories": [],
142
142
  "_meta": {
143
- "extractor_version": "0.1.9",
143
+ "extractor_version": "0.1.11",
144
144
  "tiers_used": [
145
145
  "deterministic"
146
146
  ],
@@ -55,7 +55,7 @@
55
55
  "amounts": [],
56
56
  "signatories": [],
57
57
  "_meta": {
58
- "extractor_version": "0.1.9",
58
+ "extractor_version": "0.1.11",
59
59
  "tiers_used": [
60
60
  "deterministic"
61
61
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.9",
149
+ "extractor_version": "0.1.11",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -39,7 +39,7 @@
39
39
  },
40
40
  "auto_renew": {
41
41
  "value": true,
42
- "confidence": 0.65,
42
+ "confidence": 0.7,
43
43
  "source": "deterministic"
44
44
  },
45
45
  "notice_period_days": {
@@ -161,7 +161,7 @@
161
161
  ],
162
162
  "signatories": [],
163
163
  "_meta": {
164
- "extractor_version": "0.1.9",
164
+ "extractor_version": "0.1.11",
165
165
  "tiers_used": [
166
166
  "deterministic"
167
167
  ],
@@ -0,0 +1,254 @@
1
+ """Targeted tests that exercise the remaining reachable branches, to keep line
2
+ coverage at its practical maximum. (Genuinely-unreachable defensive lines and
3
+ [docx]/[pdf]-extra fidelity branches are marked `# pragma: no cover` in the
4
+ source.)"""
5
+ from __future__ import annotations
6
+
7
+ import argparse
8
+ import io
9
+ import json
10
+ import sys as _sys
11
+ import zipfile
12
+ from typing import Any
13
+
14
+ import pytest
15
+
16
+ import extract_cli as ex
17
+ from tests.conftest import FIXTURES
18
+
19
+
20
+ def _ns(**kw: object) -> argparse.Namespace:
21
+ base = {"silent": False, "why": False}
22
+ base.update(kw)
23
+ return argparse.Namespace(**base)
24
+
25
+
26
+ # --- color + warn -----------------------------------------------------------
27
+
28
+ def test_color_force_on_and_isatty_exception(monkeypatch: pytest.MonkeyPatch) -> None:
29
+ monkeypatch.delenv("NO_COLOR", raising=False)
30
+ monkeypatch.setenv("FORCE_COLOR", "1")
31
+ assert ex._color_enabled() is True
32
+ assert ex._c("x", "32") == "\033[32mx\033[0m"
33
+ monkeypatch.delenv("FORCE_COLOR", raising=False)
34
+
35
+ class _Bad:
36
+ def isatty(self) -> bool:
37
+ raise ValueError("boom")
38
+ assert ex._color_enabled(_Bad()) is False
39
+
40
+
41
+ def test_warn_silent_is_suppressed(capsys: pytest.CaptureFixture[str]) -> None:
42
+ ex._warn(_ns(silent=True), "hush")
43
+ assert capsys.readouterr().err == ""
44
+
45
+
46
+ # --- small helpers ----------------------------------------------------------
47
+
48
+ def test_titlecase_edges() -> None:
49
+ assert ex._titlecase(" ") == ""
50
+ assert ex._titlecase("IP Rights") == "IP Rights" # acronym preserved in mixed case
51
+
52
+
53
+ def test_word_to_int_digit_and_unknown() -> None:
54
+ assert ex._word_to_int("30") == 30
55
+ assert ex._word_to_int("zzz") is None
56
+
57
+
58
+ def test_date_parse_none_and_unparseable_raw() -> None:
59
+ assert ex._parse_date_to_iso("not a date") is None
60
+ f = ex._date_field_from_str("13/13/2024", 0.85) # matches shape, invalid month
61
+ assert f["source"] == "deterministic" and f["confidence"] < 0.85
62
+
63
+
64
+ def test_canonicalize_empty_key() -> None:
65
+ assert ex._canonicalize_clause(" ") == (None, False)
66
+ assert ex._canonicalize_clause("1.") == (None, False)
67
+
68
+
69
+ def test_governing_law_and_title_none() -> None:
70
+ assert ex.extract_governing_law("no law clause here")["source"] == "none"
71
+ assert ex.extract_title("", None, "text") is None
72
+
73
+
74
+ def test_defined_terms_long_and_capped() -> None:
75
+ long_phrase = '"This Is A Very Long Quoted Heading Phrase Indeed"' # > 6 words
76
+ many = " ".join(f'"Term {i}"' for i in range(60))
77
+ terms = [t["term"] for t in ex.extract_defined_terms(long_phrase + " " + many)]
78
+ assert not any("Very Long" in t for t in terms)
79
+ assert len(terms) <= 50
80
+
81
+
82
+ def test_noise_placeholder_midstring() -> None:
83
+ # Placeholder not at the start -> the mid-string regex branch.
84
+ assert ex._is_noise_clause_title("Fee [ # ]% Cap")
85
+ assert ex._is_noise_clause_title("{placeholder}")
86
+
87
+
88
+ # --- format / readers -------------------------------------------------------
89
+
90
+ def test_detect_format_by_magic_bytes(tmp_path: Any) -> None:
91
+ p = tmp_path / "x.dat"
92
+ p.write_bytes(b"%PDF-1.4\nrest")
93
+ assert ex._detect_format(p, p.read_bytes()) == "pdf"
94
+ q = tmp_path / "y.dat"
95
+ q.write_bytes(b"PK\x03\x04rest")
96
+ assert ex._detect_format(q, q.read_bytes()) == "docx"
97
+
98
+
99
+ def test_pdf_stream_without_endstream() -> None:
100
+ assert ex._read_pdf_stdlib(b"%PDF\nstream\n(text) Tj") == ""
101
+
102
+
103
+ def test_pdf_decompression_budget_break(monkeypatch: pytest.MonkeyPatch) -> None:
104
+ import zlib
105
+ monkeypatch.setattr(ex, "MAX_DECOMPRESSED_BYTES", 10)
106
+ blob = b"%PDF\nstream\n" + zlib.compress(b"(Hello World) Tj " * 10) + b"\nendstream"
107
+ assert ex._read_pdf_stdlib(blob) == "" # exceeds the tiny budget -> bail, no text
108
+
109
+
110
+ def test_html_malformed_falls_back(monkeypatch: pytest.MonkeyPatch) -> None:
111
+ def boom(self: object, data: object) -> None:
112
+ raise ValueError("bad markup")
113
+ monkeypatch.setattr(ex._HTMLTextExtractor, "feed", boom)
114
+ out = ex._read_html("<p>hello <b>world</b></p>")
115
+ assert "hello" in out and "<" not in out # crude tag-strip fallback
116
+
117
+
118
+ def test_docx_empty_paragraph_stdlib() -> None:
119
+ w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
120
+ body = '<w:p/><w:p><w:r><w:t>Hello</w:t></w:r></w:p>'
121
+ doc = f'<?xml version="1.0"?><w:document xmlns:w="{w}"><w:body>{body}</w:body></w:document>'
122
+ buf = io.BytesIO()
123
+ with zipfile.ZipFile(buf, "w") as z:
124
+ z.writestr("[Content_Types].xml", "<Types/>")
125
+ z.writestr("word/document.xml", doc)
126
+ assert "Hello" in ex._read_docx_stdlib(buf.getvalue())
127
+
128
+
129
+ # --- clause detection edges -------------------------------------------------
130
+
131
+ def test_clause_heading_on_last_line() -> None:
132
+ clauses = ex.detect_clauses("## First\n\nbody text\n\n## Last") # no trailing newline
133
+ assert clauses[-1]["title"] == "Last"
134
+
135
+
136
+ def test_two_line_article_skips_non_heading_next_line() -> None:
137
+ text = ("ARTICLE I\n\nThis whole next line is a long running sentence, not a heading at all.\n\n"
138
+ "ARTICLE II\n\nCONFIDENTIALITY\n\nbody\n\nARTICLE III\n\nGOVERNING LAW\n\nbody")
139
+ titles = [c["title"] for c in ex.detect_clauses(text)]
140
+ assert "CONFIDENTIALITY" in titles and "GOVERNING LAW" in titles
141
+
142
+
143
+ def test_is_low_signal_each_branch() -> None:
144
+ def base() -> dict:
145
+ return {"parties": [], "clauses": [],
146
+ "dates": {"effective": ex._none_field(), "expiration": ex._none_field()},
147
+ "governing_law": ex._none_field(), "defined_terms": []}
148
+ r = base(); r["clauses"] = [{}]; assert ex._is_low_signal(r) is False
149
+ r = base(); r["dates"]["effective"] = ex._field("2024-01-01", 0.85); assert ex._is_low_signal(r) is False
150
+ r = base(); r["governing_law"] = ex._field("X", 0.8); assert ex._is_low_signal(r) is False
151
+ r = base(); r["defined_terms"] = [{"term": "X"}]; assert ex._is_low_signal(r) is False
152
+ assert ex._is_low_signal(base()) is True
153
+
154
+
155
+ # --- LLM internals (mocked transport) ---------------------------------------
156
+
157
+ class _Resp:
158
+ def __init__(self, body: bytes) -> None:
159
+ self._b = body
160
+
161
+ def read(self) -> bytes:
162
+ return self._b
163
+
164
+ def __enter__(self) -> "_Resp":
165
+ return self
166
+
167
+ def __exit__(self, *a: object) -> bool:
168
+ return False
169
+
170
+
171
+ def test_llm_request_openai_no_choices(monkeypatch: pytest.MonkeyPatch) -> None:
172
+ monkeypatch.setattr(ex.urllib.request, "urlopen",
173
+ lambda req, timeout=30.0: _Resp(json.dumps({"choices": []}).encode()))
174
+ assert ex._llm_request({"provider": "openai", "api_key": "k"}, "p") is None
175
+
176
+
177
+ def test_extract_json_object_invalid() -> None:
178
+ assert ex._extract_json_object("prefix {not valid json} suffix") is None
179
+
180
+
181
+ def test_llm_clause_map_skips() -> None:
182
+ cm = ex._llm_clause_map(
183
+ [{"title": ""}, 123, {"title": "Recitals"}, {"title": "Confidentiality"},
184
+ {"title": "Confidentiality"}], "Confidentiality body")
185
+ assert [c["canonical_title"] for c in cm] == ["Confidentiality"]
186
+
187
+
188
+ def test_load_llm_config_malformed(monkeypatch: pytest.MonkeyPatch, tmp_path: Any) -> None:
189
+ bad = tmp_path / "llm.json"
190
+ bad.write_text("{not json")
191
+ monkeypatch.setattr(ex, "LLM_CONFIG_PATHS", (bad,))
192
+ assert ex.load_llm_config() is None
193
+
194
+
195
+ def test_llm_enrich_empty_and_unparseable(monkeypatch: pytest.MonkeyPatch,
196
+ capsys: pytest.CaptureFixture[str]) -> None:
197
+ monkeypatch.setattr(ex, "load_llm_config", lambda: {"provider": "anthropic", "api_key": "k"})
198
+ text = "x"
199
+ monkeypatch.setattr(ex, "_llm_request", lambda c, p, timeout=30.0: "")
200
+ ex.llm_enrich(ex.build_extraction(text, text.encode(), "text", "x.txt"), text, _ns())
201
+ assert "no content" in capsys.readouterr().err
202
+ monkeypatch.setattr(ex, "_llm_request", lambda c, p, timeout=30.0: "not json at all")
203
+ ex.llm_enrich(ex.build_extraction(text, text.encode(), "text", "x.txt"), text, _ns())
204
+ assert "could not parse" in capsys.readouterr().err
205
+
206
+
207
+ # --- rendering / CLI edges --------------------------------------------------
208
+
209
+ def test_render_table_unmapped_legend() -> None:
210
+ r = ex.build_extraction("## Zorblax Provisions\n\nbody", b"x", "markdown", "x.md")
211
+ assert "* = not mapped" in ex.render_table(r, no_confidence=False)
212
+
213
+
214
+ def test_render_table_jurisdiction_amounts_signatories() -> None:
215
+ r = ex.build_extraction("body", b"x", "markdown", "x.md")
216
+ r["jurisdiction"] = ex._field("US-DE", ex.CONF_JURISDICTION)
217
+ r["amounts"] = [{"value": "$1", "confidence": 0.6, "source": "deterministic"},
218
+ {"value": "$2", "confidence": 0.6, "source": "deterministic"}]
219
+ r["signatories"] = [{"name": "Jane Doe", "title": "CEO",
220
+ "confidence": ex.CONF_SIGNATORY, "source": "deterministic"}]
221
+ table = ex.render_table(r, no_confidence=False)
222
+ assert "US-DE" in table
223
+ assert "+1 more" in table
224
+ assert "Signatories (1)" in table and "Jane Doe - CEO" in table
225
+
226
+
227
+ def test_cli_silent_table_suppresses_human_view(capsys: pytest.CaptureFixture[str]) -> None:
228
+ assert ex.main([str(FIXTURES / "nda_h2.md"), "--silent", "--format", "table"]) == 0
229
+ assert "Clause map" not in capsys.readouterr().out
230
+
231
+
232
+ def test_main_no_args_prints_help(capsys: pytest.CaptureFixture[str]) -> None:
233
+ assert ex.main([]) == 0
234
+ assert "usage" in capsys.readouterr().out.lower()
235
+
236
+
237
+ # --- last reachable edges ---------------------------------------------------
238
+
239
+ def test_parties_skips_empty_capture() -> None:
240
+ # The second "party" is just a parenthetical role -> cleans to an empty
241
+ # name and is skipped; the first is kept.
242
+ parties = ex.extract_parties('between Acme Corp and ("Receiving Party")')
243
+ assert [p["name"] for p in parties] == ["Acme Corp"]
244
+
245
+
246
+ def test_signatories_skips_dupes_short_and_reserved() -> None:
247
+ text = "By: Jane Doe\nName: Jane Doe\nName: a\nName: the\n"
248
+ s = ex.extract_signatories(text)
249
+ assert [x["name"] for x in s] == ["Jane Doe"]
250
+
251
+
252
+ def test_pdf_text_tj_array_branch() -> None:
253
+ # A TJ array of strings inside a text object.
254
+ assert ex._pdf_text_from_content(b"BT [(Hello) (World)] TJ ET") == "HelloWorld"
@@ -165,6 +165,13 @@ def test_signatories() -> None:
165
165
  assert ex.extract_signatories("Name: {party_1_signatory}\nBy: _____________") == []
166
166
 
167
167
 
168
+ def test_signatories_two_column_blank_block() -> None:
169
+ # An unsigned two-column block ("By: By:") must NOT capture the next
170
+ # column's label as a name.
171
+ text = "By: By:\nName: Name:\nTitle: Title:\n"
172
+ assert ex.extract_signatories(text) == []
173
+
174
+
168
175
  def test_value_money() -> None:
169
176
  assert ex.extract_value("a fee of $250,000 is due")["value"] == "$250,000"
170
177
  assert ex.extract_value("budget is USD 1.5 million")["value"].startswith("USD")
@@ -152,6 +152,46 @@ def test_docx_heading_style_helpers() -> None:
152
152
  # Run-in heading: title is the lead before the sentence body.
153
153
  assert ex._docx_heading_title("Payment. Customer will pay the fees.") == "Payment"
154
154
  assert ex._docx_heading_title("Governing Law") == "Governing Law"
155
+
156
+
157
+ def test_emit_docx_paragraph() -> None:
158
+ """The shared emitter both .docx readers use: heading styles / numbered
159
+ paragraphs become `## headings`, fully-bold lines become `**...**`."""
160
+ out: list[str] = []
161
+ ex._emit_docx_paragraph(out, "Confidentiality", "Heading2", False, False) # heading style
162
+ ex._emit_docx_paragraph(out, "Term", None, True, False) # auto-numbered
163
+ ex._emit_docx_paragraph(out, "Important Notice", None, False, True) # fully bold
164
+ ex._emit_docx_paragraph(out, "Just some body text.", None, False, False) # plain
165
+ ex._emit_docx_paragraph(out, "", None, False, False) # blank
166
+ ex._emit_docx_paragraph(out, "Payment. Customer will pay.", "Heading1", False, False) # run-in
167
+ assert out == [
168
+ "## Confidentiality",
169
+ "## Term",
170
+ "**Important Notice**",
171
+ "Just some body text.",
172
+ "",
173
+ "## Payment",
174
+ "Customer will pay.", # run-in body split onto its own line
175
+ ]
176
+
177
+
178
+ def test_docx_readers_agree_on_clause_map() -> None:
179
+ """Regression: the python-docx reader must surface the same clause map as the
180
+ stdlib reader on a heading-styled .docx. The python-docx path used to flatten
181
+ heading styles and return an empty clause map. Skips without [docx]."""
182
+ pytest.importorskip("docx")
183
+ path = FIXTURES / "heading_docx.docx"
184
+ raw = path.read_bytes()
185
+
186
+ def clause_titles(prefer_optional: bool) -> list[str]:
187
+ _raw, text, fmt, _w = ex.load_source(path, prefer_optional=prefer_optional)
188
+ result = ex.build_extraction(text, raw, fmt, "h.docx")
189
+ return [c["canonical_title"] for c in result["clauses"]]
190
+
191
+ stdlib = clause_titles(False)
192
+ pydocx = clause_titles(True)
193
+ assert stdlib, "stdlib reader should detect the heading-styled clauses"
194
+ assert pydocx == stdlib, "python-docx path must agree with the stdlib reader"
155
195
  # A full sentence carrying a heading style is rejected (not a clause title).
156
196
  assert ex._docx_heading_title(
157
197
  "Either party may terminate this Agreement upon material breach that "
@@ -56,6 +56,20 @@ def test_schema_command_emits_committed_spec() -> None:
56
56
  assert json.loads(proc.stdout) == json.loads(SPEC_FILE.read_text(encoding="utf-8"))
57
57
 
58
58
 
59
+ def test_fields_catalog_covers_schema() -> None:
60
+ """`extract fields` (FIELD_CATALOG) must not silently drift from the output
61
+ schema -- every top-level output field appears in the catalog."""
62
+ schema_top = set(SCHEMA["properties"]) - {"_meta"}
63
+ catalog_prefixes = {f.split(".")[0] for f, _tier, _desc in ex.FIELD_CATALOG}
64
+ assert schema_top - catalog_prefixes == set()
65
+
66
+
67
+ def test_confidence_scale_is_a_descending_ladder() -> None:
68
+ assert ex.CONF_H2 >= ex.CONF_PARTIES >= ex.CONF_GOVERNING_LAW >= ex.CONF_NUMBERED_HEADING
69
+ assert ex.CONF_ALLCAPS_HEADING >= ex.CONF_TERM >= ex.CONF_WEAK >= ex.CONF_LLM_CLAUSE
70
+ assert 0.0 < ex.CONF_LLM_CLAUSE and ex.CONF_H2 <= 1.0
71
+
72
+
59
73
  def test_schema_is_self_describing() -> None:
60
74
  assert SCHEMA["$schema"] == "https://json-schema.org/draft/2020-12/schema"
61
75
  assert "extract-cli" in SCHEMA["title"]
File without changes
File without changes
File without changes
File without changes