extract-cli 0.1.10__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {extract_cli-0.1.10 → extract_cli-0.1.12}/ARCHITECTURE.md +12 -5
  2. {extract_cli-0.1.10 → extract_cli-0.1.12}/CHANGELOG.md +39 -0
  3. {extract_cli-0.1.10 → extract_cli-0.1.12}/PKG-INFO +6 -5
  4. {extract_cli-0.1.10 → extract_cli-0.1.12}/README.md +5 -4
  5. {extract_cli-0.1.10 → extract_cli-0.1.12}/extract_cli.py +120 -36
  6. {extract_cli-0.1.10 → extract_cli-0.1.12}/pyproject.toml +1 -1
  7. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/employment_docx.docx.expected.json +2 -2
  8. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/heading_docx.docx.expected.json +2 -2
  9. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  10. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  11. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/nda_h2.md.expected.json +2 -2
  12. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
  13. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/scanned.pdf.expected.json +1 -1
  14. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/services_bold.txt.expected.json +1 -1
  15. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/services_html.html.expected.json +2 -2
  16. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/test_coverage.py +13 -0
  17. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/test_deterministic.py +7 -0
  18. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/test_misc.py +24 -0
  19. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/test_schema_conformance.py +14 -0
  20. {extract_cli-0.1.10 → extract_cli-0.1.12}/.gitignore +0 -0
  21. {extract_cli-0.1.10 → extract_cli-0.1.12}/AGENTS.md +0 -0
  22. {extract_cli-0.1.10 → extract_cli-0.1.12}/CONTRIBUTING.md +0 -0
  23. {extract_cli-0.1.10 → extract_cli-0.1.12}/LICENSE +0 -0
  24. {extract_cli-0.1.10 → extract_cli-0.1.12}/Makefile +0 -0
  25. {extract_cli-0.1.10 → extract_cli-0.1.12}/config/llm.json.example +0 -0
  26. {extract_cli-0.1.10 → extract_cli-0.1.12}/docs/INTEROP.md +0 -0
  27. {extract_cli-0.1.10 → extract_cli-0.1.12}/docs/spec/extract-output.schema.json +0 -0
  28. {extract_cli-0.1.10 → extract_cli-0.1.12}/llms.txt +0 -0
  29. {extract_cli-0.1.10 → extract_cli-0.1.12}/scripts/release.py +0 -0
  30. {extract_cli-0.1.10 → extract_cli-0.1.12}/scripts/validate_against_spec.py +0 -0
  31. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/_fixtures_build.py +0 -0
  32. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/_make_goldens.py +0 -0
  33. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/_schema_validator.py +0 -0
  34. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/conftest.py +0 -0
  35. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/employment_docx.docx +0 -0
  36. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/heading_docx.docx +0 -0
  37. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/lease_allcaps.txt +0 -0
  38. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/license_pdf.pdf +0 -0
  39. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/nda_h2.md +0 -0
  40. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/numbered_docx.docx +0 -0
  41. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/scanned.pdf +0 -0
  42. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/services_bold.txt +0 -0
  43. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/fixtures/services_html.html +0 -0
  44. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/test_clause_map.py +0 -0
  45. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/test_cli.py +0 -0
  46. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/test_llm.py +0 -0
  47. {extract_cli-0.1.10 → extract_cli-0.1.12}/tests/test_property.py +0 -0
@@ -40,16 +40,23 @@ the "verify, not trust" contract downstream tools consume.
40
40
 
41
41
  ## The clause map
42
42
 
43
- `detect_clauses(text)` is a faithful port of template-vault-cli's three-tier
44
- cascade; the first tier that fires wins so fallbacks never shadow real
45
- structure:
43
+ `detect_clauses(text)` extends template-vault-cli's clause cascade; the first
44
+ tier that fires wins so fallbacks never shadow real structure:
46
45
 
47
- 1. **`h2`** — `## Heading` (Markdown-native). Needs 1 match.
46
+ 1. **`h2`** — `## Heading` (Markdown-native; also what the DOCX reader emits for
47
+ Word heading styles / `w:numPr` paragraphs). Needs ≥ 1 match.
48
48
  2. **`bold-numbered`** — `**1. Purpose**`, `**Section 4. Term**` (typical of
49
49
  DOCX → text). Needs ≥ 2 matches.
50
- 3. **`all-caps`** — blank-line-framed `CONFIDENTIALITY` lines (typical of legal
50
+ 3. **`numbered`** — plain `1. Term`, `Section 3. Payment`, and two-line
51
+ `ARTICLE N` + title (the dominant format in foreign, i.e. counterparty-drafted, paper), gated by a
52
+ title-case heuristic. Needs ≥ 2 matches.
53
+ 4. **`all-caps`** — blank-line-framed `CONFIDENTIALITY` lines (typical of legal
51
54
  PDFs), with the single-token-≥-4-letters rule. Needs ≥ 2 matches.
52
55
 
56
+ (Plus an opt-in **`llm`** clause-map fallback under `--llm` when none of the
57
+ above fire — see the LLM tier below.) After detection, running headers/footers
58
+ and front/back-matter are filtered (`_is_noise_clause_title` + repeat dedup).
59
+
53
60
  `_strip_clause_number` removes leading numbering, including Roman numerals
54
61
  1–39 (`_ROMAN_RE` lists longer alternatives first so the engine doesn't
55
62
  short-circuit on a prefix — bare `V`/`X` match).
@@ -6,6 +6,43 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.12] - 2026-05-22
10
+
11
+ ### Security
12
+ - **Fixed an XML entity-expansion ("billion laughs") vulnerability in `.docx`
13
+ parsing.** The 0.1.9 resource bounds only checked *size*, but a tiny
14
+ `word/document.xml` declaring a DTD with nested entities passes the size
15
+ check and then expands exponentially in the XML parser (both ElementTree and
16
+ lxml/python-docx resolve internal entities). A new `_docx_xml_guard` runs
17
+ before either reader and refuses any `document.xml` that declares a
18
+ DTD/entities (a legitimate OOXML part never does) — degrading gracefully to
19
+ empty text with a warning. Verified on both the stdlib and `[docx]` paths.
20
+
21
+ ## [0.1.11] - 2026-05-22
22
+
23
+ Polish pass.
24
+
25
+ ### Fixed
26
+ - **Signature blocks no longer capture the next column's label.** A two-column
27
+ unsigned block (`By: By:` / `Name: Name:`) used to yield garbage
28
+ signatories like `{"name": "By:", "title": "Title:"}`; such captures (and
29
+ blank fill lines) are now rejected, so an unsigned template correctly returns
30
+ no signatories.
31
+
32
+ ### Changed
33
+ - **`extract fields` and `--format table` now surface `jurisdiction`,
34
+ `amounts`, and `signatories`** — they were extracted and in the JSON but not
35
+ discoverable via the catalog or the human table view. A drift-guard test now
36
+ asserts `extract fields` can't diverge from the output schema.
37
+ - **Confidence values centralized into a documented scale** (named `CONF_*`
38
+ constants with a single descending ladder, replacing scattered magic numbers)
39
+ so downstream "verify, not trust" thresholds are principled. The only value
40
+ change: an affirmative auto-renewal is now `0.70` (was `0.65`), matching the
41
+ other best-effort term fields.
42
+ - Docs sweep: refreshed the clause-cascade description (h2 → bold-numbered →
43
+ numbered → all-caps, + the `--llm` fallback) across README/ARCHITECTURE and
44
+ the output-shape example. Line coverage held at 100% (CI-gated).
45
+
9
46
  ## [0.1.10] - 2026-05-22
10
47
 
11
48
  ### Fixed
@@ -296,6 +333,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
296
333
  intentionally *not* governed by the output schema (the schema describes the
297
334
  full default output).
298
335
 
336
+ [0.1.12]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.12
337
+ [0.1.11]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.11
299
338
  [0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
300
339
  [0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
301
340
  [0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.10
3
+ Version: 0.1.12
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -180,16 +180,17 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
180
180
  "value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
181
181
  "amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
182
182
  "signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
183
- "_meta": { "extractor_version": "0.1.9", "tiers_used": ["deterministic"], "llm_used": false }
183
+ "_meta": { "extractor_version": "0.1.11", "tiers_used": ["deterministic"], "llm_used": false }
184
184
  }
185
185
  ```
186
186
 
187
187
  ## The clause map (the differentiator)
188
188
 
189
189
  A counterparty's "SECTION 7. NON-DISCLOSURE" and your template's
190
- "## Confidentiality" are the same clause. `extract-cli` reuses
191
- template-vault-cli's **clause-detection cascade** (Tier 1 `## H2` headings →
192
- Tier 2 bold-numbered `**1. …**` → Tier 3 ALL-CAPS lines) and a built-in
190
+ "## Confidentiality" are the same clause. `extract-cli` extends
191
+ template-vault-cli's **clause-detection cascade** — `## H2` headings →
192
+ bold-numbered `**1. …**` → plain numbered (`1. Term`, `Section 3. …`, two-line
193
+ `ARTICLE N`) → ALL-CAPS lines (and an opt-in `--llm` fallback) — plus a built-in
193
194
  **canonical alias vocabulary** to normalize foreign clause titles onto the
194
195
  names the rest of the suite already speaks. Clauses it can't map are kept with
195
196
  `mapped: false` (and a `*` in the table view) so nothing is silently dropped.
@@ -142,16 +142,17 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
142
142
  "value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
143
143
  "amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
144
144
  "signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
145
- "_meta": { "extractor_version": "0.1.9", "tiers_used": ["deterministic"], "llm_used": false }
145
+ "_meta": { "extractor_version": "0.1.11", "tiers_used": ["deterministic"], "llm_used": false }
146
146
  }
147
147
  ```
148
148
 
149
149
  ## The clause map (the differentiator)
150
150
 
151
151
  A counterparty's "SECTION 7. NON-DISCLOSURE" and your template's
152
- "## Confidentiality" are the same clause. `extract-cli` reuses
153
- template-vault-cli's **clause-detection cascade** (Tier 1 `## H2` headings →
154
- Tier 2 bold-numbered `**1. …**` → Tier 3 ALL-CAPS lines) and a built-in
152
+ "## Confidentiality" are the same clause. `extract-cli` extends
153
+ template-vault-cli's **clause-detection cascade** — `## H2` headings →
154
+ bold-numbered `**1. …**` → plain numbered (`1. Term`, `Section 3. …`, two-line
155
+ `ARTICLE N`) → ALL-CAPS lines (and an opt-in `--llm` fallback) — plus a built-in
155
156
  **canonical alias vocabulary** to normalize foreign clause titles onto the
156
157
  names the rest of the suite already speaks. Clauses it can't map are kept with
157
158
  `mapped: false` (and a `*` in the table view) so nothing is silently dropped.
@@ -43,11 +43,11 @@ import urllib.request
43
43
  from pathlib import Path
44
44
  from typing import Any, Dict, List, Optional, Tuple
45
45
 
46
- __version__ = "0.1.10"
46
+ __version__ = "0.1.12"
47
47
 
48
48
  # Bumped independently of the package version when the *extraction logic*
49
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
50
- EXTRACTOR_VERSION = "0.1.10"
50
+ EXTRACTOR_VERSION = "0.1.12"
51
51
 
52
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
53
53
  SCHEMA_VERSION = 1
@@ -503,6 +503,42 @@ def _none_field() -> JSON:
503
503
  return {"value": None, "confidence": 0.0, "source": "none"}
504
504
 
505
505
 
506
+ # --- Confidence scale -------------------------------------------------------
507
+ # These confidences are "verify, not trust" hints in [0, 1] -- a ranking of
508
+ # *structural certainty*, not calibrated probabilities. Higher means the
509
+ # extraction rests on more unambiguous structure; lower means a looser heuristic
510
+ # or an LLM guess. Downstream tools threshold on them, so they are centralized
511
+ # here and ordered into a single descending ladder rather than scattered as
512
+ # magic numbers:
513
+ #
514
+ # .95 explicit Markdown H2 heading
515
+ # .90 strong unambiguous pattern (parties "between X and Y"; labeled date)
516
+ # .85 clear keyword/structure (governing law; ISO date; bold-numbered heading)
517
+ # .80 keyworded but looser (plain numbered/ARTICLE heading; jurisdiction code)
518
+ # .75 structural-only heading (ALL-CAPS)
519
+ # .70 best-effort regex on common phrasing (term length, notice, auto-renew)
520
+ # .60 weak heuristic / LLM-enriched scalar (value, amounts, defined terms)
521
+ # .55 loose match (signature block, LLM obligations, non-ISO raw date)
522
+ # .50 fuzzy (LLM clause-map fallback)
523
+ CONF_H2 = 0.95
524
+ CONF_PARTIES = 0.90
525
+ CONF_DATE_LABELED = 0.90
526
+ CONF_DATE_ISO = 0.85
527
+ CONF_GOVERNING_LAW = 0.85
528
+ CONF_BOLD_HEADING = 0.85
529
+ CONF_NUMBERED_HEADING = 0.80
530
+ CONF_JURISDICTION = 0.80
531
+ CONF_ALLCAPS_HEADING = 0.75
532
+ CONF_TERM = 0.70
533
+ CONF_WEAK = 0.60
534
+ CONF_LLM = 0.60
535
+ CONF_DATE_RAW = 0.55
536
+ CONF_LLM_LIST = 0.55
537
+ CONF_SIGNATORY = 0.55
538
+ CONF_LLM_CLAUSE = 0.50
539
+ CONF_UNMAPPED_FACTOR = 0.75 # multiplier applied to a clause that doesn't map to the vocabulary
540
+
541
+
506
542
  def _titlecase(s: str) -> str:
507
543
  s = s.strip()
508
544
  if not s:
@@ -675,7 +711,7 @@ def _date_field_from_str(raw: str, base_conf: float) -> JSON:
675
711
  def _date_field(match: Optional["re.Match[str]"]) -> JSON:
676
712
  if match is None:
677
713
  return _none_field()
678
- return _date_field_from_str(match.group(1), 0.85)
714
+ return _date_field_from_str(match.group(1), CONF_DATE_ISO)
679
715
 
680
716
 
681
717
  # Trailing descriptors that follow a party's actual name and should be dropped
@@ -739,7 +775,7 @@ def extract_parties(text: str) -> List[JSON]:
739
775
  name, role = _split_name_role(raw)
740
776
  if not name or len(name) < 2 or len(name) > 120:
741
777
  continue
742
- entry: JSON = {"name": name, "confidence": 0.9, "source": "deterministic"}
778
+ entry: JSON = {"name": name, "confidence": CONF_PARTIES, "source": "deterministic"}
743
779
  entry["role"] = role
744
780
  out.append(entry)
745
781
  return out
@@ -748,7 +784,7 @@ def extract_parties(text: str) -> List[JSON]:
748
784
  def extract_dates(text: str) -> JSON:
749
785
  label = _EFFDATE_LABEL_RE.search(text)
750
786
  if label is not None:
751
- effective = _date_field_from_str(label.group(1), 0.9)
787
+ effective = _date_field_from_str(label.group(1), CONF_DATE_LABELED)
752
788
  else:
753
789
  effective = _date_field(_EFFECTIVE_RE.search(text))
754
790
  return {"effective": effective, "expiration": _date_field(_EXPIRE_RE.search(text))}
@@ -761,7 +797,7 @@ def extract_governing_law(text: str) -> JSON:
761
797
  juris = re.sub(r"\s+", " ", m.group(1).strip().rstrip(".,")).strip()
762
798
  if not juris: # pragma: no cover - the capture group requires a leading letter
763
799
  return _none_field()
764
- return _field(juris, 0.85)
800
+ return _field(juris, CONF_GOVERNING_LAW)
765
801
 
766
802
 
767
803
  def extract_term(text: str) -> JSON:
@@ -773,20 +809,20 @@ def extract_term(text: str) -> JSON:
773
809
  # Only emit when the captured token is a real number; otherwise the
774
810
  # match was a coincidence ("...consecutive days") -> leave as not-found.
775
811
  if num is not None:
776
- length = _field(f"{num} {unit}{'s' if num != 1 else ''}", 0.7)
812
+ length = _field(f"{num} {unit}{'s' if num != 1 else ''}", CONF_TERM)
777
813
 
778
814
  notice = _none_field()
779
815
  nm = _NOTICE_RE.search(text)
780
816
  if nm:
781
817
  days = _word_to_int(nm.group(1))
782
818
  if days is not None:
783
- notice = _field(days, 0.7)
819
+ notice = _field(days, CONF_TERM)
784
820
 
785
821
  auto = _none_field()
786
822
  if _AUTORENEW_NEG_RE.search(text):
787
- auto = _field(False, 0.7)
823
+ auto = _field(False, CONF_TERM)
788
824
  elif _AUTORENEW_POS_RE.search(text):
789
- auto = _field(True, 0.65)
825
+ auto = _field(True, CONF_TERM)
790
826
 
791
827
  return {"length": length, "auto_renew": auto, "notice_period_days": notice}
792
828
 
@@ -795,7 +831,7 @@ def extract_value(text: str) -> JSON:
795
831
  m = _MONEY_RE.search(text)
796
832
  if not m:
797
833
  return _none_field()
798
- return _field(re.sub(r"\s+", " ", m.group(0).strip()), 0.6)
834
+ return _field(re.sub(r"\s+", " ", m.group(0).strip()), CONF_WEAK)
799
835
 
800
836
 
801
837
  def extract_amounts(text: str) -> List[JSON]:
@@ -807,7 +843,7 @@ def extract_amounts(text: str) -> List[JSON]:
807
843
  seen.setdefault(amt, None)
808
844
  if len(seen) >= 30:
809
845
  break
810
- return [{"value": a, "confidence": 0.6, "source": "deterministic"} for a in seen]
846
+ return [{"value": a, "confidence": CONF_WEAK, "source": "deterministic"} for a in seen]
811
847
 
812
848
 
813
849
  # Signature blocks: "By: <name>", "Name: <name>", "Printed Name: <name>".
@@ -820,20 +856,32 @@ _SIG_TITLE_RE = re.compile(
820
856
  r"(?:^|\n)[ \t]*(?:Title|Its)[ \t]*:[ \t]*([^\n_{}\[\]]{2,60})",
821
857
  re.IGNORECASE,
822
858
  )
859
+ # A captured value is rejected when it's really the next column's label (common
860
+ # in two-column signature blocks: "By: By:") or a blank fill line.
861
+ _SIG_LABEL_RE = re.compile(r"(?:by|name|title|signature|its|date|signed|print)\b", re.IGNORECASE)
862
+
863
+
864
+ def _clean_sig_value(raw: str) -> Optional[str]:
865
+ v = re.sub(r"\s+", " ", raw).strip(" .,:")
866
+ if (len(v) < 2 or v.lower() == "the"
867
+ or not any(c.isalpha() for c in v)
868
+ or _SIG_LABEL_RE.match(v)):
869
+ return None
870
+ return v
823
871
 
824
872
 
825
873
  def extract_signatories(text: str) -> List[JSON]:
826
874
  """Best-effort signature-block names (and titles, when adjacent). Skips
827
875
  unfilled placeholders. Blank on a template; populated on executed paper."""
828
- titles = [re.sub(r"\s+", " ", m.group(1)).strip(" .,") for m in _SIG_TITLE_RE.finditer(text)]
876
+ titles = [_clean_sig_value(m.group(1)) for m in _SIG_TITLE_RE.finditer(text)]
829
877
  out: List[JSON] = []
830
878
  seen: Dict[str, None] = {}
831
879
  for i, m in enumerate(_SIGNATORY_RE.finditer(text)):
832
- name = re.sub(r"\s+", " ", m.group(1)).strip(" .,")
833
- if len(name) < 2 or name.lower() in ("the", "name", "title") or name in seen:
880
+ name = _clean_sig_value(m.group(1))
881
+ if name is None or name in seen:
834
882
  continue
835
883
  seen[name] = None
836
- entry: JSON = {"name": name, "confidence": 0.55, "source": "deterministic"}
884
+ entry: JSON = {"name": name, "confidence": CONF_SIGNATORY, "source": "deterministic"}
837
885
  entry["title"] = titles[i] if i < len(titles) else None
838
886
  out.append(entry)
839
887
  if len(out) >= 12:
@@ -869,7 +917,7 @@ def extract_jurisdiction(governing_law: JSON) -> JSON:
869
917
  if len(name) >= 5 and name in key:
870
918
  code = c
871
919
  break
872
- return _field(code, 0.8, "deterministic") if code else _none_field()
920
+ return _field(code, CONF_JURISDICTION, "deterministic") if code else _none_field()
873
921
 
874
922
 
875
923
  def extract_defined_terms(text: str) -> List[JSON]:
@@ -885,7 +933,7 @@ def extract_defined_terms(text: str) -> List[JSON]:
885
933
  seen.setdefault(term, None)
886
934
  if len(seen) >= 50:
887
935
  break
888
- return [{"term": t, "confidence": 0.6, "source": "deterministic"} for t in seen]
936
+ return [{"term": t, "confidence": CONF_WEAK, "source": "deterministic"} for t in seen]
889
937
 
890
938
 
891
939
  # Detected-heading titles that are almost never real clauses: front/back-matter,
@@ -936,9 +984,9 @@ def extract_clauses(text: str) -> List[JSON]:
936
984
  continue
937
985
  canonical, mapped = _canonicalize_clause(c["title"])
938
986
  tier = c["tier"]
939
- base = {"h2": 0.95, "bold-numbered": 0.85, "numbered": 0.8,
940
- "all-caps": 0.75, "explicit": 0.95}.get(tier, 0.7)
941
- conf = round(base * (1.0 if mapped else 0.75), 2)
987
+ base = {"h2": CONF_H2, "bold-numbered": CONF_BOLD_HEADING, "numbered": CONF_NUMBERED_HEADING,
988
+ "all-caps": CONF_ALLCAPS_HEADING, "explicit": CONF_H2}.get(tier, CONF_TERM)
989
+ conf = round(base * (1.0 if mapped else CONF_UNMAPPED_FACTOR), 2)
942
990
  out.append({
943
991
  "canonical_title": canonical,
944
992
  "detected_title": c["detected"],
@@ -1062,6 +1110,32 @@ def _read_html(raw_text: str) -> str:
1062
1110
  return parser.get_text()
1063
1111
 
1064
1112
 
1113
+ def _docx_xml_guard(raw: bytes) -> Optional[str]:
1114
+ """Run before EITHER docx reader on untrusted input. Returns a reason string
1115
+ if word/document.xml is unsafe to parse, else None:
1116
+ * decompresses past MAX_DECOMPRESSED_BYTES (zip bomb), or
1117
+ * declares a DTD/entities -- a tiny 'billion laughs' part that passes the
1118
+ size check but expands exponentially in the XML parser (ElementTree
1119
+ *and* lxml/python-docx resolve internal entities). A legitimate OOXML
1120
+ document.xml never declares one, so refusing is safe.
1121
+ """
1122
+ import io
1123
+ import zipfile
1124
+ try:
1125
+ with zipfile.ZipFile(io.BytesIO(raw)) as z:
1126
+ info = z.getinfo("word/document.xml")
1127
+ if info.file_size > MAX_DECOMPRESSED_BYTES:
1128
+ return (f"word/document.xml decompresses to {info.file_size} bytes "
1129
+ f"(> {MAX_DECOMPRESSED_BYTES} cap)")
1130
+ with z.open("word/document.xml") as f:
1131
+ head = f.read(65536)
1132
+ except Exception:
1133
+ return None # not a valid zip / no document.xml -> let the readers report it
1134
+ if re.search(rb"<!DOCTYPE|<!ENTITY", head, re.IGNORECASE):
1135
+ return "document.xml declares a DTD/entities (XML-bomb guard)"
1136
+ return None
1137
+
1138
+
1065
1139
  def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
1066
1140
  """Extract text from a .docx. Uses python-docx for higher fidelity when the
1067
1141
  optional [docx] extra is installed; otherwise a stdlib zipfile/XML reader
@@ -1070,6 +1144,10 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
1070
1144
  `prefer_optional=False` forces the stdlib reader regardless of what's
1071
1145
  installed -- used to pin reproducible golden fixtures."""
1072
1146
  warnings: List[str] = []
1147
+ unsafe = _docx_xml_guard(raw)
1148
+ if unsafe is not None:
1149
+ warnings.append(f"could not parse .docx ({unsafe}); treating as empty")
1150
+ return "", warnings
1073
1151
  if prefer_optional and importlib.util.find_spec("docx") is not None:
1074
1152
  try:
1075
1153
  mod = importlib.import_module("docx")
@@ -1168,14 +1246,7 @@ def _read_docx_stdlib(raw: bytes) -> str:
1168
1246
 
1169
1247
  w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
1170
1248
  with zipfile.ZipFile(io.BytesIO(raw)) as z:
1171
- # Zip-bomb guard: the uncompressed size is in the header, so check it
1172
- # before reading (don't decompress GBs into memory).
1173
- info = z.getinfo("word/document.xml")
1174
- if info.file_size > MAX_DECOMPRESSED_BYTES:
1175
- raise ValueError(
1176
- f"word/document.xml decompresses to {info.file_size} bytes "
1177
- f"(> {MAX_DECOMPRESSED_BYTES} cap)")
1178
- xml = z.read("word/document.xml")
1249
+ xml = z.read("word/document.xml") # size/XML-bomb already vetted by _docx_xml_guard
1179
1250
  root = ET.fromstring(xml)
1180
1251
  paras: List[str] = []
1181
1252
  # iter over w:p in document order (includes paragraphs inside table cells).
@@ -1572,7 +1643,7 @@ def _llm_clause_map(raw: Any, text: str) -> List[JSON]:
1572
1643
  "detected_title": title,
1573
1644
  "tier": "llm",
1574
1645
  "span": span,
1575
- "confidence": 0.5,
1646
+ "confidence": CONF_LLM_CLAUSE,
1576
1647
  "source": "llm",
1577
1648
  "mapped": mapped,
1578
1649
  })
@@ -1607,18 +1678,18 @@ def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
1607
1678
  enriched = False
1608
1679
  rm = obj.get("renewal_mechanics")
1609
1680
  if isinstance(rm, str) and rm.strip():
1610
- result["term"]["renewal_mechanics"] = _field(rm.strip(), 0.6, "llm")
1681
+ result["term"]["renewal_mechanics"] = _field(rm.strip(), CONF_LLM, "llm")
1611
1682
  enriched = True
1612
1683
  obligations = obj.get("obligations")
1613
1684
  if isinstance(obligations, list) and obligations:
1614
1685
  result["obligations"] = [
1615
- {"text": str(o).strip(), "confidence": 0.55, "source": "llm"}
1686
+ {"text": str(o).strip(), "confidence": CONF_LLM_LIST, "source": "llm"}
1616
1687
  for o in obligations[:5] if str(o).strip()
1617
1688
  ]
1618
1689
  enriched = True
1619
1690
  gl = obj.get("governing_law")
1620
1691
  if isinstance(gl, str) and gl.strip() and result["governing_law"]["source"] == "none":
1621
- result["governing_law"] = _field(gl.strip(), 0.6, "llm")
1692
+ result["governing_law"] = _field(gl.strip(), CONF_LLM, "llm")
1622
1693
  enriched = True
1623
1694
  if want_clauses:
1624
1695
  cmap = _llm_clause_map(obj.get("clauses"), text)
@@ -1707,10 +1778,20 @@ def render_table(result: JSON, no_confidence: bool) -> str:
1707
1778
  lines.append(f" renewal : {_fv(term['renewal_mechanics'])} {_dim('[llm]')}")
1708
1779
  if "governing_law" in result:
1709
1780
  lines.append(_bold("Governing law"))
1710
- lines.append(f" {_fv(result['governing_law'])}")
1781
+ juris = result.get("jurisdiction", {}).get("value")
1782
+ suffix = _dim(f" [{juris}]") if juris else ""
1783
+ lines.append(f" {_fv(result['governing_law'])}{suffix}")
1711
1784
  if "value" in result:
1785
+ amts = result.get("amounts") or []
1786
+ extra = _dim(f" (+{len(amts) - 1} more)") if len(amts) > 1 else ""
1712
1787
  lines.append(_bold("Value"))
1713
- lines.append(f" {_fv(result['value'])}")
1788
+ lines.append(f" {_fv(result['value'])}{extra}")
1789
+ signatories = result.get("signatories")
1790
+ if signatories:
1791
+ lines.append(_bold(f"Signatories ({len(signatories)})"))
1792
+ for s in signatories[:6]:
1793
+ title = f" - {s['title']}" if s.get("title") else ""
1794
+ lines.append(f" {s['name']}{title}")
1714
1795
  clauses = result.get("clauses")
1715
1796
  if clauses is not None:
1716
1797
  lines.append(_bold(f"Clause map ({len(clauses)})"))
@@ -1935,11 +2016,14 @@ FIELD_CATALOG: Tuple[Tuple[str, str, str], ...] = (
1935
2016
  ("term.length", "deterministic", "Term length, best-effort"),
1936
2017
  ("term.notice_period_days", "deterministic", "Notice period in days, best-effort"),
1937
2018
  ("term.auto_renew", "deterministic", "Auto-renewal flag, best-effort"),
1938
- ("governing_law", "deterministic", "Governing law / jurisdiction"),
2019
+ ("governing_law", "deterministic", "Governing law text ('governed by the laws of ...')"),
2020
+ ("jurisdiction", "deterministic", "Governing law normalized to a code (e.g. US-DE)"),
1939
2021
  ("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary "
1940
2022
  "(LLM fallback under --llm when no headings are detected)"),
1941
2023
  ("defined_terms", "deterministic", "Defined-term inventory (quoted / parenthetical)"),
1942
2024
  ("value", "deterministic", "Headline monetary value"),
2025
+ ("amounts", "deterministic", "All distinct monetary amounts"),
2026
+ ("signatories", "deterministic", "Signature-block names/titles (By:/Name:/Title:)"),
1943
2027
  ("term.renewal_mechanics", "llm", "Renewal mechanics (fuzzy; --llm only)"),
1944
2028
  ("obligations", "llm", "Key obligation phrasing (fuzzy; --llm only)"),
1945
2029
  )
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.10"
7
+ version = "0.1.12"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -39,7 +39,7 @@
39
39
  },
40
40
  "auto_renew": {
41
41
  "value": true,
42
- "confidence": 0.65,
42
+ "confidence": 0.7,
43
43
  "source": "deterministic"
44
44
  },
45
45
  "notice_period_days": {
@@ -151,7 +151,7 @@
151
151
  ],
152
152
  "signatories": [],
153
153
  "_meta": {
154
- "extractor_version": "0.1.10",
154
+ "extractor_version": "0.1.12",
155
155
  "tiers_used": [
156
156
  "deterministic"
157
157
  ],
@@ -39,7 +39,7 @@
39
39
  },
40
40
  "auto_renew": {
41
41
  "value": true,
42
- "confidence": 0.65,
42
+ "confidence": 0.7,
43
43
  "source": "deterministic"
44
44
  },
45
45
  "notice_period_days": {
@@ -140,7 +140,7 @@
140
140
  "amounts": [],
141
141
  "signatories": [],
142
142
  "_meta": {
143
- "extractor_version": "0.1.10",
143
+ "extractor_version": "0.1.12",
144
144
  "tiers_used": [
145
145
  "deterministic"
146
146
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.10",
149
+ "extractor_version": "0.1.12",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.10",
149
+ "extractor_version": "0.1.12",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -39,7 +39,7 @@
39
39
  },
40
40
  "auto_renew": {
41
41
  "value": true,
42
- "confidence": 0.65,
42
+ "confidence": 0.7,
43
43
  "source": "deterministic"
44
44
  },
45
45
  "notice_period_days": {
@@ -150,7 +150,7 @@
150
150
  "amounts": [],
151
151
  "signatories": [],
152
152
  "_meta": {
153
- "extractor_version": "0.1.10",
153
+ "extractor_version": "0.1.12",
154
154
  "tiers_used": [
155
155
  "deterministic"
156
156
  ],
@@ -140,7 +140,7 @@
140
140
  "amounts": [],
141
141
  "signatories": [],
142
142
  "_meta": {
143
- "extractor_version": "0.1.10",
143
+ "extractor_version": "0.1.12",
144
144
  "tiers_used": [
145
145
  "deterministic"
146
146
  ],
@@ -55,7 +55,7 @@
55
55
  "amounts": [],
56
56
  "signatories": [],
57
57
  "_meta": {
58
- "extractor_version": "0.1.10",
58
+ "extractor_version": "0.1.12",
59
59
  "tiers_used": [
60
60
  "deterministic"
61
61
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.10",
149
+ "extractor_version": "0.1.12",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -39,7 +39,7 @@
39
39
  },
40
40
  "auto_renew": {
41
41
  "value": true,
42
- "confidence": 0.65,
42
+ "confidence": 0.7,
43
43
  "source": "deterministic"
44
44
  },
45
45
  "notice_period_days": {
@@ -161,7 +161,7 @@
161
161
  ],
162
162
  "signatories": [],
163
163
  "_meta": {
164
- "extractor_version": "0.1.10",
164
+ "extractor_version": "0.1.12",
165
165
  "tiers_used": [
166
166
  "deterministic"
167
167
  ],
@@ -211,6 +211,19 @@ def test_render_table_unmapped_legend() -> None:
211
211
  assert "* = not mapped" in ex.render_table(r, no_confidence=False)
212
212
 
213
213
 
214
+ def test_render_table_jurisdiction_amounts_signatories() -> None:
215
+ r = ex.build_extraction("body", b"x", "markdown", "x.md")
216
+ r["jurisdiction"] = ex._field("US-DE", ex.CONF_JURISDICTION)
217
+ r["amounts"] = [{"value": "$1", "confidence": 0.6, "source": "deterministic"},
218
+ {"value": "$2", "confidence": 0.6, "source": "deterministic"}]
219
+ r["signatories"] = [{"name": "Jane Doe", "title": "CEO",
220
+ "confidence": ex.CONF_SIGNATORY, "source": "deterministic"}]
221
+ table = ex.render_table(r, no_confidence=False)
222
+ assert "US-DE" in table
223
+ assert "+1 more" in table
224
+ assert "Signatories (1)" in table and "Jane Doe - CEO" in table
225
+
226
+
214
227
  def test_cli_silent_table_suppresses_human_view(capsys: pytest.CaptureFixture[str]) -> None:
215
228
  assert ex.main([str(FIXTURES / "nda_h2.md"), "--silent", "--format", "table"]) == 0
216
229
  assert "Clause map" not in capsys.readouterr().out
@@ -165,6 +165,13 @@ def test_signatories() -> None:
165
165
  assert ex.extract_signatories("Name: {party_1_signatory}\nBy: _____________") == []
166
166
 
167
167
 
168
+ def test_signatories_two_column_blank_block() -> None:
169
+ # An unsigned two-column block ("By: By:") must NOT capture the next
170
+ # column's label as a name.
171
+ text = "By: By:\nName: Name:\nTitle: Title:\n"
172
+ assert ex.extract_signatories(text) == []
173
+
174
+
168
175
  def test_value_money() -> None:
169
176
  assert ex.extract_value("a fee of $250,000 is due")["value"] == "$250,000"
170
177
  assert ex.extract_value("budget is USD 1.5 million")["value"].startswith("USD")
@@ -277,6 +277,30 @@ def test_docx_zip_bomb_guard(tmp_path: Any) -> None:
277
277
  assert any("decompress" in w for w in warnings)
278
278
 
279
279
 
280
+ def test_docx_xml_entity_bomb_refused(tmp_path: Any) -> None:
281
+ # A tiny 'billion laughs' document.xml passes the size check but would expand
282
+ # exponentially in the XML parser; the DTD/entity guard refuses it.
283
+ import io
284
+ import zipfile
285
+ w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
286
+ bomb = (
287
+ '<?xml version="1.0"?>\n'
288
+ '<!DOCTYPE r [<!ENTITY a "AAAA"><!ENTITY b "&a;&a;&a;&a;">]>\n'
289
+ f'<w:document xmlns:w="{w}"><w:body><w:p><w:r><w:t>&b;</w:t></w:r>'
290
+ '</w:p></w:body></w:document>'
291
+ ).encode()
292
+ buf = io.BytesIO()
293
+ with zipfile.ZipFile(buf, "w") as z:
294
+ z.writestr("[Content_Types].xml", "<Types/>")
295
+ z.writestr("word/document.xml", bomb)
296
+ p = tmp_path / "xmlbomb.docx"
297
+ p.write_bytes(buf.getvalue())
298
+ assert p.stat().st_size < 100_000 # tiny on disk
299
+ raw, text, fmt, warnings = ex.load_source(p) # default reader path
300
+ assert fmt == "docx" and text == ""
301
+ assert any("DTD/entities" in w for w in warnings)
302
+
303
+
280
304
  def test_numbered_docx_clauses() -> None:
281
305
  """A DOCX whose clauses are w:numPr list paragraphs (no heading style, no
282
306
  visible number) still yields a clause map; a deep numbered body sentence is
@@ -56,6 +56,20 @@ def test_schema_command_emits_committed_spec() -> None:
56
56
  assert json.loads(proc.stdout) == json.loads(SPEC_FILE.read_text(encoding="utf-8"))
57
57
 
58
58
 
59
+ def test_fields_catalog_covers_schema() -> None:
60
+ """`extract fields` (FIELD_CATALOG) must not silently drift from the output
61
+ schema -- every top-level output field appears in the catalog."""
62
+ schema_top = set(SCHEMA["properties"]) - {"_meta"}
63
+ catalog_prefixes = {f.split(".")[0] for f, _tier, _desc in ex.FIELD_CATALOG}
64
+ assert schema_top - catalog_prefixes == set()
65
+
66
+
67
+ def test_confidence_scale_is_a_descending_ladder() -> None:
68
+ assert ex.CONF_H2 >= ex.CONF_PARTIES >= ex.CONF_GOVERNING_LAW >= ex.CONF_NUMBERED_HEADING
69
+ assert ex.CONF_ALLCAPS_HEADING >= ex.CONF_TERM >= ex.CONF_WEAK >= ex.CONF_LLM_CLAUSE
70
+ assert 0.0 < ex.CONF_LLM_CLAUSE and ex.CONF_H2 <= 1.0
71
+
72
+
59
73
  def test_schema_is_self_describing() -> None:
60
74
  assert SCHEMA["$schema"] == "https://json-schema.org/draft/2020-12/schema"
61
75
  assert "extract-cli" in SCHEMA["title"]
File without changes
File without changes
File without changes
File without changes
File without changes