extract-cli 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. {extract_cli-0.1.2 → extract_cli-0.1.4}/CHANGELOG.md +53 -0
  2. {extract_cli-0.1.2 → extract_cli-0.1.4}/PKG-INFO +1 -1
  3. {extract_cli-0.1.2 → extract_cli-0.1.4}/extract_cli.py +92 -4
  4. {extract_cli-0.1.2 → extract_cli-0.1.4}/pyproject.toml +1 -1
  5. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/_fixtures_build.py +45 -7
  6. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/_make_goldens.py +2 -2
  7. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/conftest.py +1 -0
  8. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/employment_docx.docx +0 -0
  9. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/employment_docx.docx.expected.json +2 -2
  10. extract_cli-0.1.4/tests/fixtures/heading_docx.docx +0 -0
  11. extract_cli-0.1.4/tests/fixtures/heading_docx.docx.expected.json +142 -0
  12. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  13. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  14. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/nda_h2.md.expected.json +1 -1
  15. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/scanned.pdf.expected.json +1 -1
  16. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/services_bold.txt.expected.json +1 -1
  17. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/services_html.html.expected.json +1 -1
  18. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/test_clause_map.py +25 -0
  19. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/test_misc.py +31 -0
  20. {extract_cli-0.1.2 → extract_cli-0.1.4}/.gitignore +0 -0
  21. {extract_cli-0.1.2 → extract_cli-0.1.4}/ARCHITECTURE.md +0 -0
  22. {extract_cli-0.1.2 → extract_cli-0.1.4}/CONTRIBUTING.md +0 -0
  23. {extract_cli-0.1.2 → extract_cli-0.1.4}/LICENSE +0 -0
  24. {extract_cli-0.1.2 → extract_cli-0.1.4}/Makefile +0 -0
  25. {extract_cli-0.1.2 → extract_cli-0.1.4}/README.md +0 -0
  26. {extract_cli-0.1.2 → extract_cli-0.1.4}/config/llm.json.example +0 -0
  27. {extract_cli-0.1.2 → extract_cli-0.1.4}/docs/INTEROP.md +0 -0
  28. {extract_cli-0.1.2 → extract_cli-0.1.4}/docs/spec/extract-output.schema.json +0 -0
  29. {extract_cli-0.1.2 → extract_cli-0.1.4}/scripts/release.py +0 -0
  30. {extract_cli-0.1.2 → extract_cli-0.1.4}/scripts/validate_against_spec.py +0 -0
  31. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/_schema_validator.py +0 -0
  32. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/lease_allcaps.txt +0 -0
  33. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/license_pdf.pdf +0 -0
  34. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/nda_h2.md +0 -0
  35. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/scanned.pdf +0 -0
  36. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/services_bold.txt +0 -0
  37. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/fixtures/services_html.html +0 -0
  38. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/test_cli.py +0 -0
  39. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/test_deterministic.py +0 -0
  40. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/test_llm.py +0 -0
  41. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/test_property.py +0 -0
  42. {extract_cli-0.1.2 → extract_cli-0.1.4}/tests/test_schema_conformance.py +0 -0
@@ -6,6 +6,57 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.4] - 2026-05-21
10
+
11
+ DOCX clause detection, driven by testing against 20 real `.docx` contracts
12
+ (Common Paper / Bonterms / YC templates via open-agreements, plus government
13
+ samples) — the format we expect most.
14
+
15
+ ### Fixed
16
+ - **The DOCX reader now honors Word heading styles.** Real Word contracts carry
17
+ their clause structure in `Heading1`–`Heading9`/`Title` paragraph styles with
18
+ *auto-generated* numbers (absent from the raw text), so the prior cascade
19
+ found almost no clauses. Heading-styled paragraphs are now emitted as `##`
20
+ headings (detected by the strongest tier); run-in headings
21
+ (`Payment. Customer will pay …`) are split into title + body, and a full
22
+ sentence that merely carries a heading style is rejected (not a clause).
23
+ Across the 20-doc sample this took heading-styled agreements from ~0 clauses
24
+ to a clean 14–21 distinct suite-vocabulary clauses each.
25
+ - Binary DOCX test fixtures are now generated deterministically (fixed zip
26
+ timestamp) so their sha256 — and the goldens — are stable across regenerations.
27
+
28
+ ### Known limitations (documented)
29
+ - DOCX that auto-number clauses via `numbering.xml` with **no heading style and
30
+ no bold lead** (some Bonterms/older templates use a flat `Plain`/`ListParagraph`
31
+ style) still yield no clause map: the heading text carries no detectable
32
+ signal without reconstructing Word's numbering counters. Parties/dates/
33
+ governing-law still extract.
34
+
35
+ ## [0.1.3] - 2026-05-21
36
+
37
+ Clause-map de-noising and party cleanup, driven by testing against 10 more
38
+ contracts (SEC EDGAR credit, loan, employment, lease, asset-purchase, and
39
+ consulting HTML exhibits; Apache PDFs).
40
+
41
+ ### Fixed
42
+ - **Clause map drops structural noise** common in dense real documents:
43
+ a heading whose title repeats 3+ times is treated as a running header/footer
44
+ (one lease's `Ks 112708-2` page code went from 44 "clauses" to 0), and
45
+ front/back-matter (`Table of Contents`, `Exhibit B`, `Schedule 2.1`) and
46
+ document codes/page numbers (4+ consecutive digits) are filtered out.
47
+ - **Party-name cleanup** extended: trailing `together with …`, `, as
48
+ administrative agent`, and a dangling unclosed parenthetical
49
+ (`(each of them being`) are trimmed.
50
+
51
+ ### Notes
52
+ - On dense documents the deterministic clause map can still surface a few
53
+ non-clause headings (e.g. address lines in a notices block); consumers
54
+ wanting only suite-vocabulary clauses should filter on `mapped == true`,
55
+ which isolates the real clauses (the noise is always `mapped == false`).
56
+ - Known best-effort edge cases on varied real paper: a bare role word as a
57
+ party name ("Landlord"), and a middle-initial period truncating a personal
58
+ name ("John C." → "John C"). Best-effort fields carry confidence/source.
59
+
9
60
  ## [0.1.2] - 2026-05-21
10
61
 
11
62
  More real-world hardening, driven by testing against five additional contracts
@@ -115,6 +166,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
115
166
  intentionally *not* governed by the output schema (the schema describes the
116
167
  full default output).
117
168
 
169
+ [0.1.4]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.4
170
+ [0.1.3]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.3
118
171
  [0.1.2]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.2
119
172
  [0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
120
173
  [0.1.0]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -43,11 +43,11 @@ import urllib.request
43
43
  from pathlib import Path
44
44
  from typing import Any, Dict, List, Optional, Tuple
45
45
 
46
- __version__ = "0.1.2"
46
+ __version__ = "0.1.4"
47
47
 
48
48
  # Bumped independently of the package version when the *extraction logic*
49
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
50
- EXTRACTOR_VERSION = "0.1.2"
50
+ EXTRACTOR_VERSION = "0.1.4"
51
51
 
52
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
53
53
  SCHEMA_VERSION = 1
@@ -629,18 +629,25 @@ _PARTY_CUT_MARKERS: Tuple[str, ...] = (
629
629
  r"\s+organized\b",
630
630
  r"\s+incorporated\b",
631
631
  r"\s+whose\b",
632
+ r"\s+together\b",
633
+ r",\s+as\s+\w", # ", as administrative agent"
632
634
  r"\s+(?:as\s+of|dated|effective)\b",
633
635
  )
634
636
 
635
637
 
636
638
  def _clean_party_name(s: str) -> str:
637
639
  """Trim a captured party name down to the entity name, dropping trailing
638
- descriptors ('a Delaware corporation', 'd/b/a ...', 'as of ...')."""
640
+ descriptors ('a Delaware corporation', 'd/b/a ...', 'together with ...',
641
+ 'as of ...') and any dangling unclosed parenthetical ('(each of them ...')."""
639
642
  s = re.sub(r"\s+", " ", s).strip().strip(",").strip()
640
643
  for pat in _PARTY_CUT_MARKERS:
641
644
  m = re.search(pat, s, re.IGNORECASE)
642
645
  if m:
643
646
  s = s[: m.start()].strip().strip(",").strip()
647
+ # Drop a trailing parenthetical that was opened but never closed (the close
648
+ # fell outside the captured span), e.g. "Glenn Rufrano (each of them being".
649
+ if "(" in s and ")" not in s:
650
+ s = s[: s.index("(")].strip().strip(",").strip()
644
651
  return s.strip("\"“”").strip()
645
652
 
646
653
 
@@ -741,9 +748,45 @@ def extract_defined_terms(text: str) -> List[JSON]:
741
748
  return [{"term": t, "confidence": 0.6, "source": "deterministic"} for t in seen]
742
749
 
743
750
 
751
+ # Detected-heading titles that are almost never real clauses: front/back-matter,
752
+ # page/document codes, exhibit & schedule references.
753
+ _NOISE_TITLE_PREFIX_RE = re.compile(
754
+ r"^(?:table\s+of\s+contents|exhibit|schedule|annex|appendix|attachment|"
755
+ r"signature\s+page|page)\b",
756
+ re.IGNORECASE,
757
+ )
758
+
759
+
760
+ def _is_noise_clause_title(title: str) -> bool:
761
+ """True for detected 'headings' that are structural noise rather than
762
+ clauses -- document codes/page numbers (4+ consecutive digits, e.g.
763
+ 'Ks 112708-2'), and front/back-matter like 'Table of Contents' or
764
+ 'Exhibit B'. Safe filters only; kept conservative to avoid dropping real
765
+ clauses."""
766
+ t = title.strip()
767
+ if re.search(r"\d{4,}", t):
768
+ return True
769
+ if _NOISE_TITLE_PREFIX_RE.match(t):
770
+ return True
771
+ return False
772
+
773
+
744
774
  def extract_clauses(text: str) -> List[JSON]:
775
+ detected = detect_clauses(text)
776
+ # A heading whose title repeats 3+ times across the document is almost
777
+ # always a running header/footer (e.g. a page code), not that many distinct
778
+ # clauses -- drop every occurrence. (Counted on the normalized title.)
779
+ counts: Dict[str, int] = {}
780
+ for c in detected:
781
+ k = _norm_clause_key(c["title"])
782
+ counts[k] = counts.get(k, 0) + 1
783
+
745
784
  out: List[JSON] = []
746
- for c in detect_clauses(text):
785
+ for c in detected:
786
+ if counts[_norm_clause_key(c["title"])] >= 3:
787
+ continue
788
+ if _is_noise_clause_title(c["title"]):
789
+ continue
747
790
  canonical, mapped = _canonicalize_clause(c["title"])
748
791
  tier = c["tier"]
749
792
  base = {"h2": 0.95, "bold-numbered": 0.85, "numbered": 0.8,
@@ -907,6 +950,39 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
907
950
  return "", warnings
908
951
 
909
952
 
953
+ def _docx_paragraph_style(ppr: Any, w: str) -> Optional[str]:
954
+ if ppr is None:
955
+ return None
956
+ st = ppr.find(w + "pStyle")
957
+ return st.get(w + "val") if st is not None else None
958
+
959
+
960
+ def _is_heading_style(style: Optional[str]) -> bool:
961
+ """True for Word built-in heading/title styles (Heading1-9, Title, and the
962
+ 'H1'/'H2' shorthands). These mark clause headings whose visible numbers are
963
+ auto-generated and absent from the raw text."""
964
+ if not style:
965
+ return False
966
+ s = style.lower()
967
+ return "heading" in s or s == "title" or bool(re.fullmatch(r"h[1-9]", s))
968
+
969
+
970
+ def _docx_heading_title(text: str) -> Optional[str]:
971
+ """Pull the clause title out of a heading paragraph. Many contracts use a
972
+ run-in heading -- 'Performing Services. Contractor will ...' -- where the
973
+ title is the lead before the first sentence break; a standalone header
974
+ ('Services & Restrictions') has no such break and is used whole.
975
+
976
+ Returns None when the paragraph is really a full sentence that merely
977
+ carries a heading style (no run-in title) -- those would otherwise become
978
+ garbage clause titles and mis-map under substring matching."""
979
+ m = re.match(r"\s*(.{2,80}?)[.:]\s+[A-Z(\"“]", text)
980
+ title = m.group(1).strip() if m else text.strip()
981
+ if len(title) > 70 or len(title.split()) > 9:
982
+ return None
983
+ return title
984
+
985
+
910
986
  def _read_docx_stdlib(raw: bytes) -> str:
911
987
  import io
912
988
  import zipfile
@@ -919,6 +995,7 @@ def _read_docx_stdlib(raw: bytes) -> str:
919
995
  paras: List[str] = []
920
996
  # iter over w:p in document order (includes paragraphs inside table cells).
921
997
  for p in root.iter(w + "p"):
998
+ style = _docx_paragraph_style(p.find(w + "pPr"), w)
922
999
  run_texts: List[str] = []
923
1000
  any_text = False
924
1001
  all_bold = True
@@ -935,6 +1012,17 @@ def _read_docx_stdlib(raw: bytes) -> str:
935
1012
  if not line:
936
1013
  paras.append("")
937
1014
  continue
1015
+ # Word heading styles carry the clause structure (their numbers are
1016
+ # auto-generated, so absent from text). Emit them as H2 so the clause
1017
+ # cascade's strongest tier detects them; keep any run-in body too.
1018
+ if _is_heading_style(style):
1019
+ title = _docx_heading_title(line)
1020
+ if title is not None:
1021
+ paras.append(f"## {title}")
1022
+ if len(title) < len(line):
1023
+ paras.append(line[len(title):].lstrip(" .:\t"))
1024
+ continue
1025
+ # Sentence carrying a heading style -> treat as ordinary body text.
938
1026
  if any_text and all_bold:
939
1027
  line = f"**{line}**"
940
1028
  paras.append(line)
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.2"
7
+ version = "0.1.4"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -42,14 +42,36 @@ _DOCX_PARAS = [
42
42
  _W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
43
43
 
44
44
 
45
- def _docx_paragraph(text: str, bold: bool) -> str:
45
+ def _docx_paragraph(text: str, bold: bool = False, style: str = "") -> str:
46
+ ppr = f'<w:pPr><w:pStyle w:val="{style}"/></w:pPr>' if style else ""
46
47
  rpr = "<w:rPr><w:b/></w:rPr>" if bold else ""
47
- return (f"<w:p><w:r>{rpr}"
48
+ return (f"<w:p>{ppr}<w:r>{rpr}"
48
49
  f'<w:t xml:space="preserve">{escape(text)}</w:t></w:r></w:p>')
49
50
 
50
51
 
51
- def build_docx() -> bytes:
52
- body = "".join(_docx_paragraph(t, b) for t, b in _DOCX_PARAS)
52
+ # A Word-styled agreement: clause structure carried by Heading1 styles (their
53
+ # numbers are auto-generated, absent from text), including a run-in heading and
54
+ # a full sentence that merely carries the heading style (must be rejected).
55
+ _HEADING_DOCX_PARAS = [
56
+ ('Cloud Service Agreement', False, "Title"),
57
+ ('This Cloud Service Agreement is entered into as of April 4, 2024, by and '
58
+ 'between Initech Software, Inc. (the "Provider") and Globex Corporation '
59
+ '(the "Customer").', False, ""),
60
+ ('Confidentiality', False, "Heading1"),
61
+ ('Each party will protect the other party’s Confidential Information.', False, ""),
62
+ ('Payment. Customer will pay the fees set out in the Order Form within '
63
+ 'thirty (30) days.', False, "Heading1"),
64
+ ('Term & Termination', False, "Heading1"),
65
+ ('The term of this Agreement is two (2) years and will automatically renew '
66
+ 'for successive one-year terms.', False, ""),
67
+ ('Either party may terminate this Agreement upon material breach that '
68
+ 'remains uncured for thirty days after written notice.', False, "Heading1"),
69
+ ('Governing Law', False, "Heading1"),
70
+ ('This Agreement is governed by the laws of the State of New York.', False, ""),
71
+ ]
72
+
73
+
74
+ def _docx_package(body: str) -> bytes:
53
75
  document = (
54
76
  '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
55
77
  f'<w:document xmlns:w="{_W}"><w:body>{body}<w:sectPr/></w:body></w:document>'
@@ -70,14 +92,29 @@ def build_docx() -> bytes:
70
92
  'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" '
71
93
  'Target="word/document.xml"/></Relationships>'
72
94
  )
95
+ # Deterministic: a fixed timestamp on every entry so regenerating the
96
+ # fixture produces byte-identical output (stable sha256 -> stable goldens).
73
97
  buf = io.BytesIO()
74
98
  with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z:
75
- z.writestr("[Content_Types].xml", content_types)
76
- z.writestr("_rels/.rels", rels)
77
- z.writestr("word/document.xml", document)
99
+ for name, data in (("[Content_Types].xml", content_types),
100
+ ("_rels/.rels", rels),
101
+ ("word/document.xml", document)):
102
+ info = zipfile.ZipInfo(name, date_time=(1980, 1, 1, 0, 0, 0))
103
+ info.compress_type = zipfile.ZIP_DEFLATED
104
+ z.writestr(info, data)
78
105
  return buf.getvalue()
79
106
 
80
107
 
108
+ def build_docx() -> bytes:
109
+ return _docx_package("".join(_docx_paragraph(t, b) for t, b in _DOCX_PARAS))
110
+
111
+
112
+ def build_heading_docx() -> bytes:
113
+ return _docx_package(
114
+ "".join(_docx_paragraph(t, b, style=s) for t, b, s in _HEADING_DOCX_PARAS)
115
+ )
116
+
117
+
81
118
  # --- PDF: a software license with ALL-CAPS headings (Tier 3) ----------------
82
119
 
83
120
  _PDF_TEXT = """SOFTWARE LICENSE AGREEMENT
@@ -156,6 +193,7 @@ def build_scanned_pdf() -> bytes:
156
193
 
157
194
  _BINARY_FIXTURES = {
158
195
  "employment_docx.docx": build_docx,
196
+ "heading_docx.docx": build_heading_docx,
159
197
  "license_pdf.pdf": build_pdf,
160
198
  "scanned.pdf": build_scanned_pdf,
161
199
  }
@@ -20,8 +20,8 @@ from tests._fixtures_build import ensure_binary_fixtures # noqa: E402
20
20
  FIXTURES = Path(__file__).resolve().parent / "fixtures"
21
21
 
22
22
  DOCS = ["nda_h2.md", "services_bold.txt", "lease_allcaps.txt",
23
- "employment_docx.docx", "license_pdf.pdf", "services_html.html",
24
- "scanned.pdf"]
23
+ "employment_docx.docx", "heading_docx.docx", "license_pdf.pdf",
24
+ "services_html.html", "scanned.pdf"]
25
25
 
26
26
 
27
27
  def golden_for(name: str) -> dict:
@@ -25,6 +25,7 @@ CORPUS: Tuple[Tuple[str, str, str], ...] = (
25
25
  ("services_bold.txt", "bold-numbered", "text"),
26
26
  ("lease_allcaps.txt", "all-caps", "text"),
27
27
  ("employment_docx.docx", "bold-numbered", "docx"),
28
+ ("heading_docx.docx", "h2", "docx"),
28
29
  ("license_pdf.pdf", "all-caps", "pdf"),
29
30
  ("services_html.html", "numbered", "html"),
30
31
  )
@@ -2,7 +2,7 @@
2
2
  "document": {
3
3
  "title": "EMPLOYMENT AGREEMENT",
4
4
  "format": "docx",
5
- "sha256": "1ba94a7bfd5a32a6d080cc6704cefb786e26dc16a8985857d562182da5f7298f",
5
+ "sha256": "f50e4b9b0cb77250280eb4c26225009de063b5f4a2318e9e53784d3730d20bd1",
6
6
  "source_path": "employment_docx.docx"
7
7
  },
8
8
  "parties": [
@@ -138,7 +138,7 @@
138
138
  "source": "deterministic"
139
139
  },
140
140
  "_meta": {
141
- "extractor_version": "0.1.2",
141
+ "extractor_version": "0.1.4",
142
142
  "tiers_used": [
143
143
  "deterministic"
144
144
  ],
@@ -0,0 +1,142 @@
1
+ {
2
+ "document": {
3
+ "title": "Cloud Service Agreement",
4
+ "format": "docx",
5
+ "sha256": "23a3b14196cdca6b58d14c7a6836fe28ff6d2be6c2fd852badb03ab6b6e84056",
6
+ "source_path": "heading_docx.docx"
7
+ },
8
+ "parties": [
9
+ {
10
+ "name": "Initech Software, Inc.",
11
+ "confidence": 0.9,
12
+ "source": "deterministic",
13
+ "role": "Provider"
14
+ },
15
+ {
16
+ "name": "Globex Corporation",
17
+ "confidence": 0.9,
18
+ "source": "deterministic",
19
+ "role": "Customer"
20
+ }
21
+ ],
22
+ "dates": {
23
+ "effective": {
24
+ "value": "2024-04-04",
25
+ "confidence": 0.85,
26
+ "source": "deterministic"
27
+ },
28
+ "expiration": {
29
+ "value": null,
30
+ "confidence": 0.0,
31
+ "source": "none"
32
+ }
33
+ },
34
+ "term": {
35
+ "length": {
36
+ "value": "2 years",
37
+ "confidence": 0.7,
38
+ "source": "deterministic"
39
+ },
40
+ "auto_renew": {
41
+ "value": true,
42
+ "confidence": 0.65,
43
+ "source": "deterministic"
44
+ },
45
+ "notice_period_days": {
46
+ "value": null,
47
+ "confidence": 0.0,
48
+ "source": "none"
49
+ }
50
+ },
51
+ "governing_law": {
52
+ "value": "State of New York",
53
+ "confidence": 0.85,
54
+ "source": "deterministic"
55
+ },
56
+ "clauses": [
57
+ {
58
+ "canonical_title": "Cloud Service Agreement",
59
+ "detected_title": "## Cloud Service Agreement",
60
+ "tier": "h2",
61
+ "span": {
62
+ "start": 0,
63
+ "end": 191
64
+ },
65
+ "confidence": 0.71,
66
+ "source": "deterministic",
67
+ "mapped": false
68
+ },
69
+ {
70
+ "canonical_title": "Confidentiality",
71
+ "detected_title": "## Confidentiality",
72
+ "tier": "h2",
73
+ "span": {
74
+ "start": 191,
75
+ "end": 280
76
+ },
77
+ "confidence": 0.95,
78
+ "source": "deterministic",
79
+ "mapped": true
80
+ },
81
+ {
82
+ "canonical_title": "Payment",
83
+ "detected_title": "## Payment",
84
+ "tier": "h2",
85
+ "span": {
86
+ "start": 280,
87
+ "end": 371
88
+ },
89
+ "confidence": 0.95,
90
+ "source": "deterministic",
91
+ "mapped": true
92
+ },
93
+ {
94
+ "canonical_title": "Termination",
95
+ "detected_title": "## Term & Termination",
96
+ "tier": "h2",
97
+ "span": {
98
+ "start": 371,
99
+ "end": 622
100
+ },
101
+ "confidence": 0.95,
102
+ "source": "deterministic",
103
+ "mapped": true
104
+ },
105
+ {
106
+ "canonical_title": "Governing Law",
107
+ "detected_title": "## Governing Law",
108
+ "tier": "h2",
109
+ "span": {
110
+ "start": 622,
111
+ "end": 704
112
+ },
113
+ "confidence": 0.95,
114
+ "source": "deterministic",
115
+ "mapped": true
116
+ }
117
+ ],
118
+ "defined_terms": [
119
+ {
120
+ "term": "Provider",
121
+ "confidence": 0.6,
122
+ "source": "deterministic"
123
+ },
124
+ {
125
+ "term": "Customer",
126
+ "confidence": 0.6,
127
+ "source": "deterministic"
128
+ }
129
+ ],
130
+ "value": {
131
+ "value": null,
132
+ "confidence": 0.0,
133
+ "source": "none"
134
+ },
135
+ "_meta": {
136
+ "extractor_version": "0.1.4",
137
+ "tiers_used": [
138
+ "deterministic"
139
+ ],
140
+ "llm_used": false
141
+ }
142
+ }
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.2",
136
+ "extractor_version": "0.1.4",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.2",
136
+ "extractor_version": "0.1.4",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -143,7 +143,7 @@
143
143
  "source": "none"
144
144
  },
145
145
  "_meta": {
146
- "extractor_version": "0.1.2",
146
+ "extractor_version": "0.1.4",
147
147
  "tiers_used": [
148
148
  "deterministic"
149
149
  ],
@@ -48,7 +48,7 @@
48
48
  "source": "none"
49
49
  },
50
50
  "_meta": {
51
- "extractor_version": "0.1.2",
51
+ "extractor_version": "0.1.4",
52
52
  "tiers_used": [
53
53
  "deterministic"
54
54
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.2",
136
+ "extractor_version": "0.1.4",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -148,7 +148,7 @@
148
148
  "source": "deterministic"
149
149
  },
150
150
  "_meta": {
151
- "extractor_version": "0.1.2",
151
+ "extractor_version": "0.1.4",
152
152
  "tiers_used": [
153
153
  "deterministic"
154
154
  ],
@@ -69,6 +69,31 @@ def test_trailing_period_stripped_from_titles() -> None:
69
69
  assert ex._canonicalize_clause("Survival.") == ("Survival", True)
70
70
 
71
71
 
72
+ def test_repeated_heading_treated_as_boilerplate() -> None:
73
+ # A "heading" that repeats 3+ times is a running header/footer, not clauses.
74
+ body = "\n\n".join("## Ks 99-2\n\nfoo" for _ in range(4))
75
+ text = "## Confidentiality\n\nreal body\n\n" + body
76
+ clauses = ex.extract_clauses(text)
77
+ titles = [c["canonical_title"] for c in clauses]
78
+ assert "Confidentiality" in titles
79
+ assert not any("Ks" in (t or "") for t in titles)
80
+
81
+
82
+ def test_noise_clause_titles_filtered() -> None:
83
+ assert ex._is_noise_clause_title("Ks 112708-2") # 4+ digit code
84
+ assert ex._is_noise_clause_title("Table of Contents")
85
+ assert ex._is_noise_clause_title("Exhibit B")
86
+ assert ex._is_noise_clause_title("Schedule 2.1")
87
+ assert not ex._is_noise_clause_title("Confidentiality")
88
+ assert not ex._is_noise_clause_title("Term and Termination")
89
+
90
+
91
+ def test_party_cuts_together_as_agent_and_unclosed_paren() -> None:
92
+ assert ex._clean_party_name("Foo LLC, together with its affiliates") == "Foo LLC"
93
+ assert ex._clean_party_name("GE Capital Corporation, as administrative agent") == "GE Capital Corporation"
94
+ assert ex._clean_party_name("Glenn Rufrano (each of them being") == "Glenn Rufrano"
95
+
96
+
72
97
  def test_cascade_priority_h2_wins() -> None:
73
98
  # An H2 present means the bold/all-caps fallbacks must not fire.
74
99
  text = "## Real Heading\n\n**1. Not A Heading**\n\nALSO NOT A HEADING\n\nbody"
@@ -142,6 +142,37 @@ def test_pdf_unescape() -> None:
142
142
  assert ex._pdf_unescape(r"\101\102") == "AB" # octal escapes
143
143
 
144
144
 
145
+ def test_docx_heading_style_helpers() -> None:
146
+ assert ex._is_heading_style("Heading1")
147
+ assert ex._is_heading_style("Heading 2".replace(" ", ""))
148
+ assert ex._is_heading_style("Title")
149
+ assert ex._is_heading_style("h3")
150
+ assert not ex._is_heading_style("Plain")
151
+ assert not ex._is_heading_style(None)
152
+ # Run-in heading: title is the lead before the sentence body.
153
+ assert ex._docx_heading_title("Payment. Customer will pay the fees.") == "Payment"
154
+ assert ex._docx_heading_title("Governing Law") == "Governing Law"
155
+ # A full sentence carrying a heading style is rejected (not a clause title).
156
+ assert ex._docx_heading_title(
157
+ "Either party may terminate this Agreement upon material breach that "
158
+ "remains uncured for thirty days.") is None
159
+
160
+
161
+ def test_docx_heading_styles_drive_clause_map() -> None:
162
+ """The Word-styled fixture's clauses come from Heading1 styles (their
163
+ numbers are auto-generated), detected via the H2 tier; the sentence that
164
+ merely carries a heading style is not a clause."""
165
+ raw, text, fmt, _w = ex.load_source(FIXTURES / "heading_docx.docx", prefer_optional=False)
166
+ result = ex.build_extraction(text, raw, fmt, "heading_docx.docx")
167
+ assert result["clauses"], "heading-styled docx should yield clauses"
168
+ canon = {c["canonical_title"] for c in result["clauses"]}
169
+ assert {"Confidentiality", "Payment", "Governing Law"} <= canon
170
+ assert all(c["tier"] == "h2" for c in result["clauses"])
171
+ # The full-sentence "Either party may terminate ..." must not appear.
172
+ assert not any("terminate this Agreement" in c["detected_title"] for c in result["clauses"])
173
+ assert [p["name"] for p in result["parties"]] == ["Initech Software, Inc.", "Globex Corporation"]
174
+
175
+
145
176
  def test_html_extraction() -> None:
146
177
  raw, text, fmt, _w = ex.load_source(FIXTURES / "services_html.html")
147
178
  assert fmt == "html"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes