extract-cli 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {extract_cli-0.1.3 → extract_cli-0.1.5}/ARCHITECTURE.md +6 -1
  2. {extract_cli-0.1.3 → extract_cli-0.1.5}/CHANGELOG.md +43 -0
  3. {extract_cli-0.1.3 → extract_cli-0.1.5}/PKG-INFO +10 -1
  4. {extract_cli-0.1.3 → extract_cli-0.1.5}/README.md +9 -0
  5. {extract_cli-0.1.3 → extract_cli-0.1.5}/extract_cli.py +113 -11
  6. {extract_cli-0.1.3 → extract_cli-0.1.5}/pyproject.toml +1 -1
  7. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/_fixtures_build.py +45 -7
  8. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/_make_goldens.py +2 -2
  9. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/conftest.py +1 -0
  10. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/employment_docx.docx +0 -0
  11. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/employment_docx.docx.expected.json +2 -2
  12. extract_cli-0.1.5/tests/fixtures/heading_docx.docx +0 -0
  13. extract_cli-0.1.5/tests/fixtures/heading_docx.docx.expected.json +142 -0
  14. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  15. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  16. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/nda_h2.md.expected.json +1 -1
  17. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/scanned.pdf.expected.json +1 -1
  18. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/services_bold.txt.expected.json +1 -1
  19. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/services_html.html.expected.json +1 -1
  20. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_llm.py +35 -0
  21. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_misc.py +31 -0
  22. {extract_cli-0.1.3 → extract_cli-0.1.5}/.gitignore +0 -0
  23. {extract_cli-0.1.3 → extract_cli-0.1.5}/CONTRIBUTING.md +0 -0
  24. {extract_cli-0.1.3 → extract_cli-0.1.5}/LICENSE +0 -0
  25. {extract_cli-0.1.3 → extract_cli-0.1.5}/Makefile +0 -0
  26. {extract_cli-0.1.3 → extract_cli-0.1.5}/config/llm.json.example +0 -0
  27. {extract_cli-0.1.3 → extract_cli-0.1.5}/docs/INTEROP.md +0 -0
  28. {extract_cli-0.1.3 → extract_cli-0.1.5}/docs/spec/extract-output.schema.json +0 -0
  29. {extract_cli-0.1.3 → extract_cli-0.1.5}/scripts/release.py +0 -0
  30. {extract_cli-0.1.3 → extract_cli-0.1.5}/scripts/validate_against_spec.py +0 -0
  31. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/_schema_validator.py +0 -0
  32. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/lease_allcaps.txt +0 -0
  33. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/license_pdf.pdf +0 -0
  34. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/nda_h2.md +0 -0
  35. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/scanned.pdf +0 -0
  36. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/services_bold.txt +0 -0
  37. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/fixtures/services_html.html +0 -0
  38. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_clause_map.py +0 -0
  39. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_cli.py +0 -0
  40. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_deterministic.py +0 -0
  41. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_property.py +0 -0
  42. {extract_cli-0.1.3 → extract_cli-0.1.5}/tests/test_schema_conformance.py +0 -0
@@ -80,7 +80,12 @@ endpoint. Any failure (no config, network error, unparseable JSON) is caught:
80
80
  a warning to stderr, deterministic output untouched. The LLM only *adds* fuzzy
81
81
  fields (`term.renewal_mechanics`, `obligations`) and fills `governing_law` only
82
82
  when the deterministic tier found nothing — it never overwrites a deterministic
83
- value.
83
+ value. As a **clause-map fallback**, when the deterministic cascade returned no
84
+ clauses the LLM is asked for the section headings (the clause keys are added to
85
+ the prompt only then); the titles are normalized through the same
86
+ `_canonicalize_clause` vocabulary, located in the text for a best-effort span,
87
+ and emitted with `tier: "llm"` / `source: "llm"`. This covers DOCX files that
88
+ auto-number with no heading style (their numbers live only in `numbering.xml`).
84
89
 
85
90
  ## The output contract
86
91
 
@@ -6,6 +6,47 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.5] - 2026-05-21
10
+
11
+ ### Added
12
+ - **LLM clause-map fallback** (opt-in, `--llm` only). When the deterministic
13
+ cascade detects no clauses — e.g. a `.docx` that auto-numbers via Word's
14
+ numbering with no heading style, the limitation noted in 0.1.4 — the LLM is
15
+ asked for the section headings (the clause request is added to the prompt
16
+ only in that case). Returned titles are normalized through the same canonical
17
+ vocabulary as the deterministic path, located in the document for a
18
+ best-effort span, and emitted with `tier: "llm"`, `source: "llm"`, and a
19
+ modest confidence. The LLM is never consulted for clauses the deterministic
20
+ cascade already found, and the deterministic core remains fully useful with
21
+ no LLM. No schema change (the clause `tier`/`source` enums already allow
22
+ `llm`).
23
+
24
+ ## [0.1.4] - 2026-05-21
25
+
26
+ DOCX clause detection, driven by testing against 20 real `.docx` contracts
27
+ (Common Paper / Bonterms / YC templates via open-agreements, plus government
28
+ samples) — the format we expect most.
29
+
30
+ ### Fixed
31
+ - **The DOCX reader now honors Word heading styles.** Real Word contracts carry
32
+ their clause structure in `Heading1`–`Heading9`/`Title` paragraph styles with
33
+ *auto-generated* numbers (absent from the raw text), so the prior cascade
34
+ found almost no clauses. Heading-styled paragraphs are now emitted as `##`
35
+ headings (detected by the strongest tier); run-in headings
36
+ (`Payment. Customer will pay …`) are split into title + body, and a full
37
+ sentence that merely carries a heading style is rejected (not a clause).
38
+ Across the 20-doc sample this took heading-styled agreements from ~0 clauses
39
+ to a clean 14–21 distinct suite-vocabulary clauses each.
40
+ - Binary DOCX test fixtures are now generated deterministically (fixed zip
41
+ timestamp) so their sha256 — and the goldens — are stable across regenerations.
42
+
43
+ ### Known limitations (documented)
44
+ - DOCX files that auto-number clauses via `numbering.xml` with **no heading style and
45
+ no bold lead** (some Bonterms/older templates use a flat `Plain`/`ListParagraph`
46
+ style) still yield no clause map: the heading text carries no detectable
47
+ signal without reconstructing Word's numbering counters. Parties/dates/
48
+ governing-law still extract.
49
+
9
50
  ## [0.1.3] - 2026-05-21
10
51
 
11
52
  Clause-map de-noising and party cleanup, driven by testing against 10 more
@@ -140,6 +181,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
140
181
  intentionally *not* governed by the output schema (the schema describes the
141
182
  full default output).
142
183
 
184
+ [0.1.5]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.5
185
+ [0.1.4]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.4
143
186
  [0.1.3]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.3
144
187
  [0.1.2]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.2
145
188
  [0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -102,6 +102,15 @@ opt-in, never in a hot path, and gated behind an explicit flag and a config
102
102
  file — if no config is present, `--llm` degrades gracefully with a warning and
103
103
  you still get the full deterministic output.
104
104
 
105
+ **Clause-map fallback.** Some documents (e.g. `.docx` files that auto-number clauses
106
+ via Word's numbering with no heading style) carry no signal the deterministic
107
+ cascade can see, so its clause map comes back empty. When `--llm` is set *and*
108
+ no clauses were detected, the LLM is asked for the section headings; the result
109
+ is normalized through the same canonical vocabulary and emitted with
110
+ `tier: "llm"`, `source: "llm"`, and a modest confidence (verify, not trust).
111
+ When the deterministic cascade already found clauses, the LLM is not consulted
112
+ for them.
113
+
105
114
  ## Commands
106
115
 
107
116
  ```bash
@@ -64,6 +64,15 @@ opt-in, never in a hot path, and gated behind an explicit flag and a config
64
64
  file — if no config is present, `--llm` degrades gracefully with a warning and
65
65
  you still get the full deterministic output.
66
66
 
67
+ **Clause-map fallback.** Some documents (e.g. `.docx` files that auto-number clauses
68
+ via Word's numbering with no heading style) carry no signal the deterministic
69
+ cascade can see, so its clause map comes back empty. When `--llm` is set *and*
70
+ no clauses were detected, the LLM is asked for the section headings; the result
71
+ is normalized through the same canonical vocabulary and emitted with
72
+ `tier: "llm"`, `source: "llm"`, and a modest confidence (verify, not trust).
73
+ When the deterministic cascade already found clauses, the LLM is not consulted
74
+ for them.
75
+
67
76
  ## Commands
68
77
 
69
78
  ```bash
@@ -43,11 +43,11 @@ import urllib.request
43
43
  from pathlib import Path
44
44
  from typing import Any, Dict, List, Optional, Tuple
45
45
 
46
- __version__ = "0.1.3"
46
+ __version__ = "0.1.5"
47
47
 
48
48
  # Bumped independently of the package version when the *extraction logic*
49
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
50
- EXTRACTOR_VERSION = "0.1.3"
50
+ EXTRACTOR_VERSION = "0.1.5"
51
51
 
52
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
53
53
  SCHEMA_VERSION = 1
@@ -950,6 +950,39 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
950
950
  return "", warnings
951
951
 
952
952
 
953
+ def _docx_paragraph_style(ppr: Any, w: str) -> Optional[str]:
954
+ if ppr is None:
955
+ return None
956
+ st = ppr.find(w + "pStyle")
957
+ return st.get(w + "val") if st is not None else None
958
+
959
+
960
+ def _is_heading_style(style: Optional[str]) -> bool:
961
+ """True for Word built-in heading/title styles (Heading1-9, Title, and the
962
+ 'H1'/'H2' shorthands). These mark clause headings whose visible numbers are
963
+ auto-generated and absent from the raw text."""
964
+ if not style:
965
+ return False
966
+ s = style.lower()
967
+ return "heading" in s or s == "title" or bool(re.fullmatch(r"h[1-9]", s))
968
+
969
+
970
+ def _docx_heading_title(text: str) -> Optional[str]:
971
+ """Pull the clause title out of a heading paragraph. Many contracts use a
972
+ run-in heading -- 'Performing Services. Contractor will ...' -- where the
973
+ title is the lead before the first sentence break; a standalone header
974
+ ('Services & Restrictions') has no such break and is used whole.
975
+
976
+ Returns None when the paragraph is really a full sentence that merely
977
+ carries a heading style (no run-in title) -- those would otherwise become
978
+ garbage clause titles and mis-map under substring matching."""
979
+ m = re.match(r"\s*(.{2,80}?)[.:]\s+[A-Z(\"“]", text)
980
+ title = m.group(1).strip() if m else text.strip()
981
+ if len(title) > 70 or len(title.split()) > 9:
982
+ return None
983
+ return title
984
+
985
+
953
986
  def _read_docx_stdlib(raw: bytes) -> str:
954
987
  import io
955
988
  import zipfile
@@ -962,6 +995,7 @@ def _read_docx_stdlib(raw: bytes) -> str:
962
995
  paras: List[str] = []
963
996
  # iter over w:p in document order (includes paragraphs inside table cells).
964
997
  for p in root.iter(w + "p"):
998
+ style = _docx_paragraph_style(p.find(w + "pPr"), w)
965
999
  run_texts: List[str] = []
966
1000
  any_text = False
967
1001
  all_bold = True
@@ -978,6 +1012,17 @@ def _read_docx_stdlib(raw: bytes) -> str:
978
1012
  if not line:
979
1013
  paras.append("")
980
1014
  continue
1015
+ # Word heading styles carry the clause structure (their numbers are
1016
+ # auto-generated, so absent from text). Emit them as H2 so the clause
1017
+ # cascade's strongest tier detects them; keep any run-in body too.
1018
+ if _is_heading_style(style):
1019
+ title = _docx_heading_title(line)
1020
+ if title is not None:
1021
+ paras.append(f"## {title}")
1022
+ if len(title) < len(line):
1023
+ paras.append(line[len(title):].lstrip(" .:\t"))
1024
+ continue
1025
+ # Sentence carrying a heading style -> treat as ordinary body text.
981
1026
  if any_text and all_bold:
982
1027
  line = f"**{line}**"
983
1028
  paras.append(line)
@@ -1230,15 +1275,29 @@ def load_llm_config() -> Optional[JSON]:
1230
1275
  return None
1231
1276
 
1232
1277
 
1233
- _LLM_PROMPT = (
1234
- "You are a contract-extraction assistant. Given the contract text, return "
1235
- "ONLY a compact JSON object with keys: renewal_mechanics (string or null), "
1236
- "obligations (array of short strings, max 5), governing_law (string or "
1237
- "null). Base answers strictly on the text. No prose, JSON only.\n\n"
1238
- "CONTRACT:\n"
1278
+ _LLM_PROMPT_KEYS = (
1279
+ "renewal_mechanics (string or null), obligations (array of short strings, "
1280
+ "max 5), governing_law (string or null)"
1281
+ )
1282
+ # Requested only when the deterministic clause cascade found nothing (e.g. a
1283
+ # DOCX that auto-numbers with no heading style): ask the model for the section
1284
+ # headings so we can still produce a clause map.
1285
+ _LLM_PROMPT_CLAUSES = (
1286
+ ", clauses (array, max 40, of objects {\"title\": \"<the section/clause "
1287
+ "heading, verbatim if possible>\"} in document order, top-level sections "
1288
+ "only)"
1239
1289
  )
1240
1290
 
1241
1291
 
1292
+ def _build_llm_prompt(text: str, want_clauses: bool) -> str:
1293
+ keys = _LLM_PROMPT_KEYS + (_LLM_PROMPT_CLAUSES if want_clauses else "")
1294
+ return (
1295
+ "You are a contract-extraction assistant. Given the contract text, "
1296
+ "return ONLY a compact JSON object with keys: " + keys + ". Base answers "
1297
+ "strictly on the text. No prose, JSON only.\n\nCONTRACT:\n" + text[:16000]
1298
+ )
1299
+
1300
+
1242
1301
  def _llm_request(cfg: JSON, prompt: str, timeout: float = 30.0) -> Optional[str]:
1243
1302
  provider = str(cfg.get("provider", "anthropic")).lower()
1244
1303
  model = cfg.get("model") or ("claude-sonnet-4-6" if provider == "anthropic" else "gpt-4o-mini")
@@ -1292,8 +1351,44 @@ def _extract_json_object(s: str) -> Optional[JSON]:
1292
1351
  return None
1293
1352
 
1294
1353
 
1354
+ def _llm_clause_map(raw: Any, text: str) -> List[JSON]:
1355
+ """Convert LLM-returned clause titles into schema-conformant clause objects.
1356
+ Titles are canonicalized through the same suite vocabulary the deterministic
1357
+ path uses, located in the document for a best-effort span, and marked
1358
+ tier/source = 'llm' with a modest confidence (verify, not trust)."""
1359
+ if not isinstance(raw, list):
1360
+ return []
1361
+ low = text.lower()
1362
+ out: List[JSON] = []
1363
+ seen: set[str] = set()
1364
+ for item in raw[:40]:
1365
+ title: Any = item.get("title") if isinstance(item, dict) else item
1366
+ if not isinstance(title, str) or not title.strip():
1367
+ continue
1368
+ title = re.sub(r"\s+", " ", title.strip())
1369
+ key = _norm_clause_key(title)
1370
+ if not key or key in seen or _is_noise_clause_title(title):
1371
+ continue
1372
+ seen.add(key)
1373
+ canonical, mapped = _canonicalize_clause(title)
1374
+ idx = low.find(title.lower())
1375
+ span = ({"start": idx, "end": min(idx + len(title), len(text))}
1376
+ if idx >= 0 else {"start": 0, "end": 0})
1377
+ out.append({
1378
+ "canonical_title": canonical,
1379
+ "detected_title": title,
1380
+ "tier": "llm",
1381
+ "span": span,
1382
+ "confidence": 0.5,
1383
+ "source": "llm",
1384
+ "mapped": mapped,
1385
+ })
1386
+ return out
1387
+
1388
+
1295
1389
  def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
1296
- """Opt-in enrichment of fuzzy fields. Mutates `result` in place. Any
1390
+ """Opt-in enrichment of fuzzy fields, plus a clause-map fallback when the
1391
+ deterministic cascade found no clauses. Mutates `result` in place. Any
1297
1392
  failure (no config, network error, bad JSON) degrades gracefully: a warning
1298
1393
  to stderr and the deterministic output is left untouched."""
1299
1394
  cfg = load_llm_config()
@@ -1301,7 +1396,8 @@ def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
1301
1396
  _warn(args_ns, "no LLM config found (~/.config/contract-ops/llm.json or "
1302
1397
  "./config/llm.json); skipping --llm enrichment")
1303
1398
  return
1304
- prompt = _LLM_PROMPT + text[:12000]
1399
+ want_clauses = not result["clauses"]
1400
+ prompt = _build_llm_prompt(text, want_clauses)
1305
1401
  try:
1306
1402
  raw = _llm_request(cfg, prompt)
1307
1403
  except (urllib.error.URLError, TimeoutError, OSError, ValueError) as e:
@@ -1331,6 +1427,11 @@ def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
1331
1427
  if isinstance(gl, str) and gl.strip() and result["governing_law"]["source"] == "none":
1332
1428
  result["governing_law"] = _field(gl.strip(), 0.6, "llm")
1333
1429
  enriched = True
1430
+ if want_clauses:
1431
+ cmap = _llm_clause_map(obj.get("clauses"), text)
1432
+ if cmap:
1433
+ result["clauses"] = cmap
1434
+ enriched = True
1334
1435
 
1335
1436
  result["_meta"]["llm_used"] = True
1336
1437
  if enriched and "llm" not in result["_meta"]["tiers_used"]:
@@ -1613,7 +1714,8 @@ FIELD_CATALOG: Tuple[Tuple[str, str, str], ...] = (
1613
1714
  ("term.notice_period_days", "deterministic", "Notice period in days, best-effort"),
1614
1715
  ("term.auto_renew", "deterministic", "Auto-renewal flag, best-effort"),
1615
1716
  ("governing_law", "deterministic", "Governing law / jurisdiction"),
1616
- ("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary"),
1717
+ ("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary "
1718
+ "(LLM fallback under --llm when no headings are detected)"),
1617
1719
  ("defined_terms", "deterministic", "Defined-term inventory (quoted / parenthetical)"),
1618
1720
  ("value", "deterministic", "Headline monetary value"),
1619
1721
  ("term.renewal_mechanics", "llm", "Renewal mechanics (fuzzy; --llm only)"),
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.3"
7
+ version = "0.1.5"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -42,14 +42,36 @@ _DOCX_PARAS = [
42
42
  _W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
43
43
 
44
44
 
45
- def _docx_paragraph(text: str, bold: bool) -> str:
45
+ def _docx_paragraph(text: str, bold: bool = False, style: str = "") -> str:
46
+ ppr = f'<w:pPr><w:pStyle w:val="{style}"/></w:pPr>' if style else ""
46
47
  rpr = "<w:rPr><w:b/></w:rPr>" if bold else ""
47
- return (f"<w:p><w:r>{rpr}"
48
+ return (f"<w:p>{ppr}<w:r>{rpr}"
48
49
  f'<w:t xml:space="preserve">{escape(text)}</w:t></w:r></w:p>')
49
50
 
50
51
 
51
- def build_docx() -> bytes:
52
- body = "".join(_docx_paragraph(t, b) for t, b in _DOCX_PARAS)
52
+ # A Word-styled agreement: clause structure carried by Heading1 styles (their
53
+ # numbers are auto-generated, absent from text), including a run-in heading and
54
+ # a full sentence that merely carries the heading style (must be rejected).
55
+ _HEADING_DOCX_PARAS = [
56
+ ('Cloud Service Agreement', False, "Title"),
57
+ ('This Cloud Service Agreement is entered into as of April 4, 2024, by and '
58
+ 'between Initech Software, Inc. (the "Provider") and Globex Corporation '
59
+ '(the "Customer").', False, ""),
60
+ ('Confidentiality', False, "Heading1"),
61
+ ('Each party will protect the other party’s Confidential Information.', False, ""),
62
+ ('Payment. Customer will pay the fees set out in the Order Form within '
63
+ 'thirty (30) days.', False, "Heading1"),
64
+ ('Term & Termination', False, "Heading1"),
65
+ ('The term of this Agreement is two (2) years and will automatically renew '
66
+ 'for successive one-year terms.', False, ""),
67
+ ('Either party may terminate this Agreement upon material breach that '
68
+ 'remains uncured for thirty days after written notice.', False, "Heading1"),
69
+ ('Governing Law', False, "Heading1"),
70
+ ('This Agreement is governed by the laws of the State of New York.', False, ""),
71
+ ]
72
+
73
+
74
+ def _docx_package(body: str) -> bytes:
53
75
  document = (
54
76
  '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
55
77
  f'<w:document xmlns:w="{_W}"><w:body>{body}<w:sectPr/></w:body></w:document>'
@@ -70,14 +92,29 @@ def build_docx() -> bytes:
70
92
  'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" '
71
93
  'Target="word/document.xml"/></Relationships>'
72
94
  )
95
+ # Deterministic: a fixed timestamp on every entry so regenerating the
96
+ # fixture produces byte-identical output (stable sha256 -> stable goldens).
73
97
  buf = io.BytesIO()
74
98
  with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z:
75
- z.writestr("[Content_Types].xml", content_types)
76
- z.writestr("_rels/.rels", rels)
77
- z.writestr("word/document.xml", document)
99
+ for name, data in (("[Content_Types].xml", content_types),
100
+ ("_rels/.rels", rels),
101
+ ("word/document.xml", document)):
102
+ info = zipfile.ZipInfo(name, date_time=(1980, 1, 1, 0, 0, 0))
103
+ info.compress_type = zipfile.ZIP_DEFLATED
104
+ z.writestr(info, data)
78
105
  return buf.getvalue()
79
106
 
80
107
 
108
+ def build_docx() -> bytes:
109
+ return _docx_package("".join(_docx_paragraph(t, b) for t, b in _DOCX_PARAS))
110
+
111
+
112
+ def build_heading_docx() -> bytes:
113
+ return _docx_package(
114
+ "".join(_docx_paragraph(t, b, style=s) for t, b, s in _HEADING_DOCX_PARAS)
115
+ )
116
+
117
+
81
118
  # --- PDF: a software license with ALL-CAPS headings (Tier 3) ----------------
82
119
 
83
120
  _PDF_TEXT = """SOFTWARE LICENSE AGREEMENT
@@ -156,6 +193,7 @@ def build_scanned_pdf() -> bytes:
156
193
 
157
194
  _BINARY_FIXTURES = {
158
195
  "employment_docx.docx": build_docx,
196
+ "heading_docx.docx": build_heading_docx,
159
197
  "license_pdf.pdf": build_pdf,
160
198
  "scanned.pdf": build_scanned_pdf,
161
199
  }
@@ -20,8 +20,8 @@ from tests._fixtures_build import ensure_binary_fixtures # noqa: E402
20
20
  FIXTURES = Path(__file__).resolve().parent / "fixtures"
21
21
 
22
22
  DOCS = ["nda_h2.md", "services_bold.txt", "lease_allcaps.txt",
23
- "employment_docx.docx", "license_pdf.pdf", "services_html.html",
24
- "scanned.pdf"]
23
+ "employment_docx.docx", "heading_docx.docx", "license_pdf.pdf",
24
+ "services_html.html", "scanned.pdf"]
25
25
 
26
26
 
27
27
  def golden_for(name: str) -> dict:
@@ -25,6 +25,7 @@ CORPUS: Tuple[Tuple[str, str, str], ...] = (
25
25
  ("services_bold.txt", "bold-numbered", "text"),
26
26
  ("lease_allcaps.txt", "all-caps", "text"),
27
27
  ("employment_docx.docx", "bold-numbered", "docx"),
28
+ ("heading_docx.docx", "h2", "docx"),
28
29
  ("license_pdf.pdf", "all-caps", "pdf"),
29
30
  ("services_html.html", "numbered", "html"),
30
31
  )
@@ -2,7 +2,7 @@
2
2
  "document": {
3
3
  "title": "EMPLOYMENT AGREEMENT",
4
4
  "format": "docx",
5
- "sha256": "1ba94a7bfd5a32a6d080cc6704cefb786e26dc16a8985857d562182da5f7298f",
5
+ "sha256": "f50e4b9b0cb77250280eb4c26225009de063b5f4a2318e9e53784d3730d20bd1",
6
6
  "source_path": "employment_docx.docx"
7
7
  },
8
8
  "parties": [
@@ -138,7 +138,7 @@
138
138
  "source": "deterministic"
139
139
  },
140
140
  "_meta": {
141
- "extractor_version": "0.1.3",
141
+ "extractor_version": "0.1.5",
142
142
  "tiers_used": [
143
143
  "deterministic"
144
144
  ],
@@ -0,0 +1,142 @@
1
+ {
2
+ "document": {
3
+ "title": "Cloud Service Agreement",
4
+ "format": "docx",
5
+ "sha256": "23a3b14196cdca6b58d14c7a6836fe28ff6d2be6c2fd852badb03ab6b6e84056",
6
+ "source_path": "heading_docx.docx"
7
+ },
8
+ "parties": [
9
+ {
10
+ "name": "Initech Software, Inc.",
11
+ "confidence": 0.9,
12
+ "source": "deterministic",
13
+ "role": "Provider"
14
+ },
15
+ {
16
+ "name": "Globex Corporation",
17
+ "confidence": 0.9,
18
+ "source": "deterministic",
19
+ "role": "Customer"
20
+ }
21
+ ],
22
+ "dates": {
23
+ "effective": {
24
+ "value": "2024-04-04",
25
+ "confidence": 0.85,
26
+ "source": "deterministic"
27
+ },
28
+ "expiration": {
29
+ "value": null,
30
+ "confidence": 0.0,
31
+ "source": "none"
32
+ }
33
+ },
34
+ "term": {
35
+ "length": {
36
+ "value": "2 years",
37
+ "confidence": 0.7,
38
+ "source": "deterministic"
39
+ },
40
+ "auto_renew": {
41
+ "value": true,
42
+ "confidence": 0.65,
43
+ "source": "deterministic"
44
+ },
45
+ "notice_period_days": {
46
+ "value": null,
47
+ "confidence": 0.0,
48
+ "source": "none"
49
+ }
50
+ },
51
+ "governing_law": {
52
+ "value": "State of New York",
53
+ "confidence": 0.85,
54
+ "source": "deterministic"
55
+ },
56
+ "clauses": [
57
+ {
58
+ "canonical_title": "Cloud Service Agreement",
59
+ "detected_title": "## Cloud Service Agreement",
60
+ "tier": "h2",
61
+ "span": {
62
+ "start": 0,
63
+ "end": 191
64
+ },
65
+ "confidence": 0.71,
66
+ "source": "deterministic",
67
+ "mapped": false
68
+ },
69
+ {
70
+ "canonical_title": "Confidentiality",
71
+ "detected_title": "## Confidentiality",
72
+ "tier": "h2",
73
+ "span": {
74
+ "start": 191,
75
+ "end": 280
76
+ },
77
+ "confidence": 0.95,
78
+ "source": "deterministic",
79
+ "mapped": true
80
+ },
81
+ {
82
+ "canonical_title": "Payment",
83
+ "detected_title": "## Payment",
84
+ "tier": "h2",
85
+ "span": {
86
+ "start": 280,
87
+ "end": 371
88
+ },
89
+ "confidence": 0.95,
90
+ "source": "deterministic",
91
+ "mapped": true
92
+ },
93
+ {
94
+ "canonical_title": "Termination",
95
+ "detected_title": "## Term & Termination",
96
+ "tier": "h2",
97
+ "span": {
98
+ "start": 371,
99
+ "end": 622
100
+ },
101
+ "confidence": 0.95,
102
+ "source": "deterministic",
103
+ "mapped": true
104
+ },
105
+ {
106
+ "canonical_title": "Governing Law",
107
+ "detected_title": "## Governing Law",
108
+ "tier": "h2",
109
+ "span": {
110
+ "start": 622,
111
+ "end": 704
112
+ },
113
+ "confidence": 0.95,
114
+ "source": "deterministic",
115
+ "mapped": true
116
+ }
117
+ ],
118
+ "defined_terms": [
119
+ {
120
+ "term": "Provider",
121
+ "confidence": 0.6,
122
+ "source": "deterministic"
123
+ },
124
+ {
125
+ "term": "Customer",
126
+ "confidence": 0.6,
127
+ "source": "deterministic"
128
+ }
129
+ ],
130
+ "value": {
131
+ "value": null,
132
+ "confidence": 0.0,
133
+ "source": "none"
134
+ },
135
+ "_meta": {
136
+ "extractor_version": "0.1.5",
137
+ "tiers_used": [
138
+ "deterministic"
139
+ ],
140
+ "llm_used": false
141
+ }
142
+ }
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.3",
136
+ "extractor_version": "0.1.5",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.3",
136
+ "extractor_version": "0.1.5",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -143,7 +143,7 @@
143
143
  "source": "none"
144
144
  },
145
145
  "_meta": {
146
- "extractor_version": "0.1.3",
146
+ "extractor_version": "0.1.5",
147
147
  "tiers_used": [
148
148
  "deterministic"
149
149
  ],
@@ -48,7 +48,7 @@
48
48
  "source": "none"
49
49
  },
50
50
  "_meta": {
51
- "extractor_version": "0.1.3",
51
+ "extractor_version": "0.1.5",
52
52
  "tiers_used": [
53
53
  "deterministic"
54
54
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.3",
136
+ "extractor_version": "0.1.5",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -148,7 +148,7 @@
148
148
  "source": "deterministic"
149
149
  },
150
150
  "_meta": {
151
- "extractor_version": "0.1.3",
151
+ "extractor_version": "0.1.5",
152
152
  "tiers_used": [
153
153
  "deterministic"
154
154
  ],
@@ -67,6 +67,41 @@ def test_enrich_fills_only_missing_governing_law(monkeypatch: pytest.MonkeyPatch
67
67
  assert result["governing_law"] == {"value": "France", "confidence": 0.6, "source": "llm"}
68
68
 
69
69
 
70
+ def test_llm_clause_fallback_when_deterministic_empty(monkeypatch: pytest.MonkeyPatch) -> None:
71
+ from tests._schema_validator import validate
72
+ monkeypatch.setattr(ex, "load_llm_config",
73
+ lambda: {"provider": "anthropic", "api_key": "x"})
74
+ monkeypatch.setattr(ex, "_llm_request", lambda cfg, prompt, timeout=30.0: json.dumps(
75
+ {"clauses": [{"title": "Confidentiality"}, {"title": "Governing Law"},
76
+ {"title": "Special Widget Terms"}]}))
77
+ # A document with no detectable clause headings -> 0 deterministic clauses.
78
+ text = ("This Agreement is made between Acme Co and Beta Co. The parties agree "
79
+ "to maintain confidentiality. Governed by the laws of Delaware.")
80
+ result = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
81
+ assert result["clauses"] == []
82
+ ex.llm_enrich(result, text, _ns())
83
+ cl = result["clauses"]
84
+ assert [c["canonical_title"] for c in cl] == ["Confidentiality", "Governing Law", "Special Widget Terms"]
85
+ assert all(c["tier"] == "llm" and c["source"] == "llm" for c in cl)
86
+ assert cl[0]["mapped"] is True and cl[2]["mapped"] is False
87
+ assert result["_meta"]["llm_used"] is True and "llm" in result["_meta"]["tiers_used"]
88
+ assert validate(result, ex.output_schema()) == [] # llm clauses are schema-conformant
89
+
90
+
91
+ def test_llm_does_not_replace_deterministic_clauses(monkeypatch: pytest.MonkeyPatch) -> None:
92
+ monkeypatch.setattr(ex, "load_llm_config",
93
+ lambda: {"provider": "anthropic", "api_key": "x"})
94
+ monkeypatch.setattr(ex, "_llm_request", lambda cfg, prompt, timeout=30.0: json.dumps(
95
+ {"clauses": [{"title": "Should Not Appear"}]}))
96
+ text = ex.DEMO_DOCUMENT # has H2 clauses
97
+ result = ex.build_extraction(text, text.encode("utf-8"), "markdown", "d.md")
98
+ assert result["clauses"] and all(c["tier"] == "h2" for c in result["clauses"])
99
+ ex.llm_enrich(result, text, _ns())
100
+ # Deterministic clauses are kept; the LLM clause was never requested/used.
101
+ assert all(c["tier"] == "h2" for c in result["clauses"])
102
+ assert not any(c["detected_title"] == "Should Not Appear" for c in result["clauses"])
103
+
104
+
70
105
  def test_request_error_degrades(monkeypatch: pytest.MonkeyPatch,
71
106
  capsys: pytest.CaptureFixture[str]) -> None:
72
107
  monkeypatch.setattr(ex, "load_llm_config",
@@ -142,6 +142,37 @@ def test_pdf_unescape() -> None:
142
142
  assert ex._pdf_unescape(r"\101\102") == "AB" # octal escapes
143
143
 
144
144
 
145
+ def test_docx_heading_style_helpers() -> None:
146
+ assert ex._is_heading_style("Heading1")
147
+ assert ex._is_heading_style("Heading 2".replace(" ", ""))
148
+ assert ex._is_heading_style("Title")
149
+ assert ex._is_heading_style("h3")
150
+ assert not ex._is_heading_style("Plain")
151
+ assert not ex._is_heading_style(None)
152
+ # Run-in heading: title is the lead before the sentence body.
153
+ assert ex._docx_heading_title("Payment. Customer will pay the fees.") == "Payment"
154
+ assert ex._docx_heading_title("Governing Law") == "Governing Law"
155
+ # A full sentence carrying a heading style is rejected (not a clause title).
156
+ assert ex._docx_heading_title(
157
+ "Either party may terminate this Agreement upon material breach that "
158
+ "remains uncured for thirty days.") is None
159
+
160
+
161
+ def test_docx_heading_styles_drive_clause_map() -> None:
162
+ """The Word-styled fixture's clauses come from Heading1 styles (their
163
+ numbers are auto-generated), detected via the H2 tier; the sentence that
164
+ merely carries a heading style is not a clause."""
165
+ raw, text, fmt, _w = ex.load_source(FIXTURES / "heading_docx.docx", prefer_optional=False)
166
+ result = ex.build_extraction(text, raw, fmt, "heading_docx.docx")
167
+ assert result["clauses"], "heading-styled docx should yield clauses"
168
+ canon = {c["canonical_title"] for c in result["clauses"]}
169
+ assert {"Confidentiality", "Payment", "Governing Law"} <= canon
170
+ assert all(c["tier"] == "h2" for c in result["clauses"])
171
+ # The full-sentence "Either party may terminate ..." must not appear.
172
+ assert not any("terminate this Agreement" in c["detected_title"] for c in result["clauses"])
173
+ assert [p["name"] for p in result["parties"]] == ["Initech Software, Inc.", "Globex Corporation"]
174
+
175
+
145
176
  def test_html_extraction() -> None:
146
177
  raw, text, fmt, _w = ex.load_source(FIXTURES / "services_html.html")
147
178
  assert fmt == "html"
File without changes
File without changes
File without changes
File without changes
File without changes