PyPI - extract-cli - Versions diffs - 0.1.9__tar.gz → 0.1.10__tar.gz - Mend

extract-cli 0.1.9 (tar.gz) → 0.1.10 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

{extract_cli-0.1.9 → extract_cli-0.1.10}/CHANGELOG.md RENAMED Viewed

@@ -6,6 +6,31 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
 (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
 the output schema require a major version bump**; new optional fields are minor.
+## [0.1.10] - 2026-05-22
+### Fixed
+- **The `[docx]` (python-docx) reader now honors Word heading styles**, matching
+  the stdlib reader. Previously the python-docx path concatenated paragraph text
+  and dropped `Heading1-9`/`Title` styles and `w:numPr` numbering, so installing
+  the `[docx]` extra produced an **empty clause map** on heading-styled Word
+  contracts (worse than the no-extra stdlib reader). Both readers now share one
+  emitter (`_emit_docx_paragraph`) that turns heading-styled / auto-numbered
+  paragraphs into `## headings`, so the two paths agree. New tests:
+  `test_emit_docx_paragraph` and `test_docx_readers_agree_on_clause_map` (the
+  latter asserts the python-docx and stdlib readers produce the same clause map).
+  No output-schema change.
+### Tests / quality
+- **Line coverage raised to 100%** (was 92%/94%). Added a targeted test battery
+  for the remaining reachable branches (color/`FORCE_COLOR`, `_warn` silent,
+  date/jurisdiction/title/clause edge returns, LLM request/parse/clause-map
+  branches, PDF `TJ`-array + stream/budget edges, HTML malformed fallback, DOCX
+  empty paragraph, `_is_low_signal` branches, CLI silent/help paths). Genuinely
+  unreachable defensive lines and `[docx]`/`[pdf]`-extra fidelity branches are
+  marked `# pragma: no cover`. `make coverage` now installs the extras and
+  enforces `--fail-under=100`; a CI `coverage` job gates it. No code-behavior or
+  schema change.
 ## [0.1.9] - 2026-05-22
 ### Security / robustness
@@ -271,6 +296,7 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
   intentionally *not* governed by the output schema (the schema describes the
   full default output).
+[0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
 [0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
 [0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
 [0.1.7]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.7

{extract_cli-0.1.9 → extract_cli-0.1.10}/Makefile RENAMED Viewed

@@ -31,8 +31,11 @@ test-quick:
 	$(PYTHON) -m pytest -x -q -k "not property"
 coverage:
+	# Install the [docx]/[pdf] extras so the fidelity-reader paths execute too;
+	# without them two extras-only branches stay uncovered (98% vs 100%).
+	$(PIP) install -q -e ".[dev,docx,pdf]"
 	$(PYTHON) -m coverage run --source=extract_cli -m pytest -q
-	$(PYTHON) -m coverage report -m
+	$(PYTHON) -m coverage report -m --fail-under=100
 typecheck:
 	$(PYTHON) -m mypy --strict extract_cli.py

{extract_cli-0.1.9 → extract_cli-0.1.10}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: extract-cli
-Version: 0.1.9
+Version: 0.1.10
 Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
 Project-URL: Homepage, https://cli.drbaher.com/
 Project-URL: Repository, https://github.com/DrBaher/extract-cli

{extract_cli-0.1.9 → extract_cli-0.1.10}/extract_cli.py RENAMED Viewed

@@ -43,11 +43,11 @@ import urllib.request
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
-__version__ = "0.1.9"
+__version__ = "0.1.10"
 # Bumped independently of the package version when the *extraction logic*
 # changes in a way downstream consumers should notice. Embedded in `_meta`.
-EXTRACTOR_VERSION = "0.1.9"
+EXTRACTOR_VERSION = "0.1.10"
 # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
 SCHEMA_VERSION = 1
@@ -759,7 +759,7 @@ def extract_governing_law(text: str) -> JSON:
     if not m:
         return _none_field()
     juris = re.sub(r"\s+", " ", m.group(1).strip().rstrip(".,")).strip()
-    if not juris:
+    if not juris:  # pragma: no cover - the capture group requires a leading letter
         return _none_field()
     return _field(juris, 0.85)
@@ -880,7 +880,7 @@ def extract_defined_terms(text: str) -> List[JSON]:
             # Reject sentence-like or lowercase-y captures.
             if len(term) < 2 or len(term.split()) > 6:
                 continue
-            if not term[0].isupper():
+            if not term[0].isupper():  # pragma: no cover - the regexes require an uppercase lead
                 continue
             seen.setdefault(term, None)
             if len(seen) >= 50:
@@ -1075,13 +1075,20 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
             mod = importlib.import_module("docx")
             document_cls = getattr(mod, "Document")
             doc = document_cls(str(path))
+            w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
             lines: List[str] = []
             for para in doc.paragraphs:
                 line = (para.text or "").strip()
-                if line and para.runs and all(getattr(r, "bold", False) for r in para.runs if (r.text or "").strip()):
-                    line = f"**{line}**"
-                lines.append(line)
-            for table in getattr(doc, "tables", []):
+                # Read the style + numbering off the underlying element so the
+                # cascade sees clause headings (the same logic the stdlib reader
+                # applies); python-docx alone exposes neither as a heading.
+                ppr = para._p.find(w + "pPr")
+                style = _docx_paragraph_style(ppr, w)
+                numbered = bool(ppr is not None and ppr.find(w + "numPr") is not None)
+                all_bold = bool(para.runs) and all(
+                    getattr(r, "bold", False) for r in para.runs if (r.text or "").strip())
+                _emit_docx_paragraph(lines, line, style, numbered, all_bold)
+            for table in getattr(doc, "tables", []):  # pragma: no cover - [docx] fidelity
                 for row in table.rows:
                     for cell in row.cells:
                         ct = (cell.text or "").strip()
@@ -1130,6 +1137,30 @@ def _docx_heading_title(text: str) -> Optional[str]:
     return title
+def _emit_docx_paragraph(out: List[str], line: str, style: Optional[str],
+                         numbered: bool, all_bold: bool) -> None:
+    """Append one .docx paragraph to `out` the way the clause cascade expects.
+    Heading-styled (Heading1-9/Title) or auto-numbered (`w:numPr`) paragraphs --
+    whose visible number is auto-generated and absent from the text -- become a
+    `## <title>` heading (with any run-in body split onto the next line) when the
+    lead looks like a heading; a fully-bold paragraph becomes `**...**`; anything
+    else stays plain. Shared by BOTH the python-docx and stdlib readers so the
+    two paths agree on structure (the python-docx path used to flatten headings,
+    losing the clause map on heading-styled Word docs)."""
+    if not line:
+        out.append("")
+        return
+    if _is_heading_style(style) or numbered:
+        title = _docx_heading_title(line)
+        if title is not None:
+            out.append(f"## {title}")
+            if len(title) < len(line):
+                out.append(line[len(title):].lstrip(" .:\t"))
+            return
+    out.append(f"**{line}**" if all_bold else line)
 def _read_docx_stdlib(raw: bytes) -> str:
     import io
     import zipfile
@@ -1153,39 +1184,23 @@ def _read_docx_stdlib(raw: bytes) -> str:
         style = _docx_paragraph_style(ppr, w)
         numbered = ppr is not None and ppr.find(w + "numPr") is not None
         run_texts: List[str] = []
-        any_text = False
         all_bold = True
         for r in p.iter(w + "r"):
             rpr = r.find(w + "rPr")
             bold = rpr is not None and rpr.find(w + "b") is not None
             txt = "".join(t.text or "" for t in r.iter(w + "t"))
             if txt:
-                any_text = True
                 if not bold:
                     all_bold = False
                 run_texts.append(txt)
         line = "".join(run_texts).strip()
-        if not line:
-            paras.append("")
-            continue
         # Clause structure in real Word contracts lives in heading STYLES
         # (Heading1-9/Title) or auto-NUMBERED paragraphs (w:numPr) -- in both the
-        # visible number is auto-generated and absent from the text. Emit such a
-        # paragraph as an H2 heading (strongest cascade tier) when its lead looks
-        # like a heading; _docx_heading_title rejects full-sentence body items
-        # (e.g. deep numbered sub-points), so this stays conservative. Keep any
-        # run-in body as a following paragraph.
-        if _is_heading_style(style) or numbered:
-            title = _docx_heading_title(line)
-            if title is not None:
-                paras.append(f"## {title}")
-                if len(title) < len(line):
-                    paras.append(line[len(title):].lstrip(" .:\t"))
-                continue
-            # Not heading-like -> treat as ordinary body text.
-        if any_text and all_bold:
-            line = f"**{line}**"
-        paras.append(line)
+        # visible number is auto-generated and absent from the text. The shared
+        # emitter turns those into `## headings` (run-in body split off), bolds
+        # fully-bold lines, and keeps the rest plain. _docx_heading_title rejects
+        # full-sentence body items, so this stays conservative.
+        _emit_docx_paragraph(paras, line, style, numbered, all_bold)
     return "\n\n".join(paras)
@@ -1209,7 +1224,7 @@ def _read_pdf(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str
             warnings.append(f"pypdf read failed ({e}); falling back to stdlib reader")
     try:
         text = _read_pdf_stdlib(raw)
-    except Exception as e:
+    except Exception as e:  # pragma: no cover - defensive; stdlib reader is bomb-guarded
         warnings.append(f"could not parse .pdf ({e}); treating as empty")
         return "", warnings
     return text, warnings
@@ -1342,7 +1357,7 @@ def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, s
         raise ExtractError(f"path is a directory, not a file: {path}")
     try:
         size = path.stat().st_size
-    except OSError:
+    except OSError:  # pragma: no cover - defensive; path.exists() already passed
         size = 0
     if size > MAX_INPUT_BYTES:
         raise ExtractError(
@@ -2315,7 +2330,7 @@ def main(argv: Optional[List[str]] = None) -> int:
         if hasattr(_stream, "reconfigure"):
             try:
                 _stream.reconfigure(encoding="utf-8", errors="replace")
-            except Exception:
+            except Exception:  # pragma: no cover - defensive
                 pass
     argv = sys.argv[1:] if argv is None else argv
@@ -2358,7 +2373,7 @@ def main(argv: Optional[List[str]] = None) -> int:
         if first in known:
             parser = build_parser()
             args = parser.parse_args(argv)
-            if not getattr(args, "func", None):
+            if not getattr(args, "func", None):  # pragma: no cover - argparse always sets func
                 parser.print_help()
                 return 0
         else:
@@ -2370,7 +2385,7 @@ def main(argv: Optional[List[str]] = None) -> int:
     except BrokenPipeError:  # e.g. `extract foo.md | head`
         try:
             sys.stdout.close()
-        except Exception:
+        except Exception:  # pragma: no cover - defensive
             pass
         return 0
     except KeyboardInterrupt:  # pragma: no cover
@@ -2378,5 +2393,5 @@ def main(argv: Optional[List[str]] = None) -> int:
         return 130
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     sys.exit(main())

{extract_cli-0.1.9 → extract_cli-0.1.10}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "extract-cli"
-version = "0.1.9"
+version = "0.1.10"
 description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
 readme = "README.md"
 requires-python = ">=3.9"

{extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/employment_docx.docx.expected.json RENAMED Viewed

@@ -151,7 +151,7 @@
   ],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.9",
+    "extractor_version": "0.1.10",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/heading_docx.docx.expected.json RENAMED Viewed

@@ -140,7 +140,7 @@
   "amounts": [],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.9",
+    "extractor_version": "0.1.10",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/lease_allcaps.txt.expected.json RENAMED Viewed

@@ -146,7 +146,7 @@
   ],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.9",
+    "extractor_version": "0.1.10",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/license_pdf.pdf.expected.json RENAMED Viewed

@@ -146,7 +146,7 @@
   ],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.9",
+    "extractor_version": "0.1.10",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/nda_h2.md.expected.json RENAMED Viewed

@@ -150,7 +150,7 @@
   "amounts": [],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.9",
+    "extractor_version": "0.1.10",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/numbered_docx.docx.expected.json RENAMED Viewed

@@ -140,7 +140,7 @@
   "amounts": [],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.9",
+    "extractor_version": "0.1.10",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/scanned.pdf.expected.json RENAMED Viewed

@@ -55,7 +55,7 @@
   "amounts": [],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.9",
+    "extractor_version": "0.1.10",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/services_bold.txt.expected.json RENAMED Viewed

@@ -146,7 +146,7 @@
   ],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.9",
+    "extractor_version": "0.1.10",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.9 → extract_cli-0.1.10}/tests/fixtures/services_html.html.expected.json RENAMED Viewed

@@ -161,7 +161,7 @@
   ],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.9",
+    "extractor_version": "0.1.10",
     "tiers_used": [
       "deterministic"
     ],

extract_cli-0.1.10/tests/test_coverage.py ADDED Viewed

@@ -0,0 +1,241 @@
+"""Targeted tests that exercise the remaining reachable branches, to keep line
+coverage at its practical maximum. (Genuinely-unreachable defensive lines and
+[docx]/[pdf]-extra fidelity branches are marked `# pragma: no cover` in the
+source.)"""
+from __future__ import annotations
+import argparse
+import io
+import json
+import sys as _sys
+import zipfile
+from typing import Any
+import pytest
+import extract_cli as ex
+from tests.conftest import FIXTURES
+def _ns(**kw: object) -> argparse.Namespace:
+    base = {"silent": False, "why": False}
+    base.update(kw)
+    return argparse.Namespace(**base)
+# --- color + warn -----------------------------------------------------------
+def test_color_force_on_and_isatty_exception(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.delenv("NO_COLOR", raising=False)
+    monkeypatch.setenv("FORCE_COLOR", "1")
+    assert ex._color_enabled() is True
+    assert ex._c("x", "32") == "\033[32mx\033[0m"
+    monkeypatch.delenv("FORCE_COLOR", raising=False)
+    class _Bad:
+        def isatty(self) -> bool:
+            raise ValueError("boom")
+    assert ex._color_enabled(_Bad()) is False
+def test_warn_silent_is_suppressed(capsys: pytest.CaptureFixture[str]) -> None:
+    ex._warn(_ns(silent=True), "hush")
+    assert capsys.readouterr().err == ""
+# --- small helpers ----------------------------------------------------------
+def test_titlecase_edges() -> None:
+    assert ex._titlecase("   ") == ""
+    assert ex._titlecase("IP Rights") == "IP Rights"  # acronym preserved in mixed case
+def test_word_to_int_digit_and_unknown() -> None:
+    assert ex._word_to_int("30") == 30
+    assert ex._word_to_int("zzz") is None
+def test_date_parse_none_and_unparseable_raw() -> None:
+    assert ex._parse_date_to_iso("not a date") is None
+    f = ex._date_field_from_str("13/13/2024", 0.85)  # matches shape, invalid month
+    assert f["source"] == "deterministic" and f["confidence"] < 0.85
+def test_canonicalize_empty_key() -> None:
+    assert ex._canonicalize_clause("   ") == (None, False)
+    assert ex._canonicalize_clause("1.") == (None, False)
+def test_governing_law_and_title_none() -> None:
+    assert ex.extract_governing_law("no law clause here")["source"] == "none"
+    assert ex.extract_title("", None, "text") is None
+def test_defined_terms_long_and_capped() -> None:
+    long_phrase = '"This Is A Very Long Quoted Heading Phrase Indeed"'  # > 6 words
+    many = " ".join(f'"Term {i}"' for i in range(60))
+    terms = [t["term"] for t in ex.extract_defined_terms(long_phrase + " " + many)]
+    assert not any("Very Long" in t for t in terms)
+    assert len(terms) <= 50
+def test_noise_placeholder_midstring() -> None:
+    # Placeholder not at the start -> the mid-string regex branch.
+    assert ex._is_noise_clause_title("Fee [ # ]% Cap")
+    assert ex._is_noise_clause_title("{placeholder}")
+# --- format / readers -------------------------------------------------------
+def test_detect_format_by_magic_bytes(tmp_path: Any) -> None:
+    p = tmp_path / "x.dat"
+    p.write_bytes(b"%PDF-1.4\nrest")
+    assert ex._detect_format(p, p.read_bytes()) == "pdf"
+    q = tmp_path / "y.dat"
+    q.write_bytes(b"PK\x03\x04rest")
+    assert ex._detect_format(q, q.read_bytes()) == "docx"
+def test_pdf_stream_without_endstream() -> None:
+    assert ex._read_pdf_stdlib(b"%PDF\nstream\n(text) Tj") == ""
+def test_pdf_decompression_budget_break(monkeypatch: pytest.MonkeyPatch) -> None:
+    import zlib
+    monkeypatch.setattr(ex, "MAX_DECOMPRESSED_BYTES", 10)
+    blob = b"%PDF\nstream\n" + zlib.compress(b"(Hello World) Tj " * 10) + b"\nendstream"
+    assert ex._read_pdf_stdlib(blob) == ""  # exceeds the tiny budget -> bail, no text
+def test_html_malformed_falls_back(monkeypatch: pytest.MonkeyPatch) -> None:
+    def boom(self: object, data: object) -> None:
+        raise ValueError("bad markup")
+    monkeypatch.setattr(ex._HTMLTextExtractor, "feed", boom)
+    out = ex._read_html("<p>hello <b>world</b></p>")
+    assert "hello" in out and "<" not in out  # crude tag-strip fallback
+def test_docx_empty_paragraph_stdlib() -> None:
+    w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+    body = '<w:p/><w:p><w:r><w:t>Hello</w:t></w:r></w:p>'
+    doc = f'<?xml version="1.0"?><w:document xmlns:w="{w}"><w:body>{body}</w:body></w:document>'
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as z:
+        z.writestr("[Content_Types].xml", "<Types/>")
+        z.writestr("word/document.xml", doc)
+    assert "Hello" in ex._read_docx_stdlib(buf.getvalue())
+# --- clause detection edges -------------------------------------------------
+def test_clause_heading_on_last_line() -> None:
+    clauses = ex.detect_clauses("## First\n\nbody text\n\n## Last")  # no trailing newline
+    assert clauses[-1]["title"] == "Last"
+def test_two_line_article_skips_non_heading_next_line() -> None:
+    text = ("ARTICLE I\n\nThis whole next line is a long running sentence, not a heading at all.\n\n"
+            "ARTICLE II\n\nCONFIDENTIALITY\n\nbody\n\nARTICLE III\n\nGOVERNING LAW\n\nbody")
+    titles = [c["title"] for c in ex.detect_clauses(text)]
+    assert "CONFIDENTIALITY" in titles and "GOVERNING LAW" in titles
+def test_is_low_signal_each_branch() -> None:
+    def base() -> dict:
+        return {"parties": [], "clauses": [],
+                "dates": {"effective": ex._none_field(), "expiration": ex._none_field()},
+                "governing_law": ex._none_field(), "defined_terms": []}
+    r = base(); r["clauses"] = [{}]; assert ex._is_low_signal(r) is False
+    r = base(); r["dates"]["effective"] = ex._field("2024-01-01", 0.85); assert ex._is_low_signal(r) is False
+    r = base(); r["governing_law"] = ex._field("X", 0.8); assert ex._is_low_signal(r) is False
+    r = base(); r["defined_terms"] = [{"term": "X"}]; assert ex._is_low_signal(r) is False
+    assert ex._is_low_signal(base()) is True
+# --- LLM internals (mocked transport) ---------------------------------------
+class _Resp:
+    def __init__(self, body: bytes) -> None:
+        self._b = body
+    def read(self) -> bytes:
+        return self._b
+    def __enter__(self) -> "_Resp":
+        return self
+    def __exit__(self, *a: object) -> bool:
+        return False
+def test_llm_request_openai_no_choices(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(ex.urllib.request, "urlopen",
+                        lambda req, timeout=30.0: _Resp(json.dumps({"choices": []}).encode()))
+    assert ex._llm_request({"provider": "openai", "api_key": "k"}, "p") is None
+def test_extract_json_object_invalid() -> None:
+    assert ex._extract_json_object("prefix {not valid json} suffix") is None
+def test_llm_clause_map_skips() -> None:
+    cm = ex._llm_clause_map(
+        [{"title": ""}, 123, {"title": "Recitals"}, {"title": "Confidentiality"},
+         {"title": "Confidentiality"}], "Confidentiality body")
+    assert [c["canonical_title"] for c in cm] == ["Confidentiality"]
+def test_load_llm_config_malformed(monkeypatch: pytest.MonkeyPatch, tmp_path: Any) -> None:
+    bad = tmp_path / "llm.json"
+    bad.write_text("{not json")
+    monkeypatch.setattr(ex, "LLM_CONFIG_PATHS", (bad,))
+    assert ex.load_llm_config() is None
+def test_llm_enrich_empty_and_unparseable(monkeypatch: pytest.MonkeyPatch,
+                                          capsys: pytest.CaptureFixture[str]) -> None:
+    monkeypatch.setattr(ex, "load_llm_config", lambda: {"provider": "anthropic", "api_key": "k"})
+    text = "x"
+    monkeypatch.setattr(ex, "_llm_request", lambda c, p, timeout=30.0: "")
+    ex.llm_enrich(ex.build_extraction(text, text.encode(), "text", "x.txt"), text, _ns())
+    assert "no content" in capsys.readouterr().err
+    monkeypatch.setattr(ex, "_llm_request", lambda c, p, timeout=30.0: "not json at all")
+    ex.llm_enrich(ex.build_extraction(text, text.encode(), "text", "x.txt"), text, _ns())
+    assert "could not parse" in capsys.readouterr().err
+# --- rendering / CLI edges --------------------------------------------------
+def test_render_table_unmapped_legend() -> None:
+    r = ex.build_extraction("## Zorblax Provisions\n\nbody", b"x", "markdown", "x.md")
+    assert "* = not mapped" in ex.render_table(r, no_confidence=False)
+def test_cli_silent_table_suppresses_human_view(capsys: pytest.CaptureFixture[str]) -> None:
+    assert ex.main([str(FIXTURES / "nda_h2.md"), "--silent", "--format", "table"]) == 0
+    assert "Clause map" not in capsys.readouterr().out
+def test_main_no_args_prints_help(capsys: pytest.CaptureFixture[str]) -> None:
+    assert ex.main([]) == 0
+    assert "usage" in capsys.readouterr().out.lower()
+# --- last reachable edges ---------------------------------------------------
+def test_parties_skips_empty_capture() -> None:
+    # The second "party" is just a parenthetical role -> cleans to an empty
+    # name and is skipped; the first is kept.
+    parties = ex.extract_parties('between Acme Corp and ("Receiving Party")')
+    assert [p["name"] for p in parties] == ["Acme Corp"]
+def test_signatories_skips_dupes_short_and_reserved() -> None:
+    text = "By: Jane Doe\nName: Jane Doe\nName: a\nName: the\n"
+    s = ex.extract_signatories(text)
+    assert [x["name"] for x in s] == ["Jane Doe"]
+def test_pdf_text_tj_array_branch() -> None:
+    # A TJ array of strings inside a text object.
+    assert ex._pdf_text_from_content(b"BT [(Hello) (World)] TJ ET") == "HelloWorld"

{extract_cli-0.1.9 → extract_cli-0.1.10}/tests/test_misc.py RENAMED Viewed

@@ -152,6 +152,46 @@ def test_docx_heading_style_helpers() -> None:
     # Run-in heading: title is the lead before the sentence body.
     assert ex._docx_heading_title("Payment.  Customer will pay the fees.") == "Payment"
     assert ex._docx_heading_title("Governing Law") == "Governing Law"
+def test_emit_docx_paragraph() -> None:
+    """The shared emitter both .docx readers use: heading styles / numbered
+    paragraphs become `## headings`, fully-bold lines become `**...**`."""
+    out: list[str] = []
+    ex._emit_docx_paragraph(out, "Confidentiality", "Heading2", False, False)   # heading style
+    ex._emit_docx_paragraph(out, "Term", None, True, False)                     # auto-numbered
+    ex._emit_docx_paragraph(out, "Important Notice", None, False, True)         # fully bold
+    ex._emit_docx_paragraph(out, "Just some body text.", None, False, False)    # plain
+    ex._emit_docx_paragraph(out, "", None, False, False)                        # blank
+    ex._emit_docx_paragraph(out, "Payment.  Customer will pay.", "Heading1", False, False)  # run-in
+    assert out == [
+        "## Confidentiality",
+        "## Term",
+        "**Important Notice**",
+        "Just some body text.",
+        "",
+        "## Payment",
+        "Customer will pay.",   # run-in body split onto its own line
+    ]
+def test_docx_readers_agree_on_clause_map() -> None:
+    """Regression: the python-docx reader must surface the same clause map as the
+    stdlib reader on a heading-styled .docx. The python-docx path used to flatten
+    heading styles and return an empty clause map. Skips without [docx]."""
+    pytest.importorskip("docx")
+    path = FIXTURES / "heading_docx.docx"
+    raw = path.read_bytes()
+    def clause_titles(prefer_optional: bool) -> list[str]:
+        _raw, text, fmt, _w = ex.load_source(path, prefer_optional=prefer_optional)
+        result = ex.build_extraction(text, raw, fmt, "h.docx")
+        return [c["canonical_title"] for c in result["clauses"]]
+    stdlib = clause_titles(False)
+    pydocx = clause_titles(True)
+    assert stdlib, "stdlib reader should detect the heading-styled clauses"
+    assert pydocx == stdlib, "python-docx path must agree with the stdlib reader"
     # A full sentence carrying a heading style is rejected (not a clause title).
     assert ex._docx_heading_title(
         "Either party may terminate this Agreement upon material breach that "