PyPI - extract-cli - Versions diffs - 0.1.11__tar.gz → 0.1.12__tar.gz - Mend

extract-cli 0.1.11.tar.gz → 0.1.12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

{extract_cli-0.1.11 → extract_cli-0.1.12}/CHANGELOG.md RENAMED Viewed

@@ -6,6 +6,18 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
 (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
 the output schema require a major version bump**; new optional fields are minor.
+## [0.1.12] - 2026-05-22
+### Security
+- **Fixed an XML entity-expansion ("billion laughs") vulnerability in `.docx`
+  parsing.** The 0.1.9 resource bounds only checked *size*, but a tiny
+  `word/document.xml` declaring a DTD with nested entities passes the size
+  check and then expands exponentially in the XML parser (both ElementTree and
+  lxml/python-docx resolve internal entities). A new `_docx_xml_guard` runs
+  before either reader and refuses any `document.xml` that declares a
+  DTD/entities (a legitimate OOXML part never does) — degrading gracefully to
+  empty text with a warning. Verified on both the stdlib and `[docx]` paths.
 ## [0.1.11] - 2026-05-22
 Polish pass.
@@ -321,6 +333,7 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
   intentionally *not* governed by the output schema (the schema describes the
   full default output).
+[0.1.12]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.12
 [0.1.11]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.11
 [0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
 [0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9

{extract_cli-0.1.11 → extract_cli-0.1.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: extract-cli
-Version: 0.1.11
+Version: 0.1.12
 Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
 Project-URL: Homepage, https://cli.drbaher.com/
 Project-URL: Repository, https://github.com/DrBaher/extract-cli

{extract_cli-0.1.11 → extract_cli-0.1.12}/extract_cli.py RENAMED Viewed

@@ -43,11 +43,11 @@ import urllib.request
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
-__version__ = "0.1.11"
+__version__ = "0.1.12"
 # Bumped independently of the package version when the *extraction logic*
 # changes in a way downstream consumers should notice. Embedded in `_meta`.
-EXTRACTOR_VERSION = "0.1.11"
+EXTRACTOR_VERSION = "0.1.12"
 # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
 SCHEMA_VERSION = 1
@@ -1110,6 +1110,32 @@ def _read_html(raw_text: str) -> str:
     return parser.get_text()
+def _docx_xml_guard(raw: bytes) -> Optional[str]:
+    """Run before EITHER docx reader on untrusted input. Returns a reason string
+    if word/document.xml is unsafe to parse, else None:
+      * decompresses past MAX_DECOMPRESSED_BYTES (zip bomb), or
+      * declares a DTD/entities -- a tiny 'billion laughs' part that passes the
+        size check but expands exponentially in the XML parser (ElementTree
+        *and* lxml/python-docx resolve internal entities). A legitimate OOXML
+        document.xml never declares one, so refusing is safe.
+    """
+    import io
+    import zipfile
+    try:
+        with zipfile.ZipFile(io.BytesIO(raw)) as z:
+            info = z.getinfo("word/document.xml")
+            if info.file_size > MAX_DECOMPRESSED_BYTES:
+                return (f"word/document.xml decompresses to {info.file_size} bytes "
+                        f"(> {MAX_DECOMPRESSED_BYTES} cap)")
+            with z.open("word/document.xml") as f:
+                head = f.read(65536)
+    except Exception:
+        return None  # not a valid zip / no document.xml -> let the readers report it
+    if re.search(rb"<!DOCTYPE|<!ENTITY", head, re.IGNORECASE):
+        return "document.xml declares a DTD/entities (XML-bomb guard)"
+    return None
 def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
     """Extract text from a .docx. Uses python-docx for higher fidelity when the
     optional [docx] extra is installed; otherwise a stdlib zipfile/XML reader
@@ -1118,6 +1144,10 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
     `prefer_optional=False` forces the stdlib reader regardless of what's
     installed -- used to pin reproducible golden fixtures."""
     warnings: List[str] = []
+    unsafe = _docx_xml_guard(raw)
+    if unsafe is not None:
+        warnings.append(f"could not parse .docx ({unsafe}); treating as empty")
+        return "", warnings
     if prefer_optional and importlib.util.find_spec("docx") is not None:
         try:
             mod = importlib.import_module("docx")
@@ -1216,14 +1246,7 @@ def _read_docx_stdlib(raw: bytes) -> str:
     w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
     with zipfile.ZipFile(io.BytesIO(raw)) as z:
-        # Zip-bomb guard: the uncompressed size is in the header, so check it
-        # before reading (don't decompress GBs into memory).
-        info = z.getinfo("word/document.xml")
-        if info.file_size > MAX_DECOMPRESSED_BYTES:
-            raise ValueError(
-                f"word/document.xml decompresses to {info.file_size} bytes "
-                f"(> {MAX_DECOMPRESSED_BYTES} cap)")
-        xml = z.read("word/document.xml")
+        xml = z.read("word/document.xml")  # size/XML-bomb already vetted by _docx_xml_guard
     root = ET.fromstring(xml)
     paras: List[str] = []
     # iter over w:p in document order (includes paragraphs inside table cells).

{extract_cli-0.1.11 → extract_cli-0.1.12}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "extract-cli"
-version = "0.1.11"
+version = "0.1.12"
 description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
 readme = "README.md"
 requires-python = ">=3.9"

{extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/employment_docx.docx.expected.json RENAMED Viewed

@@ -151,7 +151,7 @@
   ],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.11",
+    "extractor_version": "0.1.12",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/heading_docx.docx.expected.json RENAMED Viewed

@@ -140,7 +140,7 @@
   "amounts": [],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.11",
+    "extractor_version": "0.1.12",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/lease_allcaps.txt.expected.json RENAMED Viewed

@@ -146,7 +146,7 @@
   ],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.11",
+    "extractor_version": "0.1.12",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/license_pdf.pdf.expected.json RENAMED Viewed

@@ -146,7 +146,7 @@
   ],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.11",
+    "extractor_version": "0.1.12",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/nda_h2.md.expected.json RENAMED Viewed

@@ -150,7 +150,7 @@
   "amounts": [],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.11",
+    "extractor_version": "0.1.12",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/numbered_docx.docx.expected.json RENAMED Viewed

@@ -140,7 +140,7 @@
   "amounts": [],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.11",
+    "extractor_version": "0.1.12",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/scanned.pdf.expected.json RENAMED Viewed

@@ -55,7 +55,7 @@
   "amounts": [],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.11",
+    "extractor_version": "0.1.12",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/services_bold.txt.expected.json RENAMED Viewed

@@ -146,7 +146,7 @@
   ],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.11",
+    "extractor_version": "0.1.12",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/services_html.html.expected.json RENAMED Viewed

@@ -161,7 +161,7 @@
   ],
   "signatories": [],
   "_meta": {
-    "extractor_version": "0.1.11",
+    "extractor_version": "0.1.12",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_misc.py RENAMED Viewed

@@ -277,6 +277,30 @@ def test_docx_zip_bomb_guard(tmp_path: Any) -> None:
     assert any("decompress" in w for w in warnings)
+def test_docx_xml_entity_bomb_refused(tmp_path: Any) -> None:
+    # A tiny 'billion laughs' document.xml passes the size check but would expand
+    # exponentially in the XML parser; the DTD/entity guard refuses it.
+    import io
+    import zipfile
+    w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+    bomb = (
+        '<?xml version="1.0"?>\n'
+        '<!DOCTYPE r [<!ENTITY a "AAAA"><!ENTITY b "&a;&a;&a;&a;">]>\n'
+        f'<w:document xmlns:w="{w}"><w:body><w:p><w:r><w:t>&b;</w:t></w:r>'
+        '</w:p></w:body></w:document>'
+    ).encode()
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as z:
+        z.writestr("[Content_Types].xml", "<Types/>")
+        z.writestr("word/document.xml", bomb)
+    p = tmp_path / "xmlbomb.docx"
+    p.write_bytes(buf.getvalue())
+    assert p.stat().st_size < 100_000  # tiny on disk
+    raw, text, fmt, warnings = ex.load_source(p)  # default reader path
+    assert fmt == "docx" and text == ""
+    assert any("DTD/entities" in w for w in warnings)
 def test_numbered_docx_clauses() -> None:
     """A DOCX whose clauses are w:numPr list paragraphs (no heading style, no
     visible number) still yields a clause map; a deep numbered body sentence is