extract-cli 0.1.11__tar.gz → 0.1.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.11 → extract_cli-0.1.12}/CHANGELOG.md +13 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/PKG-INFO +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.12}/extract_cli.py +33 -10
- {extract_cli-0.1.11 → extract_cli-0.1.12}/pyproject.toml +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/employment_docx.docx.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/heading_docx.docx.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/nda_h2.md.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/services_bold.txt.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/services_html.html.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_misc.py +24 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/.gitignore +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/AGENTS.md +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/ARCHITECTURE.md +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/LICENSE +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/Makefile +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/README.md +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/config/llm.json.example +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/docs/INTEROP.md +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/docs/spec/extract-output.schema.json +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/llms.txt +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/scripts/release.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/_fixtures_build.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/_make_goldens.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/conftest.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/heading_docx.docx +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/numbered_docx.docx +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_clause_map.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_cli.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_coverage.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_deterministic.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_llm.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_property.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_schema_conformance.py +0 -0
|
@@ -6,6 +6,18 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.12] - 2026-05-22
|
|
10
|
+
|
|
11
|
+
### Security
|
|
12
|
+
- **Fixed an XML entity-expansion ("billion laughs") vulnerability in `.docx`
|
|
13
|
+
parsing.** The 0.1.9 resource bounds only checked *size*, but a tiny
|
|
14
|
+
`word/document.xml` declaring a DTD with nested entities passes the size
|
|
15
|
+
check and then expands exponentially in the XML parser (both ElementTree and
|
|
16
|
+
lxml/python-docx resolve internal entities). A new `_docx_xml_guard` runs
|
|
17
|
+
before either reader and refuses any `document.xml` that declares a
|
|
18
|
+
DTD/entities (a legitimate OOXML part never does) — degrading gracefully to
|
|
19
|
+
empty text with a warning. Verified on both the stdlib and `[docx]` paths.
|
|
20
|
+
|
|
9
21
|
## [0.1.11] - 2026-05-22
|
|
10
22
|
|
|
11
23
|
Polish pass.
|
|
@@ -321,6 +333,7 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
321
333
|
intentionally *not* governed by the output schema (the schema describes the
|
|
322
334
|
full default output).
|
|
323
335
|
|
|
336
|
+
[0.1.12]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.12
|
|
324
337
|
[0.1.11]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.11
|
|
325
338
|
[0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
|
|
326
339
|
[0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.11
|
|
3
|
+
Version: 0.1.12
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -43,11 +43,11 @@ import urllib.request
|
|
|
43
43
|
from pathlib import Path
|
|
44
44
|
from typing import Any, Dict, List, Optional, Tuple
|
|
45
45
|
|
|
46
|
-
__version__ = "0.1.11"
|
|
46
|
+
__version__ = "0.1.12"
|
|
47
47
|
|
|
48
48
|
# Bumped independently of the package version when the *extraction logic*
|
|
49
49
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
50
|
-
EXTRACTOR_VERSION = "0.1.
|
|
50
|
+
EXTRACTOR_VERSION = "0.1.12"
|
|
51
51
|
|
|
52
52
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
53
53
|
SCHEMA_VERSION = 1
|
|
@@ -1110,6 +1110,32 @@ def _read_html(raw_text: str) -> str:
|
|
|
1110
1110
|
return parser.get_text()
|
|
1111
1111
|
|
|
1112
1112
|
|
|
1113
|
+
def _docx_xml_guard(raw: bytes) -> Optional[str]:
|
|
1114
|
+
"""Run before EITHER docx reader on untrusted input. Returns a reason string
|
|
1115
|
+
if word/document.xml is unsafe to parse, else None:
|
|
1116
|
+
* decompresses past MAX_DECOMPRESSED_BYTES (zip bomb), or
|
|
1117
|
+
* declares a DTD/entities -- a tiny 'billion laughs' part that passes the
|
|
1118
|
+
size check but expands exponentially in the XML parser (ElementTree
|
|
1119
|
+
*and* lxml/python-docx resolve internal entities). A legitimate OOXML
|
|
1120
|
+
document.xml never declares one, so refusing is safe.
|
|
1121
|
+
"""
|
|
1122
|
+
import io
|
|
1123
|
+
import zipfile
|
|
1124
|
+
try:
|
|
1125
|
+
with zipfile.ZipFile(io.BytesIO(raw)) as z:
|
|
1126
|
+
info = z.getinfo("word/document.xml")
|
|
1127
|
+
if info.file_size > MAX_DECOMPRESSED_BYTES:
|
|
1128
|
+
return (f"word/document.xml decompresses to {info.file_size} bytes "
|
|
1129
|
+
f"(> {MAX_DECOMPRESSED_BYTES} cap)")
|
|
1130
|
+
with z.open("word/document.xml") as f:
|
|
1131
|
+
head = f.read(65536)
|
|
1132
|
+
except Exception:
|
|
1133
|
+
return None # not a valid zip / no document.xml -> let the readers report it
|
|
1134
|
+
if re.search(rb"<!DOCTYPE|<!ENTITY", head, re.IGNORECASE):
|
|
1135
|
+
return "document.xml declares a DTD/entities (XML-bomb guard)"
|
|
1136
|
+
return None
|
|
1137
|
+
|
|
1138
|
+
|
|
1113
1139
|
def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
|
|
1114
1140
|
"""Extract text from a .docx. Uses python-docx for higher fidelity when the
|
|
1115
1141
|
optional [docx] extra is installed; otherwise a stdlib zipfile/XML reader
|
|
@@ -1118,6 +1144,10 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
|
|
|
1118
1144
|
`prefer_optional=False` forces the stdlib reader regardless of what's
|
|
1119
1145
|
installed -- used to pin reproducible golden fixtures."""
|
|
1120
1146
|
warnings: List[str] = []
|
|
1147
|
+
unsafe = _docx_xml_guard(raw)
|
|
1148
|
+
if unsafe is not None:
|
|
1149
|
+
warnings.append(f"could not parse .docx ({unsafe}); treating as empty")
|
|
1150
|
+
return "", warnings
|
|
1121
1151
|
if prefer_optional and importlib.util.find_spec("docx") is not None:
|
|
1122
1152
|
try:
|
|
1123
1153
|
mod = importlib.import_module("docx")
|
|
@@ -1216,14 +1246,7 @@ def _read_docx_stdlib(raw: bytes) -> str:
|
|
|
1216
1246
|
|
|
1217
1247
|
w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
|
|
1218
1248
|
with zipfile.ZipFile(io.BytesIO(raw)) as z:
|
|
1219
|
-
#
|
|
1220
|
-
# before reading (don't decompress GBs into memory).
|
|
1221
|
-
info = z.getinfo("word/document.xml")
|
|
1222
|
-
if info.file_size > MAX_DECOMPRESSED_BYTES:
|
|
1223
|
-
raise ValueError(
|
|
1224
|
-
f"word/document.xml decompresses to {info.file_size} bytes "
|
|
1225
|
-
f"(> {MAX_DECOMPRESSED_BYTES} cap)")
|
|
1226
|
-
xml = z.read("word/document.xml")
|
|
1249
|
+
xml = z.read("word/document.xml") # size/XML-bomb already vetted by _docx_xml_guard
|
|
1227
1250
|
root = ET.fromstring(xml)
|
|
1228
1251
|
paras: List[str] = []
|
|
1229
1252
|
# iter over w:p in document order (includes paragraphs inside table cells).
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.11"
|
|
7
|
+
version = "0.1.12"
|
|
8
8
|
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -277,6 +277,30 @@ def test_docx_zip_bomb_guard(tmp_path: Any) -> None:
|
|
|
277
277
|
assert any("decompress" in w for w in warnings)
|
|
278
278
|
|
|
279
279
|
|
|
280
|
+
def test_docx_xml_entity_bomb_refused(tmp_path: Any) -> None:
|
|
281
|
+
# A tiny 'billion laughs' document.xml passes the size check but would expand
|
|
282
|
+
# exponentially in the XML parser; the DTD/entity guard refuses it.
|
|
283
|
+
import io
|
|
284
|
+
import zipfile
|
|
285
|
+
w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
286
|
+
bomb = (
|
|
287
|
+
'<?xml version="1.0"?>\n'
|
|
288
|
+
'<!DOCTYPE r [<!ENTITY a "AAAA"><!ENTITY b "&a;&a;&a;&a;">]>\n'
|
|
289
|
+
f'<w:document xmlns:w="{w}"><w:body><w:p><w:r><w:t>&b;</w:t></w:r>'
|
|
290
|
+
'</w:p></w:body></w:document>'
|
|
291
|
+
).encode()
|
|
292
|
+
buf = io.BytesIO()
|
|
293
|
+
with zipfile.ZipFile(buf, "w") as z:
|
|
294
|
+
z.writestr("[Content_Types].xml", "<Types/>")
|
|
295
|
+
z.writestr("word/document.xml", bomb)
|
|
296
|
+
p = tmp_path / "xmlbomb.docx"
|
|
297
|
+
p.write_bytes(buf.getvalue())
|
|
298
|
+
assert p.stat().st_size < 100_000 # tiny on disk
|
|
299
|
+
raw, text, fmt, warnings = ex.load_source(p) # default reader path
|
|
300
|
+
assert fmt == "docx" and text == ""
|
|
301
|
+
assert any("DTD/entities" in w for w in warnings)
|
|
302
|
+
|
|
303
|
+
|
|
280
304
|
def test_numbered_docx_clauses() -> None:
|
|
281
305
|
"""A DOCX whose clauses are w:numPr list paragraphs (no heading style, no
|
|
282
306
|
visible number) still yields a clause map; a deep numbered body sentence is
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|