extract-cli 0.1.11__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. {extract_cli-0.1.11 → extract_cli-0.1.12}/CHANGELOG.md +13 -0
  2. {extract_cli-0.1.11 → extract_cli-0.1.12}/PKG-INFO +1 -1
  3. {extract_cli-0.1.11 → extract_cli-0.1.12}/extract_cli.py +33 -10
  4. {extract_cli-0.1.11 → extract_cli-0.1.12}/pyproject.toml +1 -1
  5. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/employment_docx.docx.expected.json +1 -1
  6. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/heading_docx.docx.expected.json +1 -1
  7. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  8. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  9. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/nda_h2.md.expected.json +1 -1
  10. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
  11. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/scanned.pdf.expected.json +1 -1
  12. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/services_bold.txt.expected.json +1 -1
  13. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/services_html.html.expected.json +1 -1
  14. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_misc.py +24 -0
  15. {extract_cli-0.1.11 → extract_cli-0.1.12}/.gitignore +0 -0
  16. {extract_cli-0.1.11 → extract_cli-0.1.12}/AGENTS.md +0 -0
  17. {extract_cli-0.1.11 → extract_cli-0.1.12}/ARCHITECTURE.md +0 -0
  18. {extract_cli-0.1.11 → extract_cli-0.1.12}/CONTRIBUTING.md +0 -0
  19. {extract_cli-0.1.11 → extract_cli-0.1.12}/LICENSE +0 -0
  20. {extract_cli-0.1.11 → extract_cli-0.1.12}/Makefile +0 -0
  21. {extract_cli-0.1.11 → extract_cli-0.1.12}/README.md +0 -0
  22. {extract_cli-0.1.11 → extract_cli-0.1.12}/config/llm.json.example +0 -0
  23. {extract_cli-0.1.11 → extract_cli-0.1.12}/docs/INTEROP.md +0 -0
  24. {extract_cli-0.1.11 → extract_cli-0.1.12}/docs/spec/extract-output.schema.json +0 -0
  25. {extract_cli-0.1.11 → extract_cli-0.1.12}/llms.txt +0 -0
  26. {extract_cli-0.1.11 → extract_cli-0.1.12}/scripts/release.py +0 -0
  27. {extract_cli-0.1.11 → extract_cli-0.1.12}/scripts/validate_against_spec.py +0 -0
  28. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/_fixtures_build.py +0 -0
  29. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/_make_goldens.py +0 -0
  30. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/_schema_validator.py +0 -0
  31. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/conftest.py +0 -0
  32. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/employment_docx.docx +0 -0
  33. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/heading_docx.docx +0 -0
  34. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/lease_allcaps.txt +0 -0
  35. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/license_pdf.pdf +0 -0
  36. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/nda_h2.md +0 -0
  37. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/numbered_docx.docx +0 -0
  38. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/scanned.pdf +0 -0
  39. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/services_bold.txt +0 -0
  40. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/fixtures/services_html.html +0 -0
  41. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_clause_map.py +0 -0
  42. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_cli.py +0 -0
  43. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_coverage.py +0 -0
  44. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_deterministic.py +0 -0
  45. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_llm.py +0 -0
  46. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_property.py +0 -0
  47. {extract_cli-0.1.11 → extract_cli-0.1.12}/tests/test_schema_conformance.py +0 -0
@@ -6,6 +6,18 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.12] - 2026-05-22
10
+
11
+ ### Security
12
+ - **Fixed an XML entity-expansion ("billion laughs") vulnerability in `.docx`
13
+ parsing.** The 0.1.9 resource bounds only checked *size*, but a tiny
14
+ `word/document.xml` declaring a DTD with nested entities passes the size
15
+ check and then expands exponentially in the XML parser (both ElementTree and
16
+ lxml/python-docx resolve internal entities). A new `_docx_xml_guard` runs
17
+ before either reader and refuses any `document.xml` that declares a
18
+ DTD/entities (a legitimate OOXML part never does) — degrading gracefully to
19
+ empty text with a warning. Verified on both the stdlib and `[docx]` paths.
20
+
9
21
  ## [0.1.11] - 2026-05-22
10
22
 
11
23
  Polish pass.
@@ -321,6 +333,7 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
321
333
  intentionally *not* governed by the output schema (the schema describes the
322
334
  full default output).
323
335
 
336
+ [0.1.12]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.12
324
337
  [0.1.11]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.11
325
338
  [0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
326
339
  [0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.11
3
+ Version: 0.1.12
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -43,11 +43,11 @@ import urllib.request
43
43
  from pathlib import Path
44
44
  from typing import Any, Dict, List, Optional, Tuple
45
45
 
46
- __version__ = "0.1.11"
46
+ __version__ = "0.1.12"
47
47
 
48
48
  # Bumped independently of the package version when the *extraction logic*
49
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
50
- EXTRACTOR_VERSION = "0.1.11"
50
+ EXTRACTOR_VERSION = "0.1.12"
51
51
 
52
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
53
53
  SCHEMA_VERSION = 1
@@ -1110,6 +1110,32 @@ def _read_html(raw_text: str) -> str:
1110
1110
  return parser.get_text()
1111
1111
 
1112
1112
 
1113
def _docx_xml_guard(raw: bytes) -> Optional[str]:
    """Vet word/document.xml before EITHER docx reader parses untrusted input.

    Args:
        raw: the complete bytes of the candidate .docx (a zip archive).

    Returns:
        A human-readable reason string when the part is unsafe to parse,
        otherwise None. The part is refused when it:

        * decompresses past MAX_DECOMPRESSED_BYTES (zip bomb), or
        * declares a DTD/entities anywhere in the part -- a tiny 'billion
          laughs' document passes the size check but expands exponentially in
          the XML parser. A legitimate OOXML document.xml never declares one,
          so refusing is safe.

        None is also returned when `raw` is not a readable zip or has no
        word/document.xml; the readers report that failure themselves.
    """
    import io
    import zipfile

    member = "word/document.xml"
    marker = re.compile(rb"<!DOCTYPE|<!ENTITY", re.IGNORECASE)
    # Longest marker is 9 bytes ("<!DOCTYPE"); carrying 8 bytes between chunks
    # guarantees a marker split across a chunk boundary is still matched.
    overlap = 8
    try:
        with zipfile.ZipFile(io.BytesIO(raw)) as z:
            info = z.getinfo(member)
            if info.file_size > MAX_DECOMPRESSED_BYTES:
                return (f"word/document.xml decompresses to {info.file_size} bytes "
                        f"(> {MAX_DECOMPRESSED_BYTES} cap)")
            # Scan the WHOLE part, not just a fixed-size head: the XML prolog
            # may hold arbitrary whitespace/comments/PIs before the DOCTYPE, so
            # a padded prolog would push the declaration past any fixed window.
            # The size check above bounds how much is read here.
            carry = b""
            with z.open(member) as part:
                for chunk in iter(lambda: part.read(65536), b""):
                    # Dropping NUL bytes lets the ASCII pattern also match
                    # UTF-16 / UTF-32 encoded parts, where each ASCII character
                    # is padded with NULs.
                    window = carry + chunk.replace(b"\x00", b"")
                    if marker.search(window):
                        return "document.xml declares a DTD/entities (XML-bomb guard)"
                    carry = window[-overlap:]
    except Exception:
        # Deliberate best-effort: not a valid zip / no document.xml / corrupt
        # stream -> let the readers report it.
        return None
    return None
1137
+
1138
+
1113
1139
  def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
1114
1140
  """Extract text from a .docx. Uses python-docx for higher fidelity when the
1115
1141
  optional [docx] extra is installed; otherwise a stdlib zipfile/XML reader
@@ -1118,6 +1144,10 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
1118
1144
  `prefer_optional=False` forces the stdlib reader regardless of what's
1119
1145
  installed -- used to pin reproducible golden fixtures."""
1120
1146
  warnings: List[str] = []
1147
+ unsafe = _docx_xml_guard(raw)
1148
+ if unsafe is not None:
1149
+ warnings.append(f"could not parse .docx ({unsafe}); treating as empty")
1150
+ return "", warnings
1121
1151
  if prefer_optional and importlib.util.find_spec("docx") is not None:
1122
1152
  try:
1123
1153
  mod = importlib.import_module("docx")
@@ -1216,14 +1246,7 @@ def _read_docx_stdlib(raw: bytes) -> str:
1216
1246
 
1217
1247
  w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
1218
1248
  with zipfile.ZipFile(io.BytesIO(raw)) as z:
1219
- # Zip-bomb guard: the uncompressed size is in the header, so check it
1220
- # before reading (don't decompress GBs into memory).
1221
- info = z.getinfo("word/document.xml")
1222
- if info.file_size > MAX_DECOMPRESSED_BYTES:
1223
- raise ValueError(
1224
- f"word/document.xml decompresses to {info.file_size} bytes "
1225
- f"(> {MAX_DECOMPRESSED_BYTES} cap)")
1226
- xml = z.read("word/document.xml")
1249
+ xml = z.read("word/document.xml") # size/XML-bomb already vetted by _docx_xml_guard
1227
1250
  root = ET.fromstring(xml)
1228
1251
  paras: List[str] = []
1229
1252
  # iter over w:p in document order (includes paragraphs inside table cells).
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.11"
7
+ version = "0.1.12"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -151,7 +151,7 @@
151
151
  ],
152
152
  "signatories": [],
153
153
  "_meta": {
154
- "extractor_version": "0.1.11",
154
+ "extractor_version": "0.1.12",
155
155
  "tiers_used": [
156
156
  "deterministic"
157
157
  ],
@@ -140,7 +140,7 @@
140
140
  "amounts": [],
141
141
  "signatories": [],
142
142
  "_meta": {
143
- "extractor_version": "0.1.11",
143
+ "extractor_version": "0.1.12",
144
144
  "tiers_used": [
145
145
  "deterministic"
146
146
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.11",
149
+ "extractor_version": "0.1.12",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.11",
149
+ "extractor_version": "0.1.12",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -150,7 +150,7 @@
150
150
  "amounts": [],
151
151
  "signatories": [],
152
152
  "_meta": {
153
- "extractor_version": "0.1.11",
153
+ "extractor_version": "0.1.12",
154
154
  "tiers_used": [
155
155
  "deterministic"
156
156
  ],
@@ -140,7 +140,7 @@
140
140
  "amounts": [],
141
141
  "signatories": [],
142
142
  "_meta": {
143
- "extractor_version": "0.1.11",
143
+ "extractor_version": "0.1.12",
144
144
  "tiers_used": [
145
145
  "deterministic"
146
146
  ],
@@ -55,7 +55,7 @@
55
55
  "amounts": [],
56
56
  "signatories": [],
57
57
  "_meta": {
58
- "extractor_version": "0.1.11",
58
+ "extractor_version": "0.1.12",
59
59
  "tiers_used": [
60
60
  "deterministic"
61
61
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.11",
149
+ "extractor_version": "0.1.12",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -161,7 +161,7 @@
161
161
  ],
162
162
  "signatories": [],
163
163
  "_meta": {
164
- "extractor_version": "0.1.11",
164
+ "extractor_version": "0.1.12",
165
165
  "tiers_used": [
166
166
  "deterministic"
167
167
  ],
@@ -277,6 +277,30 @@ def test_docx_zip_bomb_guard(tmp_path: Any) -> None:
277
277
  assert any("decompress" in w for w in warnings)
278
278
 
279
279
 
280
def test_docx_xml_entity_bomb_refused(tmp_path: Any) -> None:
    """A small document.xml that declares a DTD with nested entities must be
    refused: it is well under the size cap, yet would expand in the XML parser.
    The loader is expected to return empty text plus a DTD/entities warning."""
    import io
    import zipfile

    ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    payload = "".join([
        '<?xml version="1.0"?>\n',
        '<!DOCTYPE r [<!ENTITY a "AAAA"><!ENTITY b "&a;&a;&a;&a;">]>\n',
        f'<w:document xmlns:w="{ns}"><w:body><w:p><w:r><w:t>&b;</w:t></w:r>',
        '</w:p></w:body></w:document>',
    ]).encode()

    # Build a minimal .docx-shaped zip in memory.
    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w") as zf:
        zf.writestr("[Content_Types].xml", "<Types/>")
        zf.writestr("word/document.xml", payload)

    docx_path = tmp_path / "xmlbomb.docx"
    docx_path.write_bytes(archive.getvalue())
    assert docx_path.stat().st_size < 100_000  # tiny on disk

    # Default reader path (no reader forced).
    _raw, text, fmt, warnings = ex.load_source(docx_path)
    assert fmt == "docx"
    assert text == ""
    assert any("DTD/entities" in msg for msg in warnings)
302
+
303
+
280
304
  def test_numbered_docx_clauses() -> None:
281
305
  """A DOCX whose clauses are w:numPr list paragraphs (no heading style, no
282
306
  visible number) still yields a clause map; a deep numbered body sentence is
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes