extract-cli 0.1.11__tar.gz → 0.1.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {extract_cli-0.1.11 → extract_cli-0.1.13}/CHANGELOG.md +35 -0
  2. {extract_cli-0.1.11 → extract_cli-0.1.13}/Makefile +3 -0
  3. {extract_cli-0.1.11 → extract_cli-0.1.13}/PKG-INFO +22 -2
  4. {extract_cli-0.1.11 → extract_cli-0.1.13}/README.md +21 -1
  5. {extract_cli-0.1.11 → extract_cli-0.1.13}/extract_cli.py +59 -18
  6. {extract_cli-0.1.11 → extract_cli-0.1.13}/pyproject.toml +1 -1
  7. extract_cli-0.1.13/tests/eval/ATTRIBUTION.md +20 -0
  8. extract_cli-0.1.13/tests/eval/corpus/consulting_mtm.htm +980 -0
  9. extract_cli-0.1.13/tests/eval/corpus/emp_arcp.htm +18 -0
  10. extract_cli-0.1.13/tests/eval/corpus/emp_celsci.txt +494 -0
  11. extract_cli-0.1.13/tests/eval/corpus/emp_quadgraphics.htm +1318 -0
  12. extract_cli-0.1.13/tests/eval/corpus/msa_kpmg.txt +754 -0
  13. extract_cli-0.1.13/tests/eval/corpus/services_visteon.txt +1054 -0
  14. extract_cli-0.1.13/tests/eval/evaluate.py +123 -0
  15. extract_cli-0.1.13/tests/eval/gold.json +51 -0
  16. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/employment_docx.docx.expected.json +1 -1
  17. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/heading_docx.docx.expected.json +1 -1
  18. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  19. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  20. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/nda_h2.md.expected.json +1 -1
  21. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
  22. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/scanned.pdf.expected.json +1 -1
  23. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/services_bold.txt.expected.json +1 -1
  24. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/services_html.html.expected.json +1 -1
  25. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_deterministic.py +17 -0
  26. extract_cli-0.1.13/tests/test_eval.py +26 -0
  27. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_misc.py +24 -0
  28. {extract_cli-0.1.11 → extract_cli-0.1.13}/.gitignore +0 -0
  29. {extract_cli-0.1.11 → extract_cli-0.1.13}/AGENTS.md +0 -0
  30. {extract_cli-0.1.11 → extract_cli-0.1.13}/ARCHITECTURE.md +0 -0
  31. {extract_cli-0.1.11 → extract_cli-0.1.13}/CONTRIBUTING.md +0 -0
  32. {extract_cli-0.1.11 → extract_cli-0.1.13}/LICENSE +0 -0
  33. {extract_cli-0.1.11 → extract_cli-0.1.13}/config/llm.json.example +0 -0
  34. {extract_cli-0.1.11 → extract_cli-0.1.13}/docs/INTEROP.md +0 -0
  35. {extract_cli-0.1.11 → extract_cli-0.1.13}/docs/spec/extract-output.schema.json +0 -0
  36. {extract_cli-0.1.11 → extract_cli-0.1.13}/llms.txt +0 -0
  37. {extract_cli-0.1.11 → extract_cli-0.1.13}/scripts/release.py +0 -0
  38. {extract_cli-0.1.11 → extract_cli-0.1.13}/scripts/validate_against_spec.py +0 -0
  39. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/_fixtures_build.py +0 -0
  40. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/_make_goldens.py +0 -0
  41. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/_schema_validator.py +0 -0
  42. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/conftest.py +0 -0
  43. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/employment_docx.docx +0 -0
  44. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/heading_docx.docx +0 -0
  45. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/lease_allcaps.txt +0 -0
  46. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/license_pdf.pdf +0 -0
  47. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/nda_h2.md +0 -0
  48. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/numbered_docx.docx +0 -0
  49. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/scanned.pdf +0 -0
  50. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/services_bold.txt +0 -0
  51. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/services_html.html +0 -0
  52. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_clause_map.py +0 -0
  53. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_cli.py +0 -0
  54. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_coverage.py +0 -0
  55. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_llm.py +0 -0
  56. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_property.py +0 -0
  57. {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_schema_conformance.py +0 -0
@@ -6,6 +6,39 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.13] - 2026-05-22
10
+
11
+ ### Added
12
+ - **Accuracy benchmark** (`tests/eval/`, `make eval`). Scores the deterministic
13
+ tier against a small corpus of real, executed SEC-EDGAR contracts with
14
+ hand-verified ground truth, reporting precision/recall/F1 per field — turning
15
+ "best-effort" into a measured number. Current: parties F1 0.96, effective
16
+ date / governing law / jurisdiction 1.00, clause recall 0.45 (heading
17
+ detection on dense HTML is the known weak spot). `tests/test_eval.py` gates it
18
+ so accuracy can't silently regress.
19
+
20
+ ### Fixed / improved (surfaced by the benchmark)
21
+ - **Governing-law detection** now covers the common connector phrasings beyond
22
+ "governed by the laws of X": "governed by, **and enforced in accordance
23
+ with,** the laws of X", "**interpreted and enforced in accordance with** the
24
+ laws of X", "**construed under** the laws of X". (Benchmark: governing law
25
+ 0.67 → 1.00.)
26
+ - **Jurisdiction normalization** now maps **all 50 US states + DC** (plus more
27
+ Canadian provinces / UK nations / countries), not just a dozen. (Benchmark:
28
+ jurisdiction 0.67 → 1.00.)
29
+
30
+ ## [0.1.12] - 2026-05-22
31
+
32
+ ### Security
33
+ - **Fixed an XML entity-expansion ("billion laughs") vulnerability in `.docx`
34
+ parsing.** The 0.1.9 resource bounds only checked *size*, but a tiny
35
+ `word/document.xml` declaring a DTD with nested entities passes the size
36
+ check and then expands exponentially in the XML parser (both ElementTree and
37
+ lxml/python-docx resolve internal entities). A new `_docx_xml_guard` runs
38
+ before either reader and refuses any `document.xml` that declares a
39
+ DTD/entities (a legitimate OOXML part never does) — degrading gracefully to
40
+ empty text with a warning. Verified on both the stdlib and `[docx]` paths.
41
+
9
42
  ## [0.1.11] - 2026-05-22
10
43
 
11
44
  Polish pass.
@@ -321,6 +354,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
321
354
  intentionally *not* governed by the output schema (the schema describes the
322
355
  full default output).
323
356
 
357
+ [0.1.13]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.13
358
+ [0.1.12]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.12
324
359
  [0.1.11]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.11
325
360
  [0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
326
361
  [0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
@@ -40,6 +40,9 @@ coverage:
40
40
  typecheck:
41
41
  $(PYTHON) -m mypy --strict extract_cli.py
42
42
 
43
+ eval:
44
+ $(PYTHON) tests/eval/evaluate.py
45
+
43
46
  build: clean
44
47
  $(PYTHON) -m build
45
48
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.11
3
+ Version: 0.1.13
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -256,13 +256,33 @@ paths. Configure it once and every suite tool that adopts the same lookup gets
256
256
  LLM features for free. Without it, `--llm` just warns and returns the
257
257
  deterministic output.
258
258
 
259
+ ## Accuracy
260
+
261
+ Line coverage tells you the code runs; it doesn't tell you the extraction is
262
+ *correct*. `make eval` scores the deterministic tier against a small corpus of
263
+ **real, executed contracts** (SEC EDGAR filings) with hand-verified ground truth
264
+ ([`tests/eval/`](tests/eval/)), reporting precision/recall per field:
265
+
266
+ | Field | Score |
267
+ |---|---|
268
+ | parties | P 1.00 · R 0.92 · F1 0.96 |
269
+ | effective date | accuracy 1.00 |
270
+ | governing law | accuracy 1.00 |
271
+ | jurisdiction (normalized) | accuracy 1.00 |
272
+ | clauses (recall on verified sections) | 0.45 |
273
+
274
+ Clause recall is the honest weak spot — heading detection on dense HTML
275
+ exhibits still misses sections. A test (`tests/test_eval.py`) gates these so
276
+ accuracy can't silently regress.
277
+
259
278
  ## Development
260
279
 
261
280
  ```bash
262
281
  make install # editable install with the [dev] extra
263
282
  make test # full suite
264
- make coverage # suite + coverage report
283
+ make coverage # suite + coverage report (installs extras; fails under 100%)
265
284
  make typecheck # mypy --strict
285
+ make eval # accuracy benchmark vs the labeled corpus
266
286
  make build # wheel + sdist
267
287
  make smoke # build, install the wheel in a clean venv, run it
268
288
  make spec-check # assert docs/spec schema == `extract schema`
@@ -218,13 +218,33 @@ paths. Configure it once and every suite tool that adopts the same lookup gets
218
218
  LLM features for free. Without it, `--llm` just warns and returns the
219
219
  deterministic output.
220
220
 
221
+ ## Accuracy
222
+
223
+ Line coverage tells you the code runs; it doesn't tell you the extraction is
224
+ *correct*. `make eval` scores the deterministic tier against a small corpus of
225
+ **real, executed contracts** (SEC EDGAR filings) with hand-verified ground truth
226
+ ([`tests/eval/`](tests/eval/)), reporting precision/recall per field:
227
+
228
+ | Field | Score |
229
+ |---|---|
230
+ | parties | P 1.00 · R 0.92 · F1 0.96 |
231
+ | effective date | accuracy 1.00 |
232
+ | governing law | accuracy 1.00 |
233
+ | jurisdiction (normalized) | accuracy 1.00 |
234
+ | clauses (recall on verified sections) | 0.45 |
235
+
236
+ Clause recall is the honest weak spot — heading detection on dense HTML
237
+ exhibits still misses sections. A test (`tests/test_eval.py`) gates these so
238
+ accuracy can't silently regress.
239
+
221
240
  ## Development
222
241
 
223
242
  ```bash
224
243
  make install # editable install with the [dev] extra
225
244
  make test # full suite
226
- make coverage # suite + coverage report
245
+ make coverage # suite + coverage report (installs extras; fails under 100%)
227
246
  make typecheck # mypy --strict
247
+ make eval # accuracy benchmark vs the labeled corpus
228
248
  make build # wheel + sdist
229
249
  make smoke # build, install the wheel in a clean venv, run it
230
250
  make spec-check # assert docs/spec schema == `extract schema`
@@ -43,11 +43,11 @@ import urllib.request
43
43
  from pathlib import Path
44
44
  from typing import Any, Dict, List, Optional, Tuple
45
45
 
46
- __version__ = "0.1.11"
46
+ __version__ = "0.1.13"
47
47
 
48
48
  # Bumped independently of the package version when the *extraction logic*
49
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
50
- EXTRACTOR_VERSION = "0.1.11"
50
+ EXTRACTOR_VERSION = "0.1.13"
51
51
 
52
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
53
53
  SCHEMA_VERSION = 1
@@ -616,8 +616,11 @@ _ROLE_PAREN_RE = re.compile(
616
616
  # enforces a capitalized proper noun (a global re.IGNORECASE would defeat that
617
617
  # and over-capture trailing lowercase clauses like ", without regard to ...").
618
618
  _GOV_LAW_RE = re.compile(
619
- r"(?i:governed\s+by(?:\s+and\s+construed\s+in\s+accordance\s+with)?\s+"
620
- r"(?:the\s+)?laws?\s+of\s+(?:the\s+)?)"
619
+ # Allow a short same-sentence gap between "governed by" and "laws of" so the
620
+ # many real connector phrasings are covered: "...and construed in accordance
621
+ # with...", "...and enforced in accordance with...", "the internal laws of",
622
+ # etc. (bounded + lazy so it stays within the clause).
623
+ r"(?i:(?:governed|construed|interpreted|enforced)\b[^.\n]{0,60}?\blaws?\s+of\s+(?:the\s+)?)"
621
624
  r"([A-Z][A-Za-z\.\- ]+?(?:,\s*[A-Z][A-Za-z\.\- ]+?)?)"
622
625
  r"(?=[\.,;\n)]|\s+and\b|\s+without\b|$)",
623
626
  )
@@ -889,16 +892,31 @@ def extract_signatories(text: str) -> List[JSON]:
889
892
  return out
890
893
 
891
894
 
892
- # Free-text jurisdiction -> a normalized ISO-ish code (best-effort, common only).
895
+ # Free-text jurisdiction -> a normalized ISO 3166-2 / ISO 3166-1 code. All 50 US
896
+ # states + DC, common Canadian provinces, UK nations, and frequent countries.
897
+ _US_STATES: Dict[str, str] = {
898
+ "alabama": "AL", "alaska": "AK", "arizona": "AZ", "arkansas": "AR",
899
+ "california": "CA", "colorado": "CO", "connecticut": "CT", "delaware": "DE",
900
+ "florida": "FL", "georgia": "GA", "hawaii": "HI", "idaho": "ID",
901
+ "illinois": "IL", "indiana": "IN", "iowa": "IA", "kansas": "KS",
902
+ "kentucky": "KY", "louisiana": "LA", "maine": "ME", "maryland": "MD",
903
+ "massachusetts": "MA", "michigan": "MI", "minnesota": "MN", "mississippi": "MS",
904
+ "missouri": "MO", "montana": "MT", "nebraska": "NE", "nevada": "NV",
905
+ "new hampshire": "NH", "new jersey": "NJ", "new mexico": "NM", "new york": "NY",
906
+ "north carolina": "NC", "north dakota": "ND", "ohio": "OH", "oklahoma": "OK",
907
+ "oregon": "OR", "pennsylvania": "PA", "rhode island": "RI", "south carolina": "SC",
908
+ "south dakota": "SD", "tennessee": "TN", "texas": "TX", "utah": "UT",
909
+ "vermont": "VT", "virginia": "VA", "washington": "WA", "west virginia": "WV",
910
+ "wisconsin": "WI", "wyoming": "WY", "district of columbia": "DC",
911
+ }
893
912
  _JURISDICTION_CODES: Dict[str, str] = {
894
- "delaware": "US-DE", "new york": "US-NY", "california": "US-CA",
895
- "texas": "US-TX", "illinois": "US-IL", "massachusetts": "US-MA",
896
- "washington": "US-WA", "florida": "US-FL", "nevada": "US-NV",
897
- "new jersey": "US-NJ", "pennsylvania": "US-PA", "michigan": "US-MI",
913
+ **{name: f"US-{code}" for name, code in _US_STATES.items()},
898
914
  "ontario": "CA-ON", "quebec": "CA-QC", "british columbia": "CA-BC",
899
- "england and wales": "GB-EAW", "england": "GB-ENG", "scotland": "GB-SCT",
915
+ "alberta": "CA-AB", "england and wales": "GB-EAW", "england": "GB-ENG",
916
+ "scotland": "GB-SCT", "wales": "GB-WLS", "northern ireland": "GB-NIR",
900
917
  "united kingdom": "GB", "france": "FR", "germany": "DE", "ireland": "IE",
901
918
  "singapore": "SG", "australia": "AU", "india": "IN", "netherlands": "NL",
919
+ "switzerland": "CH", "japan": "JP",
902
920
  }
903
921
 
904
922
 
@@ -1110,6 +1128,32 @@ def _read_html(raw_text: str) -> str:
1110
1128
  return parser.get_text()
1111
1129
 
1112
1130
 
1131
+ def _docx_xml_guard(raw: bytes) -> Optional[str]:
1132
+ """Run before EITHER docx reader on untrusted input. Returns a reason string
1133
+ if word/document.xml is unsafe to parse, else None:
1134
+ * decompresses past MAX_DECOMPRESSED_BYTES (zip bomb), or
1135
+ * declares a DTD/entities -- a tiny 'billion laughs' part that passes the
1136
+ size check but expands exponentially in the XML parser (ElementTree
1137
+ *and* lxml/python-docx resolve internal entities). A legitimate OOXML
1138
+ document.xml never declares one, so refusing is safe.
1139
+ """
1140
+ import io
1141
+ import zipfile
1142
+ try:
1143
+ with zipfile.ZipFile(io.BytesIO(raw)) as z:
1144
+ info = z.getinfo("word/document.xml")
1145
+ if info.file_size > MAX_DECOMPRESSED_BYTES:
1146
+ return (f"word/document.xml decompresses to {info.file_size} bytes "
1147
+ f"(> {MAX_DECOMPRESSED_BYTES} cap)")
1148
+ with z.open("word/document.xml") as f:
1149
+ head = f.read(65536)
1150
+ except Exception:
1151
+ return None # not a valid zip / no document.xml -> let the readers report it
1152
+ if re.search(rb"<!DOCTYPE|<!ENTITY", head, re.IGNORECASE):
1153
+ return "document.xml declares a DTD/entities (XML-bomb guard)"
1154
+ return None
1155
+
1156
+
1113
1157
  def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
1114
1158
  """Extract text from a .docx. Uses python-docx for higher fidelity when the
1115
1159
  optional [docx] extra is installed; otherwise a stdlib zipfile/XML reader
@@ -1118,6 +1162,10 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
1118
1162
  `prefer_optional=False` forces the stdlib reader regardless of what's
1119
1163
  installed -- used to pin reproducible golden fixtures."""
1120
1164
  warnings: List[str] = []
1165
+ unsafe = _docx_xml_guard(raw)
1166
+ if unsafe is not None:
1167
+ warnings.append(f"could not parse .docx ({unsafe}); treating as empty")
1168
+ return "", warnings
1121
1169
  if prefer_optional and importlib.util.find_spec("docx") is not None:
1122
1170
  try:
1123
1171
  mod = importlib.import_module("docx")
@@ -1216,14 +1264,7 @@ def _read_docx_stdlib(raw: bytes) -> str:
1216
1264
 
1217
1265
  w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
1218
1266
  with zipfile.ZipFile(io.BytesIO(raw)) as z:
1219
- # Zip-bomb guard: the uncompressed size is in the header, so check it
1220
- # before reading (don't decompress GBs into memory).
1221
- info = z.getinfo("word/document.xml")
1222
- if info.file_size > MAX_DECOMPRESSED_BYTES:
1223
- raise ValueError(
1224
- f"word/document.xml decompresses to {info.file_size} bytes "
1225
- f"(> {MAX_DECOMPRESSED_BYTES} cap)")
1226
- xml = z.read("word/document.xml")
1267
+ xml = z.read("word/document.xml") # size/XML-bomb already vetted by _docx_xml_guard
1227
1268
  root = ET.fromstring(xml)
1228
1269
  paras: List[str] = []
1229
1270
  # iter over w:p in document order (includes paragraphs inside table cells).
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.11"
7
+ version = "0.1.13"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -0,0 +1,20 @@
1
+ # Benchmark corpus — sources & licensing
2
+
3
+ The accuracy benchmark (`tests/eval/`) scores extract-cli against a small set of
4
+ **real, executed contracts** filed publicly with the U.S. Securities and
5
+ Exchange Commission (SEC EDGAR). SEC filings are public records; these exhibits
6
+ are reproduced here, unmodified, solely as a regression/accuracy test fixture.
7
+
8
+ | File | Source (SEC EDGAR) |
9
+ |---|---|
10
+ | `emp_celsci.txt` | CEL-SCI Corporation — Exhibit 10(ooo), employment agreement |
11
+ | `msa_kpmg.txt` | Blade Internet Ventures / KPMG Consulting — master services agreement |
12
+ | `services_visteon.txt` | Visteon Corporation — salaried employee lease agreement |
13
+ | `consulting_mtm.htm` | MTM Technologies — consulting agreement |
14
+ | `emp_arcp.htm` | American Realty Capital Properties — employment agreement |
15
+ | `emp_quadgraphics.htm` | Quad/Graphics, Inc. — employment agreement |
16
+
17
+ Ground truth (`gold.json`) was hand-verified against each document's text — the
18
+ parties, effective date, governing law, normalized jurisdiction, and a
19
+ verified subset of section headings. It is intentionally independent of what the
20
+ extractor currently produces.