extract-cli 0.1.11__tar.gz → 0.1.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.11 → extract_cli-0.1.13}/CHANGELOG.md +35 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/Makefile +3 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/PKG-INFO +22 -2
- {extract_cli-0.1.11 → extract_cli-0.1.13}/README.md +21 -1
- {extract_cli-0.1.11 → extract_cli-0.1.13}/extract_cli.py +59 -18
- {extract_cli-0.1.11 → extract_cli-0.1.13}/pyproject.toml +1 -1
- extract_cli-0.1.13/tests/eval/ATTRIBUTION.md +20 -0
- extract_cli-0.1.13/tests/eval/corpus/consulting_mtm.htm +980 -0
- extract_cli-0.1.13/tests/eval/corpus/emp_arcp.htm +18 -0
- extract_cli-0.1.13/tests/eval/corpus/emp_celsci.txt +494 -0
- extract_cli-0.1.13/tests/eval/corpus/emp_quadgraphics.htm +1318 -0
- extract_cli-0.1.13/tests/eval/corpus/msa_kpmg.txt +754 -0
- extract_cli-0.1.13/tests/eval/corpus/services_visteon.txt +1054 -0
- extract_cli-0.1.13/tests/eval/evaluate.py +123 -0
- extract_cli-0.1.13/tests/eval/gold.json +51 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/employment_docx.docx.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/heading_docx.docx.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/nda_h2.md.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/services_bold.txt.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/services_html.html.expected.json +1 -1
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_deterministic.py +17 -0
- extract_cli-0.1.13/tests/test_eval.py +26 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_misc.py +24 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/.gitignore +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/AGENTS.md +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/ARCHITECTURE.md +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/LICENSE +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/config/llm.json.example +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/docs/INTEROP.md +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/docs/spec/extract-output.schema.json +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/llms.txt +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/scripts/release.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/_fixtures_build.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/_make_goldens.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/conftest.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/heading_docx.docx +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/numbered_docx.docx +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_clause_map.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_cli.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_coverage.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_llm.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_property.py +0 -0
- {extract_cli-0.1.11 → extract_cli-0.1.13}/tests/test_schema_conformance.py +0 -0
|
@@ -6,6 +6,39 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.13] - 2026-05-22
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- **Accuracy benchmark** (`tests/eval/`, `make eval`). Scores the deterministic
|
|
13
|
+
tier against a small corpus of real, executed SEC-EDGAR contracts with
|
|
14
|
+
hand-verified ground truth, reporting precision/recall/F1 per field — turning
|
|
15
|
+
"best-effort" into a measured number. Current: parties F1 0.96, effective
|
|
16
|
+
date / governing law / jurisdiction 1.00, clause recall 0.45 (heading
|
|
17
|
+
detection on dense HTML is the known weak spot). `tests/test_eval.py` gates it
|
|
18
|
+
so accuracy can't silently regress.
|
|
19
|
+
|
|
20
|
+
### Fixed / improved (surfaced by the benchmark)
|
|
21
|
+
- **Governing-law detection** now covers the common connector phrasings beyond
|
|
22
|
+
"governed by the laws of X": "governed by, **and enforced in accordance
|
|
23
|
+
with,** the laws of X", "**interpreted and enforced in accordance with** the
|
|
24
|
+
laws of X", "**construed under** the laws of X". (Benchmark: governing law
|
|
25
|
+
0.67 → 1.00.)
|
|
26
|
+
- **Jurisdiction normalization** now maps **all 50 US states + DC** (plus more
|
|
27
|
+
Canadian provinces / UK nations / countries), not just a dozen. (Benchmark:
|
|
28
|
+
jurisdiction 0.67 → 1.00.)
|
|
29
|
+
|
|
30
|
+
## [0.1.12] - 2026-05-22
|
|
31
|
+
|
|
32
|
+
### Security
|
|
33
|
+
- **Fixed an XML entity-expansion ("billion laughs") vulnerability in `.docx`
|
|
34
|
+
parsing.** The 0.1.9 resource bounds only checked *size*, but a tiny
|
|
35
|
+
`word/document.xml` declaring a DTD with nested entities passes the size
|
|
36
|
+
check and then expands exponentially in the XML parser (both ElementTree and
|
|
37
|
+
lxml/python-docx resolve internal entities). A new `_docx_xml_guard` runs
|
|
38
|
+
before either reader and refuses any `document.xml` that declares a
|
|
39
|
+
DTD/entities (a legitimate OOXML part never does) — degrading gracefully to
|
|
40
|
+
empty text with a warning. Verified on both the stdlib and `[docx]` paths.
|
|
41
|
+
|
|
9
42
|
## [0.1.11] - 2026-05-22
|
|
10
43
|
|
|
11
44
|
Polish pass.
|
|
@@ -321,6 +354,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
321
354
|
intentionally *not* governed by the output schema (the schema describes the
|
|
322
355
|
full default output).
|
|
323
356
|
|
|
357
|
+
[0.1.13]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.13
|
|
358
|
+
[0.1.12]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.12
|
|
324
359
|
[0.1.11]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.11
|
|
325
360
|
[0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
|
|
326
361
|
[0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.11
|
|
3
|
+
Version: 0.1.13
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -256,13 +256,33 @@ paths. Configure it once and every suite tool that adopts the same lookup gets
|
|
|
256
256
|
LLM features for free. Without it, `--llm` just warns and returns the
|
|
257
257
|
deterministic output.
|
|
258
258
|
|
|
259
|
+
## Accuracy
|
|
260
|
+
|
|
261
|
+
Line coverage tells you the code runs; it doesn't tell you the extraction is
|
|
262
|
+
*correct*. `make eval` scores the deterministic tier against a small corpus of
|
|
263
|
+
**real, executed contracts** (SEC EDGAR filings) with hand-verified ground truth
|
|
264
|
+
([`tests/eval/`](tests/eval/)), reporting precision/recall per field:
|
|
265
|
+
|
|
266
|
+
| Field | Score |
|
|
267
|
+
|---|---|
|
|
268
|
+
| parties | P 1.00 · R 0.92 · F1 0.96 |
|
|
269
|
+
| effective date | accuracy 1.00 |
|
|
270
|
+
| governing law | accuracy 1.00 |
|
|
271
|
+
| jurisdiction (normalized) | accuracy 1.00 |
|
|
272
|
+
| clauses (recall on verified sections) | 0.45 |
|
|
273
|
+
|
|
274
|
+
Clause recall is the honest weak spot — heading detection on dense HTML
|
|
275
|
+
exhibits still misses sections. A test (`tests/test_eval.py`) gates these so
|
|
276
|
+
accuracy can't silently regress.
|
|
277
|
+
|
|
259
278
|
## Development
|
|
260
279
|
|
|
261
280
|
```bash
|
|
262
281
|
make install # editable install with the [dev] extra
|
|
263
282
|
make test # full suite
|
|
264
|
-
make coverage # suite + coverage report
|
|
283
|
+
make coverage # suite + coverage report (installs extras; fails under 100%)
|
|
265
284
|
make typecheck # mypy --strict
|
|
285
|
+
make eval # accuracy benchmark vs the labeled corpus
|
|
266
286
|
make build # wheel + sdist
|
|
267
287
|
make smoke # build, install the wheel in a clean venv, run it
|
|
268
288
|
make spec-check # assert docs/spec schema == `extract schema`
|
|
@@ -218,13 +218,33 @@ paths. Configure it once and every suite tool that adopts the same lookup gets
|
|
|
218
218
|
LLM features for free. Without it, `--llm` just warns and returns the
|
|
219
219
|
deterministic output.
|
|
220
220
|
|
|
221
|
+
## Accuracy
|
|
222
|
+
|
|
223
|
+
Line coverage tells you the code runs; it doesn't tell you the extraction is
|
|
224
|
+
*correct*. `make eval` scores the deterministic tier against a small corpus of
|
|
225
|
+
**real, executed contracts** (SEC EDGAR filings) with hand-verified ground truth
|
|
226
|
+
([`tests/eval/`](tests/eval/)), reporting precision/recall per field:
|
|
227
|
+
|
|
228
|
+
| Field | Score |
|
|
229
|
+
|---|---|
|
|
230
|
+
| parties | P 1.00 · R 0.92 · F1 0.96 |
|
|
231
|
+
| effective date | accuracy 1.00 |
|
|
232
|
+
| governing law | accuracy 1.00 |
|
|
233
|
+
| jurisdiction (normalized) | accuracy 1.00 |
|
|
234
|
+
| clauses (recall on verified sections) | 0.45 |
|
|
235
|
+
|
|
236
|
+
Clause recall is the honest weak spot — heading detection on dense HTML
|
|
237
|
+
exhibits still misses sections. A test (`tests/test_eval.py`) gates these so
|
|
238
|
+
accuracy can't silently regress.
|
|
239
|
+
|
|
221
240
|
## Development
|
|
222
241
|
|
|
223
242
|
```bash
|
|
224
243
|
make install # editable install with the [dev] extra
|
|
225
244
|
make test # full suite
|
|
226
|
-
make coverage # suite + coverage report
|
|
245
|
+
make coverage # suite + coverage report (installs extras; fails under 100%)
|
|
227
246
|
make typecheck # mypy --strict
|
|
247
|
+
make eval # accuracy benchmark vs the labeled corpus
|
|
228
248
|
make build # wheel + sdist
|
|
229
249
|
make smoke # build, install the wheel in a clean venv, run it
|
|
230
250
|
make spec-check # assert docs/spec schema == `extract schema`
|
|
@@ -43,11 +43,11 @@ import urllib.request
|
|
|
43
43
|
from pathlib import Path
|
|
44
44
|
from typing import Any, Dict, List, Optional, Tuple
|
|
45
45
|
|
|
46
|
-
__version__ = "0.1.11"
|
|
46
|
+
__version__ = "0.1.13"
|
|
47
47
|
|
|
48
48
|
# Bumped independently of the package version when the *extraction logic*
|
|
49
49
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
50
|
-
EXTRACTOR_VERSION = "0.1.
|
|
50
|
+
EXTRACTOR_VERSION = "0.1.13"
|
|
51
51
|
|
|
52
52
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
53
53
|
SCHEMA_VERSION = 1
|
|
@@ -616,8 +616,11 @@ _ROLE_PAREN_RE = re.compile(
|
|
|
616
616
|
# enforces a capitalized proper noun (a global re.IGNORECASE would defeat that
|
|
617
617
|
# and over-capture trailing lowercase clauses like ", without regard to ...").
|
|
618
618
|
_GOV_LAW_RE = re.compile(
|
|
619
|
-
|
|
620
|
-
|
|
619
|
+
# Allow a short same-sentence gap between "governed by" and "laws of" so the
|
|
620
|
+
# many real connector phrasings are covered: "...and construed in accordance
|
|
621
|
+
# with...", "...and enforced in accordance with...", "the internal laws of",
|
|
622
|
+
# etc. (bounded + lazy so it stays within the clause).
|
|
623
|
+
r"(?i:(?:governed|construed|interpreted|enforced)\b[^.\n]{0,60}?\blaws?\s+of\s+(?:the\s+)?)"
|
|
621
624
|
r"([A-Z][A-Za-z\.\- ]+?(?:,\s*[A-Z][A-Za-z\.\- ]+?)?)"
|
|
622
625
|
r"(?=[\.,;\n)]|\s+and\b|\s+without\b|$)",
|
|
623
626
|
)
|
|
@@ -889,16 +892,31 @@ def extract_signatories(text: str) -> List[JSON]:
|
|
|
889
892
|
return out
|
|
890
893
|
|
|
891
894
|
|
|
892
|
-
# Free-text jurisdiction -> a normalized ISO-
|
|
895
|
+
# Free-text jurisdiction -> a normalized ISO 3166-2 / ISO 3166-1 code. All 50 US
|
|
896
|
+
# states + DC, common Canadian provinces, UK nations, and frequent countries.
|
|
897
|
+
_US_STATES: Dict[str, str] = {
|
|
898
|
+
"alabama": "AL", "alaska": "AK", "arizona": "AZ", "arkansas": "AR",
|
|
899
|
+
"california": "CA", "colorado": "CO", "connecticut": "CT", "delaware": "DE",
|
|
900
|
+
"florida": "FL", "georgia": "GA", "hawaii": "HI", "idaho": "ID",
|
|
901
|
+
"illinois": "IL", "indiana": "IN", "iowa": "IA", "kansas": "KS",
|
|
902
|
+
"kentucky": "KY", "louisiana": "LA", "maine": "ME", "maryland": "MD",
|
|
903
|
+
"massachusetts": "MA", "michigan": "MI", "minnesota": "MN", "mississippi": "MS",
|
|
904
|
+
"missouri": "MO", "montana": "MT", "nebraska": "NE", "nevada": "NV",
|
|
905
|
+
"new hampshire": "NH", "new jersey": "NJ", "new mexico": "NM", "new york": "NY",
|
|
906
|
+
"north carolina": "NC", "north dakota": "ND", "ohio": "OH", "oklahoma": "OK",
|
|
907
|
+
"oregon": "OR", "pennsylvania": "PA", "rhode island": "RI", "south carolina": "SC",
|
|
908
|
+
"south dakota": "SD", "tennessee": "TN", "texas": "TX", "utah": "UT",
|
|
909
|
+
"vermont": "VT", "virginia": "VA", "washington": "WA", "west virginia": "WV",
|
|
910
|
+
"wisconsin": "WI", "wyoming": "WY", "district of columbia": "DC",
|
|
911
|
+
}
|
|
893
912
|
_JURISDICTION_CODES: Dict[str, str] = {
|
|
894
|
-
|
|
895
|
-
"texas": "US-TX", "illinois": "US-IL", "massachusetts": "US-MA",
|
|
896
|
-
"washington": "US-WA", "florida": "US-FL", "nevada": "US-NV",
|
|
897
|
-
"new jersey": "US-NJ", "pennsylvania": "US-PA", "michigan": "US-MI",
|
|
913
|
+
**{name: f"US-{code}" for name, code in _US_STATES.items()},
|
|
898
914
|
"ontario": "CA-ON", "quebec": "CA-QC", "british columbia": "CA-BC",
|
|
899
|
-
"
|
|
915
|
+
"alberta": "CA-AB", "england and wales": "GB-EAW", "england": "GB-ENG",
|
|
916
|
+
"scotland": "GB-SCT", "wales": "GB-WLS", "northern ireland": "GB-NIR",
|
|
900
917
|
"united kingdom": "GB", "france": "FR", "germany": "DE", "ireland": "IE",
|
|
901
918
|
"singapore": "SG", "australia": "AU", "india": "IN", "netherlands": "NL",
|
|
919
|
+
"switzerland": "CH", "japan": "JP",
|
|
902
920
|
}
|
|
903
921
|
|
|
904
922
|
|
|
@@ -1110,6 +1128,32 @@ def _read_html(raw_text: str) -> str:
|
|
|
1110
1128
|
return parser.get_text()
|
|
1111
1129
|
|
|
1112
1130
|
|
|
1131
|
+
def _docx_xml_guard(raw: bytes) -> Optional[str]:
|
|
1132
|
+
"""Run before EITHER docx reader on untrusted input. Returns a reason string
|
|
1133
|
+
if word/document.xml is unsafe to parse, else None:
|
|
1134
|
+
* decompresses past MAX_DECOMPRESSED_BYTES (zip bomb), or
|
|
1135
|
+
* declares a DTD/entities -- a tiny 'billion laughs' part that passes the
|
|
1136
|
+
size check but expands exponentially in the XML parser (ElementTree
|
|
1137
|
+
*and* lxml/python-docx resolve internal entities). A legitimate OOXML
|
|
1138
|
+
document.xml never declares one, so refusing is safe.
|
|
1139
|
+
"""
|
|
1140
|
+
import io
|
|
1141
|
+
import zipfile
|
|
1142
|
+
try:
|
|
1143
|
+
with zipfile.ZipFile(io.BytesIO(raw)) as z:
|
|
1144
|
+
info = z.getinfo("word/document.xml")
|
|
1145
|
+
if info.file_size > MAX_DECOMPRESSED_BYTES:
|
|
1146
|
+
return (f"word/document.xml decompresses to {info.file_size} bytes "
|
|
1147
|
+
f"(> {MAX_DECOMPRESSED_BYTES} cap)")
|
|
1148
|
+
with z.open("word/document.xml") as f:
|
|
1149
|
+
head = f.read(65536)
|
|
1150
|
+
except Exception:
|
|
1151
|
+
return None # not a valid zip / no document.xml -> let the readers report it
|
|
1152
|
+
if re.search(rb"<!DOCTYPE|<!ENTITY", head, re.IGNORECASE):
|
|
1153
|
+
return "document.xml declares a DTD/entities (XML-bomb guard)"
|
|
1154
|
+
return None
|
|
1155
|
+
|
|
1156
|
+
|
|
1113
1157
|
def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
|
|
1114
1158
|
"""Extract text from a .docx. Uses python-docx for higher fidelity when the
|
|
1115
1159
|
optional [docx] extra is installed; otherwise a stdlib zipfile/XML reader
|
|
@@ -1118,6 +1162,10 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
|
|
|
1118
1162
|
`prefer_optional=False` forces the stdlib reader regardless of what's
|
|
1119
1163
|
installed -- used to pin reproducible golden fixtures."""
|
|
1120
1164
|
warnings: List[str] = []
|
|
1165
|
+
unsafe = _docx_xml_guard(raw)
|
|
1166
|
+
if unsafe is not None:
|
|
1167
|
+
warnings.append(f"could not parse .docx ({unsafe}); treating as empty")
|
|
1168
|
+
return "", warnings
|
|
1121
1169
|
if prefer_optional and importlib.util.find_spec("docx") is not None:
|
|
1122
1170
|
try:
|
|
1123
1171
|
mod = importlib.import_module("docx")
|
|
@@ -1216,14 +1264,7 @@ def _read_docx_stdlib(raw: bytes) -> str:
|
|
|
1216
1264
|
|
|
1217
1265
|
w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
|
|
1218
1266
|
with zipfile.ZipFile(io.BytesIO(raw)) as z:
|
|
1219
|
-
#
|
|
1220
|
-
# before reading (don't decompress GBs into memory).
|
|
1221
|
-
info = z.getinfo("word/document.xml")
|
|
1222
|
-
if info.file_size > MAX_DECOMPRESSED_BYTES:
|
|
1223
|
-
raise ValueError(
|
|
1224
|
-
f"word/document.xml decompresses to {info.file_size} bytes "
|
|
1225
|
-
f"(> {MAX_DECOMPRESSED_BYTES} cap)")
|
|
1226
|
-
xml = z.read("word/document.xml")
|
|
1267
|
+
xml = z.read("word/document.xml") # size/XML-bomb already vetted by _docx_xml_guard
|
|
1227
1268
|
root = ET.fromstring(xml)
|
|
1228
1269
|
paras: List[str] = []
|
|
1229
1270
|
# iter over w:p in document order (includes paragraphs inside table cells).
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.11"
|
|
7
|
+
version = "0.1.13"
|
|
8
8
|
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Benchmark corpus — sources & licensing
|
|
2
|
+
|
|
3
|
+
The accuracy benchmark (`tests/eval/`) scores extract-cli against a small set of
|
|
4
|
+
**real, executed contracts** filed publicly with the U.S. Securities and
|
|
5
|
+
Exchange Commission (SEC EDGAR). SEC filings are public records; these exhibits
|
|
6
|
+
are reproduced here, unmodified, solely as a regression/accuracy test fixture.
|
|
7
|
+
|
|
8
|
+
| File | Source (SEC EDGAR) |
|
|
9
|
+
|---|---|
|
|
10
|
+
| `emp_celsci.txt` | CEL-SCI Corporation — Exhibit 10(ooo), employment agreement |
|
|
11
|
+
| `msa_kpmg.txt` | Blade Internet Ventures / KPMG Consulting — master services agreement |
|
|
12
|
+
| `services_visteon.txt` | Visteon Corporation — salaried employee lease agreement |
|
|
13
|
+
| `consulting_mtm.htm` | MTM Technologies — consulting agreement |
|
|
14
|
+
| `emp_arcp.htm` | American Realty Capital Properties — employment agreement |
|
|
15
|
+
| `emp_quadgraphics.htm` | Quad/Graphics, Inc. — employment agreement |
|
|
16
|
+
|
|
17
|
+
Ground truth (`gold.json`) was hand-verified against each document's text — the
|
|
18
|
+
parties, effective date, governing law, normalized jurisdiction, and a
|
|
19
|
+
verified subset of section headings. It is intentionally independent of what the
|
|
20
|
+
extractor currently produces.
|