extract-cli 0.1.12__tar.gz → 0.1.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.12 → extract_cli-0.1.14}/CHANGELOG.md +36 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/Makefile +3 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/PKG-INFO +23 -2
- {extract_cli-0.1.12 → extract_cli-0.1.14}/README.md +22 -1
- {extract_cli-0.1.12 → extract_cli-0.1.14}/extract_cli.py +115 -22
- {extract_cli-0.1.12 → extract_cli-0.1.14}/pyproject.toml +1 -1
- extract_cli-0.1.14/tests/eval/ATTRIBUTION.md +20 -0
- extract_cli-0.1.14/tests/eval/corpus/consulting_mtm.htm +980 -0
- extract_cli-0.1.14/tests/eval/corpus/emp_arcp.htm +18 -0
- extract_cli-0.1.14/tests/eval/corpus/emp_celsci.txt +494 -0
- extract_cli-0.1.14/tests/eval/corpus/emp_quadgraphics.htm +1318 -0
- extract_cli-0.1.14/tests/eval/corpus/msa_kpmg.txt +754 -0
- extract_cli-0.1.14/tests/eval/corpus/services_visteon.txt +1054 -0
- extract_cli-0.1.14/tests/eval/evaluate.py +123 -0
- extract_cli-0.1.14/tests/eval/gold.json +51 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/employment_docx.docx.expected.json +1 -1
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/heading_docx.docx.expected.json +1 -1
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/nda_h2.md.expected.json +1 -1
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/services_bold.txt.expected.json +1 -1
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/services_html.html.expected.json +1 -1
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_deterministic.py +17 -0
- extract_cli-0.1.14/tests/test_eval.py +26 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_misc.py +19 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/.gitignore +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/AGENTS.md +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/ARCHITECTURE.md +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/LICENSE +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/config/llm.json.example +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/docs/INTEROP.md +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/docs/spec/extract-output.schema.json +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/llms.txt +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/scripts/release.py +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/_fixtures_build.py +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/_make_goldens.py +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/conftest.py +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/heading_docx.docx +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/numbered_docx.docx +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_clause_map.py +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_cli.py +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_coverage.py +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_llm.py +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_property.py +0 -0
- {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_schema_conformance.py +0 -0
|
@@ -6,6 +6,40 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.14] - 2026-05-22
|
|
10
|
+
|
|
11
|
+
### Improved
|
|
12
|
+
- **HTML clause detection now recognizes emphasis-marked headings.** Real HTML
|
|
13
|
+
contracts (e.g. SEC EDGAR exhibits) mark section headings with emphasis, not
|
|
14
|
+
`##`/numbers — a heading tag, `<b>`/`<strong>`/`<u>`, or **CSS**
|
|
15
|
+
(`font-weight:bold` / `text-decoration:underline`), often with a leading
|
|
16
|
+
`(g)` / `1.` token and the body run-in. The HTML reader now emits such blocks
|
|
17
|
+
as `## ` headings (splitting run-in title from body; a lone emphasized block
|
|
18
|
+
is treated as a title and left plain so numbered/ALL-CAPS sections still win).
|
|
19
|
+
On the accuracy benchmark this lifts **clause recall 0.45 → 0.86** (F1 0.62 →
|
|
20
|
+
0.93), precision still 1.00. Residual misses are compound/combined headings.
|
|
21
|
+
|
|
22
|
+
## [0.1.13] - 2026-05-22
|
|
23
|
+
|
|
24
|
+
### Added
|
|
25
|
+
- **Accuracy benchmark** (`tests/eval/`, `make eval`). Scores the deterministic
|
|
26
|
+
tier against a small corpus of real, executed SEC-EDGAR contracts with
|
|
27
|
+
hand-verified ground truth, reporting precision/recall/F1 per field — turning
|
|
28
|
+
"best-effort" into a measured number. Current: parties F1 0.96, effective
|
|
29
|
+
date / governing law / jurisdiction 1.00, clause recall 0.45 (heading
|
|
30
|
+
detection on dense HTML is the known weak spot). `tests/test_eval.py` gates it
|
|
31
|
+
so accuracy can't silently regress.
|
|
32
|
+
|
|
33
|
+
### Fixed / improved (surfaced by the benchmark)
|
|
34
|
+
- **Governing-law detection** now covers the common connector phrasings beyond
|
|
35
|
+
"governed by the laws of X": "governed by, **and enforced in accordance
|
|
36
|
+
with,** the laws of X", "**interpreted and enforced in accordance with** the
|
|
37
|
+
laws of X", "**construed under** the laws of X". (Benchmark: governing law
|
|
38
|
+
0.67 → 1.00.)
|
|
39
|
+
- **Jurisdiction normalization** now maps **all 50 US states + DC** (plus more
|
|
40
|
+
Canadian provinces / UK nations / countries), not just a dozen. (Benchmark:
|
|
41
|
+
jurisdiction 0.67 → 1.00.)
|
|
42
|
+
|
|
9
43
|
## [0.1.12] - 2026-05-22
|
|
10
44
|
|
|
11
45
|
### Security
|
|
@@ -333,6 +367,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
333
367
|
intentionally *not* governed by the output schema (the schema describes the
|
|
334
368
|
full default output).
|
|
335
369
|
|
|
370
|
+
[0.1.14]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.14
|
|
371
|
+
[0.1.13]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.13
|
|
336
372
|
[0.1.12]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.12
|
|
337
373
|
[0.1.11]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.11
|
|
338
374
|
[0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.12
|
|
3
|
+
Version: 0.1.14
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -256,13 +256,34 @@ paths. Configure it once and every suite tool that adopts the same lookup gets
|
|
|
256
256
|
LLM features for free. Without it, `--llm` just warns and returns the
|
|
257
257
|
deterministic output.
|
|
258
258
|
|
|
259
|
+
## Accuracy
|
|
260
|
+
|
|
261
|
+
Line coverage tells you the code runs; it doesn't tell you the extraction is
|
|
262
|
+
*correct*. `make eval` scores the deterministic tier against a small corpus of
|
|
263
|
+
**real, executed contracts** (SEC EDGAR filings) with hand-verified ground truth
|
|
264
|
+
([`tests/eval/`](tests/eval/)), reporting precision/recall per field:
|
|
265
|
+
|
|
266
|
+
| Field | Score |
|
|
267
|
+
|---|---|
|
|
268
|
+
| parties | P 1.00 · R 0.92 · F1 0.96 |
|
|
269
|
+
| effective date | accuracy 1.00 |
|
|
270
|
+
| governing law | accuracy 1.00 |
|
|
271
|
+
| jurisdiction (normalized) | accuracy 1.00 |
|
|
272
|
+
| clauses (recall on verified sections) | 0.86 |
|
|
273
|
+
|
|
274
|
+
Clause recall improved sharply once the HTML reader learned to treat
|
|
275
|
+
emphasis (heading tags, `<b>`/`<u>`, CSS font-weight/underline) as section
|
|
276
|
+
headings; the residual misses are compound/combined heading titles. A test (`tests/test_eval.py`) gates these so
|
|
277
|
+
accuracy can't silently regress.
|
|
278
|
+
|
|
259
279
|
## Development
|
|
260
280
|
|
|
261
281
|
```bash
|
|
262
282
|
make install # editable install with the [dev] extra
|
|
263
283
|
make test # full suite
|
|
264
|
-
make coverage # suite + coverage report
|
|
284
|
+
make coverage # suite + coverage report (installs extras; fails under 100%)
|
|
265
285
|
make typecheck # mypy --strict
|
|
286
|
+
make eval # accuracy benchmark vs the labeled corpus
|
|
266
287
|
make build # wheel + sdist
|
|
267
288
|
make smoke # build, install the wheel in a clean venv, run it
|
|
268
289
|
make spec-check # assert docs/spec schema == `extract schema`
|
|
@@ -218,13 +218,34 @@ paths. Configure it once and every suite tool that adopts the same lookup gets
|
|
|
218
218
|
LLM features for free. Without it, `--llm` just warns and returns the
|
|
219
219
|
deterministic output.
|
|
220
220
|
|
|
221
|
+
## Accuracy
|
|
222
|
+
|
|
223
|
+
Line coverage tells you the code runs; it doesn't tell you the extraction is
|
|
224
|
+
*correct*. `make eval` scores the deterministic tier against a small corpus of
|
|
225
|
+
**real, executed contracts** (SEC EDGAR filings) with hand-verified ground truth
|
|
226
|
+
([`tests/eval/`](tests/eval/)), reporting precision/recall per field:
|
|
227
|
+
|
|
228
|
+
| Field | Score |
|
|
229
|
+
|---|---|
|
|
230
|
+
| parties | P 1.00 · R 0.92 · F1 0.96 |
|
|
231
|
+
| effective date | accuracy 1.00 |
|
|
232
|
+
| governing law | accuracy 1.00 |
|
|
233
|
+
| jurisdiction (normalized) | accuracy 1.00 |
|
|
234
|
+
| clauses (recall on verified sections) | 0.86 |
|
|
235
|
+
|
|
236
|
+
Clause recall improved sharply once the HTML reader learned to treat
|
|
237
|
+
emphasis (heading tags, `<b>`/`<u>`, CSS font-weight/underline) as section
|
|
238
|
+
headings; the residual misses are compound/combined heading titles. A test (`tests/test_eval.py`) gates these so
|
|
239
|
+
accuracy can't silently regress.
|
|
240
|
+
|
|
221
241
|
## Development
|
|
222
242
|
|
|
223
243
|
```bash
|
|
224
244
|
make install # editable install with the [dev] extra
|
|
225
245
|
make test # full suite
|
|
226
|
-
make coverage # suite + coverage report
|
|
246
|
+
make coverage # suite + coverage report (installs extras; fails under 100%)
|
|
227
247
|
make typecheck # mypy --strict
|
|
248
|
+
make eval # accuracy benchmark vs the labeled corpus
|
|
228
249
|
make build # wheel + sdist
|
|
229
250
|
make smoke # build, install the wheel in a clean venv, run it
|
|
230
251
|
make spec-check # assert docs/spec schema == `extract schema`
|
|
@@ -43,11 +43,11 @@ import urllib.request
|
|
|
43
43
|
from pathlib import Path
|
|
44
44
|
from typing import Any, Dict, List, Optional, Tuple
|
|
45
45
|
|
|
46
|
-
__version__ = "0.1.12"
|
|
46
|
+
__version__ = "0.1.14"
|
|
47
47
|
|
|
48
48
|
# Bumped independently of the package version when the *extraction logic*
|
|
49
49
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
50
|
-
EXTRACTOR_VERSION = "0.1.
|
|
50
|
+
EXTRACTOR_VERSION = "0.1.14"
|
|
51
51
|
|
|
52
52
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
53
53
|
SCHEMA_VERSION = 1
|
|
@@ -616,8 +616,11 @@ _ROLE_PAREN_RE = re.compile(
|
|
|
616
616
|
# enforces a capitalized proper noun (a global re.IGNORECASE would defeat that
|
|
617
617
|
# and over-capture trailing lowercase clauses like ", without regard to ...").
|
|
618
618
|
_GOV_LAW_RE = re.compile(
|
|
619
|
-
|
|
620
|
-
|
|
619
|
+
# Allow a short same-sentence gap between "governed by" and "laws of" so the
|
|
620
|
+
# many real connector phrasings are covered: "...and construed in accordance
|
|
621
|
+
# with...", "...and enforced in accordance with...", "the internal laws of",
|
|
622
|
+
# etc. (bounded + lazy so it stays within the clause).
|
|
623
|
+
r"(?i:(?:governed|construed|interpreted|enforced)\b[^.\n]{0,60}?\blaws?\s+of\s+(?:the\s+)?)"
|
|
621
624
|
r"([A-Z][A-Za-z\.\- ]+?(?:,\s*[A-Z][A-Za-z\.\- ]+?)?)"
|
|
622
625
|
r"(?=[\.,;\n)]|\s+and\b|\s+without\b|$)",
|
|
623
626
|
)
|
|
@@ -889,16 +892,31 @@ def extract_signatories(text: str) -> List[JSON]:
|
|
|
889
892
|
return out
|
|
890
893
|
|
|
891
894
|
|
|
892
|
-
# Free-text jurisdiction -> a normalized ISO-
|
|
895
|
+
# Free-text jurisdiction -> a normalized ISO 3166-2 / ISO 3166-1 code. All 50 US
|
|
896
|
+
# states + DC, common Canadian provinces, UK nations, and frequent countries.
|
|
897
|
+
_US_STATES: Dict[str, str] = {
|
|
898
|
+
"alabama": "AL", "alaska": "AK", "arizona": "AZ", "arkansas": "AR",
|
|
899
|
+
"california": "CA", "colorado": "CO", "connecticut": "CT", "delaware": "DE",
|
|
900
|
+
"florida": "FL", "georgia": "GA", "hawaii": "HI", "idaho": "ID",
|
|
901
|
+
"illinois": "IL", "indiana": "IN", "iowa": "IA", "kansas": "KS",
|
|
902
|
+
"kentucky": "KY", "louisiana": "LA", "maine": "ME", "maryland": "MD",
|
|
903
|
+
"massachusetts": "MA", "michigan": "MI", "minnesota": "MN", "mississippi": "MS",
|
|
904
|
+
"missouri": "MO", "montana": "MT", "nebraska": "NE", "nevada": "NV",
|
|
905
|
+
"new hampshire": "NH", "new jersey": "NJ", "new mexico": "NM", "new york": "NY",
|
|
906
|
+
"north carolina": "NC", "north dakota": "ND", "ohio": "OH", "oklahoma": "OK",
|
|
907
|
+
"oregon": "OR", "pennsylvania": "PA", "rhode island": "RI", "south carolina": "SC",
|
|
908
|
+
"south dakota": "SD", "tennessee": "TN", "texas": "TX", "utah": "UT",
|
|
909
|
+
"vermont": "VT", "virginia": "VA", "washington": "WA", "west virginia": "WV",
|
|
910
|
+
"wisconsin": "WI", "wyoming": "WY", "district of columbia": "DC",
|
|
911
|
+
}
|
|
893
912
|
_JURISDICTION_CODES: Dict[str, str] = {
|
|
894
|
-
|
|
895
|
-
"texas": "US-TX", "illinois": "US-IL", "massachusetts": "US-MA",
|
|
896
|
-
"washington": "US-WA", "florida": "US-FL", "nevada": "US-NV",
|
|
897
|
-
"new jersey": "US-NJ", "pennsylvania": "US-PA", "michigan": "US-MI",
|
|
913
|
+
**{name: f"US-{code}" for name, code in _US_STATES.items()},
|
|
898
914
|
"ontario": "CA-ON", "quebec": "CA-QC", "british columbia": "CA-BC",
|
|
899
|
-
"
|
|
915
|
+
"alberta": "CA-AB", "england and wales": "GB-EAW", "england": "GB-ENG",
|
|
916
|
+
"scotland": "GB-SCT", "wales": "GB-WLS", "northern ireland": "GB-NIR",
|
|
900
917
|
"united kingdom": "GB", "france": "FR", "germany": "DE", "ireland": "IE",
|
|
901
918
|
"singapore": "SG", "australia": "AU", "india": "IN", "netherlands": "NL",
|
|
919
|
+
"switzerland": "CH", "japan": "JP",
|
|
902
920
|
}
|
|
903
921
|
|
|
904
922
|
|
|
@@ -1051,11 +1069,24 @@ def _detect_format(path: Path, raw: bytes) -> str:
|
|
|
1051
1069
|
return base
|
|
1052
1070
|
|
|
1053
1071
|
|
|
1072
|
+
def _looks_like_heading_text(s: str) -> bool:
|
|
1073
|
+
"""Lenient: short, few words, not a full sentence -- used to decide whether
|
|
1074
|
+
an *emphasized* HTML block is a clause heading."""
|
|
1075
|
+
s = s.strip().rstrip(".:;,")
|
|
1076
|
+
return 2 <= len(s) <= 90 and len(s.split()) <= 10
|
|
1077
|
+
|
|
1078
|
+
|
|
1054
1079
|
class _HTMLTextExtractor(html.parser.HTMLParser):
|
|
1055
|
-
"""Stdlib HTML -> text
|
|
1056
|
-
|
|
1080
|
+
"""Stdlib HTML -> text. Drops script/style, frames blocks with blank lines,
|
|
1081
|
+
unescapes entities, and -- crucially for clause detection -- emits blocks
|
|
1082
|
+
that are emphasized (a heading tag, or text wrapped in <b>/<strong>/<u>) as
|
|
1083
|
+
Markdown `## headings`. Real contracts (e.g. SEC HTML exhibits) mark section
|
|
1084
|
+
headings with emphasis, not `##`/numbers, so without this the cascade sees
|
|
1085
|
+
only plain lines. A run-in heading (emphasized lead + body in one block) is
|
|
1086
|
+
split into `## Title` + body."""
|
|
1057
1087
|
|
|
1058
1088
|
_SKIP = {"script", "style", "head", "title", "meta", "link", "noscript"}
|
|
1089
|
+
_EMPH = {"b", "strong", "u", "h1", "h2", "h3", "h4", "h5", "h6"}
|
|
1059
1090
|
_BLOCK = {
|
|
1060
1091
|
"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6",
|
|
1061
1092
|
"section", "article", "table", "ul", "ol", "blockquote", "pre", "hr",
|
|
@@ -1064,32 +1095,94 @@ class _HTMLTextExtractor(html.parser.HTMLParser):
|
|
|
1064
1095
|
|
|
1065
1096
|
def __init__(self) -> None:
|
|
1066
1097
|
super().__init__(convert_charrefs=True)
|
|
1067
|
-
self.
|
|
1098
|
+
self._lines: List[str] = []
|
|
1099
|
+
self._runs: List[Tuple[bool, str]] = [] # (emphasized, text) for current block
|
|
1068
1100
|
self._skip = 0
|
|
1101
|
+
self._emph = 0
|
|
1102
|
+
# Per-tag-name LIFO stack of "did this open tag add emphasis?", so an
|
|
1103
|
+
# emphasis opened by a CSS style (not just a <b>/<u> tag) is closed by
|
|
1104
|
+
# the right end tag even when many <font>/<span> nest.
|
|
1105
|
+
self._emph_stack: Dict[str, List[bool]] = {}
|
|
1106
|
+
|
|
1107
|
+
@staticmethod
|
|
1108
|
+
def _style_is_emph(attrs: Any) -> bool:
|
|
1109
|
+
for name, value in attrs:
|
|
1110
|
+
if name == "style" and value:
|
|
1111
|
+
v = value.lower()
|
|
1112
|
+
if ("font-weight:bold" in v.replace(" ", "") or "font-weight:700" in v.replace(" ", "")
|
|
1113
|
+
or "text-decoration:underline" in v.replace(" ", "")):
|
|
1114
|
+
return True
|
|
1115
|
+
return False
|
|
1116
|
+
|
|
1117
|
+
def _flush_block(self) -> None:
|
|
1118
|
+
runs, self._runs = self._runs, []
|
|
1119
|
+
full = re.sub(r"\s+", " ", "".join(t for _e, t in runs)).strip()
|
|
1120
|
+
if not full:
|
|
1121
|
+
self._lines.append("")
|
|
1122
|
+
return
|
|
1123
|
+
# Standalone emphasized block (a heading tag or fully <b>/<u>/styled text).
|
|
1124
|
+
if all(e for e, t in runs if t.strip()) and _looks_like_heading_text(_strip_clause_number(full)):
|
|
1125
|
+
self._lines.append("## " + _strip_clause_number(full))
|
|
1126
|
+
return
|
|
1127
|
+
# Run-in heading: an optional leading numbering token ("(g)", "1.") then
|
|
1128
|
+
# an emphasized title, then the body in the same block.
|
|
1129
|
+
i, saw_emph = 0, False
|
|
1130
|
+
while i < len(runs):
|
|
1131
|
+
emph, txt = runs[i]
|
|
1132
|
+
if not txt.strip():
|
|
1133
|
+
i += 1
|
|
1134
|
+
elif emph:
|
|
1135
|
+
saw_emph = True
|
|
1136
|
+
i += 1
|
|
1137
|
+
elif not saw_emph and re.fullmatch(r"\(?[0-9A-Za-z]{1,4}\)?[.)]?", txt.strip()):
|
|
1138
|
+
i += 1 # skip a clause-number/letter prefix
|
|
1139
|
+
else:
|
|
1140
|
+
break
|
|
1141
|
+
lead = _strip_clause_number(re.sub(r"\s+", " ", "".join(t for _e, t in runs[:i])).strip())
|
|
1142
|
+
rest = re.sub(r"\s+", " ", "".join(t for _e, t in runs[i:])).strip()
|
|
1143
|
+
if saw_emph and lead and rest and _looks_like_heading_text(lead):
|
|
1144
|
+
self._lines.append("## " + lead)
|
|
1145
|
+
self._lines.append(rest)
|
|
1146
|
+
else:
|
|
1147
|
+
self._lines.append(full)
|
|
1069
1148
|
|
|
1070
1149
|
def handle_starttag(self, tag: str, attrs: Any) -> None:
|
|
1071
1150
|
if tag in self._SKIP:
|
|
1072
1151
|
self._skip += 1
|
|
1073
|
-
|
|
1074
|
-
|
|
1152
|
+
return
|
|
1153
|
+
if tag in self._BLOCK:
|
|
1154
|
+
self._flush_block()
|
|
1155
|
+
added = tag in self._EMPH or self._style_is_emph(attrs)
|
|
1156
|
+
self._emph_stack.setdefault(tag, []).append(added)
|
|
1157
|
+
if added:
|
|
1158
|
+
self._emph += 1
|
|
1075
1159
|
|
|
1076
1160
|
def handle_endtag(self, tag: str) -> None:
|
|
1077
1161
|
if tag in self._SKIP and self._skip > 0:
|
|
1078
1162
|
self._skip -= 1
|
|
1079
|
-
|
|
1080
|
-
|
|
1163
|
+
return
|
|
1164
|
+
stack = self._emph_stack.get(tag)
|
|
1165
|
+
if stack:
|
|
1166
|
+
if stack.pop() and self._emph > 0:
|
|
1167
|
+
self._emph -= 1
|
|
1168
|
+
if tag in self._BLOCK:
|
|
1169
|
+
self._flush_block()
|
|
1081
1170
|
|
|
1082
1171
|
def handle_data(self, data: str) -> None:
|
|
1083
1172
|
if self._skip == 0:
|
|
1084
|
-
self.
|
|
1173
|
+
self._runs.append((self._emph > 0, data))
|
|
1085
1174
|
|
|
1086
1175
|
def get_text(self) -> str:
|
|
1087
|
-
|
|
1088
|
-
#
|
|
1089
|
-
|
|
1176
|
+
self._flush_block()
|
|
1177
|
+
# A lone emphasized heading is almost always the document title, not a
|
|
1178
|
+
# section scheme -- downgrade it to plain text so the numbered/ALL-CAPS
|
|
1179
|
+
# tiers can still detect the real sections (matches the >=2 threshold the
|
|
1180
|
+
# other fallback tiers use).
|
|
1181
|
+
if sum(1 for ln in self._lines if ln.startswith("## ")) < 2:
|
|
1182
|
+
self._lines = [ln[3:] if ln.startswith("## ") else ln for ln in self._lines]
|
|
1090
1183
|
out: List[str] = []
|
|
1091
1184
|
blank = False
|
|
1092
|
-
for ln in
|
|
1185
|
+
for ln in self._lines:
|
|
1093
1186
|
if ln:
|
|
1094
1187
|
out.append(ln)
|
|
1095
1188
|
blank = False
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.12"
|
|
7
|
+
version = "0.1.14"
|
|
8
8
|
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Benchmark corpus — sources & licensing
|
|
2
|
+
|
|
3
|
+
The accuracy benchmark (`tests/eval/`) scores extract-cli against a small set of
|
|
4
|
+
**real, executed contracts** filed publicly with the U.S. Securities and
|
|
5
|
+
Exchange Commission (SEC EDGAR). SEC filings are public records; these exhibits
|
|
6
|
+
are reproduced here, unmodified, solely as a regression/accuracy test fixture.
|
|
7
|
+
|
|
8
|
+
| File | Source (SEC EDGAR) |
|
|
9
|
+
|---|---|
|
|
10
|
+
| `emp_celsci.txt` | CEL-SCI Corporation — Exhibit 10(ooo), employment agreement |
|
|
11
|
+
| `msa_kpmg.txt` | Blade Internet Ventures / KPMG Consulting — master services agreement |
|
|
12
|
+
| `services_visteon.txt` | Visteon Corporation — salaried employee lease agreement |
|
|
13
|
+
| `consulting_mtm.htm` | MTM Technologies — consulting agreement |
|
|
14
|
+
| `emp_arcp.htm` | American Realty Capital Properties — employment agreement |
|
|
15
|
+
| `emp_quadgraphics.htm` | Quad/Graphics, Inc. — employment agreement |
|
|
16
|
+
|
|
17
|
+
Ground truth (`gold.json`) was hand-verified against each document's text — the
|
|
18
|
+
parties, effective date, governing law, normalized jurisdiction, and a
|
|
19
|
+
verified subset of section headings. It is intentionally independent of what the
|
|
20
|
+
extractor currently produces.
|