extract-cli 0.1.13__tar.gz → 0.1.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.13 → extract_cli-0.1.14}/CHANGELOG.md +14 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/PKG-INFO +5 -4
- {extract_cli-0.1.13 → extract_cli-0.1.14}/README.md +4 -3
- {extract_cli-0.1.13 → extract_cli-0.1.14}/extract_cli.py +89 -14
- {extract_cli-0.1.13 → extract_cli-0.1.14}/pyproject.toml +1 -1
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/employment_docx.docx.expected.json +1 -1
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/heading_docx.docx.expected.json +1 -1
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/nda_h2.md.expected.json +1 -1
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/services_bold.txt.expected.json +1 -1
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/services_html.html.expected.json +1 -1
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_misc.py +19 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/.gitignore +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/AGENTS.md +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/ARCHITECTURE.md +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/LICENSE +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/Makefile +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/config/llm.json.example +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/docs/INTEROP.md +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/docs/spec/extract-output.schema.json +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/llms.txt +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/scripts/release.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/_fixtures_build.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/_make_goldens.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/conftest.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/ATTRIBUTION.md +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/corpus/consulting_mtm.htm +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/corpus/emp_arcp.htm +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/corpus/emp_celsci.txt +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/corpus/emp_quadgraphics.htm +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/corpus/msa_kpmg.txt +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/corpus/services_visteon.txt +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/evaluate.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/gold.json +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/heading_docx.docx +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/numbered_docx.docx +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_clause_map.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_cli.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_coverage.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_deterministic.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_eval.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_llm.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_property.py +0 -0
- {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_schema_conformance.py +0 -0
|
@@ -6,6 +6,19 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.14] - 2026-05-22
|
|
10
|
+
|
|
11
|
+
### Improved
|
|
12
|
+
- **HTML clause detection now recognizes emphasis-marked headings.** Real HTML
|
|
13
|
+
contracts (e.g. SEC EDGAR exhibits) mark section headings with emphasis, not
|
|
14
|
+
`##`/numbers — a heading tag, `<b>`/`<strong>`/`<u>`, or **CSS**
|
|
15
|
+
(`font-weight:bold` / `text-decoration:underline`), often with a leading
|
|
16
|
+
`(g)` / `1.` token and the body run-in. The HTML reader now emits such blocks
|
|
17
|
+
as `## ` headings (splitting run-in title from body; a lone emphasized block
|
|
18
|
+
is treated as a title and left plain so numbered/ALL-CAPS sections still win).
|
|
19
|
+
On the accuracy benchmark this lifts **clause recall 0.45 → 0.86** (F1 0.62 →
|
|
20
|
+
0.93), precision still 1.00. Residual misses are compound/combined headings.
|
|
21
|
+
|
|
9
22
|
## [0.1.13] - 2026-05-22
|
|
10
23
|
|
|
11
24
|
### Added
|
|
@@ -354,6 +367,7 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
354
367
|
intentionally *not* governed by the output schema (the schema describes the
|
|
355
368
|
full default output).
|
|
356
369
|
|
|
370
|
+
[0.1.14]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.14
|
|
357
371
|
[0.1.13]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.13
|
|
358
372
|
[0.1.12]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.12
|
|
359
373
|
[0.1.11]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.11
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.13
|
|
3
|
+
Version: 0.1.14
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -269,10 +269,11 @@ Line coverage tells you the code runs; it doesn't tell you the extraction is
|
|
|
269
269
|
| effective date | accuracy 1.00 |
|
|
270
270
|
| governing law | accuracy 1.00 |
|
|
271
271
|
| jurisdiction (normalized) | accuracy 1.00 |
|
|
272
|
-
| clauses (recall on verified sections) | 0.45 |
|
|
272
|
+
| clauses (recall on verified sections) | 0.86 |
|
|
273
273
|
|
|
274
|
-
Clause recall
|
|
275
|
-
|
|
274
|
+
Clause recall improved sharply once the HTML reader learned to treat
|
|
275
|
+
emphasis (heading tags, <b>/<u>, CSS font-weight/underline) as section
|
|
276
|
+
headings; the residual misses are compound/combined heading titles. A test (`tests/test_eval.py`) gates these so
|
|
276
277
|
accuracy can't silently regress.
|
|
277
278
|
|
|
278
279
|
## Development
|
|
@@ -231,10 +231,11 @@ Line coverage tells you the code runs; it doesn't tell you the extraction is
|
|
|
231
231
|
| effective date | accuracy 1.00 |
|
|
232
232
|
| governing law | accuracy 1.00 |
|
|
233
233
|
| jurisdiction (normalized) | accuracy 1.00 |
|
|
234
|
-
| clauses (recall on verified sections) | 0.45 |
|
|
234
|
+
| clauses (recall on verified sections) | 0.86 |
|
|
235
235
|
|
|
236
|
-
Clause recall
|
|
237
|
-
|
|
236
|
+
Clause recall improved sharply once the HTML reader learned to treat
|
|
237
|
+
emphasis (heading tags, <b>/<u>, CSS font-weight/underline) as section
|
|
238
|
+
headings; the residual misses are compound/combined heading titles. A test (`tests/test_eval.py`) gates these so
|
|
238
239
|
accuracy can't silently regress.
|
|
239
240
|
|
|
240
241
|
## Development
|
|
@@ -43,11 +43,11 @@ import urllib.request
|
|
|
43
43
|
from pathlib import Path
|
|
44
44
|
from typing import Any, Dict, List, Optional, Tuple
|
|
45
45
|
|
|
46
|
-
__version__ = "0.1.13"
|
|
46
|
+
__version__ = "0.1.14"
|
|
47
47
|
|
|
48
48
|
# Bumped independently of the package version when the *extraction logic*
|
|
49
49
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
50
|
-
EXTRACTOR_VERSION = "0.1.
|
|
50
|
+
EXTRACTOR_VERSION = "0.1.14"
|
|
51
51
|
|
|
52
52
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
53
53
|
SCHEMA_VERSION = 1
|
|
@@ -1069,11 +1069,24 @@ def _detect_format(path: Path, raw: bytes) -> str:
|
|
|
1069
1069
|
return base
|
|
1070
1070
|
|
|
1071
1071
|
|
|
1072
|
+
def _looks_like_heading_text(s: str) -> bool:
|
|
1073
|
+
"""Lenient: short, few words, not a full sentence -- used to decide whether
|
|
1074
|
+
an *emphasized* HTML block is a clause heading."""
|
|
1075
|
+
s = s.strip().rstrip(".:;,")
|
|
1076
|
+
return 2 <= len(s) <= 90 and len(s.split()) <= 10
|
|
1077
|
+
|
|
1078
|
+
|
|
1072
1079
|
class _HTMLTextExtractor(html.parser.HTMLParser):
|
|
1073
|
-
"""Stdlib HTML -> text
|
|
1074
|
-
|
|
1080
|
+
"""Stdlib HTML -> text. Drops script/style, frames blocks with blank lines,
|
|
1081
|
+
unescapes entities, and -- crucially for clause detection -- emits blocks
|
|
1082
|
+
that are emphasized (a heading tag, or text wrapped in <b>/<strong>/<u>) as
|
|
1083
|
+
Markdown `## headings`. Real contracts (e.g. SEC HTML exhibits) mark section
|
|
1084
|
+
headings with emphasis, not `##`/numbers, so without this the cascade sees
|
|
1085
|
+
only plain lines. A run-in heading (emphasized lead + body in one block) is
|
|
1086
|
+
split into `## Title` + body."""
|
|
1075
1087
|
|
|
1076
1088
|
_SKIP = {"script", "style", "head", "title", "meta", "link", "noscript"}
|
|
1089
|
+
_EMPH = {"b", "strong", "u", "h1", "h2", "h3", "h4", "h5", "h6"}
|
|
1077
1090
|
_BLOCK = {
|
|
1078
1091
|
"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6",
|
|
1079
1092
|
"section", "article", "table", "ul", "ol", "blockquote", "pre", "hr",
|
|
@@ -1082,32 +1095,94 @@ class _HTMLTextExtractor(html.parser.HTMLParser):
|
|
|
1082
1095
|
|
|
1083
1096
|
def __init__(self) -> None:
|
|
1084
1097
|
super().__init__(convert_charrefs=True)
|
|
1085
|
-
self.
|
|
1098
|
+
self._lines: List[str] = []
|
|
1099
|
+
self._runs: List[Tuple[bool, str]] = [] # (emphasized, text) for current block
|
|
1086
1100
|
self._skip = 0
|
|
1101
|
+
self._emph = 0
|
|
1102
|
+
# Per-tag-name LIFO stack of "did this open tag add emphasis?", so an
|
|
1103
|
+
# emphasis opened by a CSS style (not just a <b>/<u> tag) is closed by
|
|
1104
|
+
# the right end tag even when many <font>/<span> nest.
|
|
1105
|
+
self._emph_stack: Dict[str, List[bool]] = {}
|
|
1106
|
+
|
|
1107
|
+
@staticmethod
|
|
1108
|
+
def _style_is_emph(attrs: Any) -> bool:
|
|
1109
|
+
for name, value in attrs:
|
|
1110
|
+
if name == "style" and value:
|
|
1111
|
+
v = value.lower()
|
|
1112
|
+
if ("font-weight:bold" in v.replace(" ", "") or "font-weight:700" in v.replace(" ", "")
|
|
1113
|
+
or "text-decoration:underline" in v.replace(" ", "")):
|
|
1114
|
+
return True
|
|
1115
|
+
return False
|
|
1116
|
+
|
|
1117
|
+
def _flush_block(self) -> None:
|
|
1118
|
+
runs, self._runs = self._runs, []
|
|
1119
|
+
full = re.sub(r"\s+", " ", "".join(t for _e, t in runs)).strip()
|
|
1120
|
+
if not full:
|
|
1121
|
+
self._lines.append("")
|
|
1122
|
+
return
|
|
1123
|
+
# Standalone emphasized block (a heading tag or fully <b>/<u>/styled text).
|
|
1124
|
+
if all(e for e, t in runs if t.strip()) and _looks_like_heading_text(_strip_clause_number(full)):
|
|
1125
|
+
self._lines.append("## " + _strip_clause_number(full))
|
|
1126
|
+
return
|
|
1127
|
+
# Run-in heading: an optional leading numbering token ("(g)", "1.") then
|
|
1128
|
+
# an emphasized title, then the body in the same block.
|
|
1129
|
+
i, saw_emph = 0, False
|
|
1130
|
+
while i < len(runs):
|
|
1131
|
+
emph, txt = runs[i]
|
|
1132
|
+
if not txt.strip():
|
|
1133
|
+
i += 1
|
|
1134
|
+
elif emph:
|
|
1135
|
+
saw_emph = True
|
|
1136
|
+
i += 1
|
|
1137
|
+
elif not saw_emph and re.fullmatch(r"\(?[0-9A-Za-z]{1,4}\)?[.)]?", txt.strip()):
|
|
1138
|
+
i += 1 # skip a clause-number/letter prefix
|
|
1139
|
+
else:
|
|
1140
|
+
break
|
|
1141
|
+
lead = _strip_clause_number(re.sub(r"\s+", " ", "".join(t for _e, t in runs[:i])).strip())
|
|
1142
|
+
rest = re.sub(r"\s+", " ", "".join(t for _e, t in runs[i:])).strip()
|
|
1143
|
+
if saw_emph and lead and rest and _looks_like_heading_text(lead):
|
|
1144
|
+
self._lines.append("## " + lead)
|
|
1145
|
+
self._lines.append(rest)
|
|
1146
|
+
else:
|
|
1147
|
+
self._lines.append(full)
|
|
1087
1148
|
|
|
1088
1149
|
def handle_starttag(self, tag: str, attrs: Any) -> None:
|
|
1089
1150
|
if tag in self._SKIP:
|
|
1090
1151
|
self._skip += 1
|
|
1091
|
-
|
|
1092
|
-
|
|
1152
|
+
return
|
|
1153
|
+
if tag in self._BLOCK:
|
|
1154
|
+
self._flush_block()
|
|
1155
|
+
added = tag in self._EMPH or self._style_is_emph(attrs)
|
|
1156
|
+
self._emph_stack.setdefault(tag, []).append(added)
|
|
1157
|
+
if added:
|
|
1158
|
+
self._emph += 1
|
|
1093
1159
|
|
|
1094
1160
|
def handle_endtag(self, tag: str) -> None:
|
|
1095
1161
|
if tag in self._SKIP and self._skip > 0:
|
|
1096
1162
|
self._skip -= 1
|
|
1097
|
-
|
|
1098
|
-
|
|
1163
|
+
return
|
|
1164
|
+
stack = self._emph_stack.get(tag)
|
|
1165
|
+
if stack:
|
|
1166
|
+
if stack.pop() and self._emph > 0:
|
|
1167
|
+
self._emph -= 1
|
|
1168
|
+
if tag in self._BLOCK:
|
|
1169
|
+
self._flush_block()
|
|
1099
1170
|
|
|
1100
1171
|
def handle_data(self, data: str) -> None:
|
|
1101
1172
|
if self._skip == 0:
|
|
1102
|
-
self.
|
|
1173
|
+
self._runs.append((self._emph > 0, data))
|
|
1103
1174
|
|
|
1104
1175
|
def get_text(self) -> str:
|
|
1105
|
-
|
|
1106
|
-
#
|
|
1107
|
-
|
|
1176
|
+
self._flush_block()
|
|
1177
|
+
# A lone emphasized heading is almost always the document title, not a
|
|
1178
|
+
# section scheme -- downgrade it to plain text so the numbered/ALL-CAPS
|
|
1179
|
+
# tiers can still detect the real sections (matches the >=2 threshold the
|
|
1180
|
+
# other fallback tiers use).
|
|
1181
|
+
if sum(1 for ln in self._lines if ln.startswith("## ")) < 2:
|
|
1182
|
+
self._lines = [ln[3:] if ln.startswith("## ") else ln for ln in self._lines]
|
|
1108
1183
|
out: List[str] = []
|
|
1109
1184
|
blank = False
|
|
1110
|
-
for ln in
|
|
1185
|
+
for ln in self._lines:
|
|
1111
1186
|
if ln:
|
|
1112
1187
|
out.append(ln)
|
|
1113
1188
|
blank = False
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.13"
|
|
7
|
+
version = "0.1.14"
|
|
8
8
|
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -327,6 +327,25 @@ def test_html_extraction() -> None:
|
|
|
327
327
|
assert {"Payment", "Termination", "Confidentiality", "Governing Law"} <= canon
|
|
328
328
|
|
|
329
329
|
|
|
330
|
+
def test_html_emphasis_headings_become_clauses() -> None:
|
|
331
|
+
"""Section headings marked by emphasis (heading tag, <b>/<u>, or CSS
|
|
332
|
+
font-weight/underline) -- with or without a leading '(a)'/'1.' token -- are
|
|
333
|
+
emitted as `## ` headings and detected as clauses."""
|
|
334
|
+
html = (
|
|
335
|
+
"<html><body>"
|
|
336
|
+
"<p><b>MASTER AGREEMENT</b></p>"
|
|
337
|
+
"<p>(a) <u>Confidentiality</u>. The parties keep information secret.</p>"
|
|
338
|
+
"<p><font style=\"font-weight:bold\">Payment</font>. Fees are due monthly.</p>"
|
|
339
|
+
"<p>(c) <span style=\"text-decoration:underline\">Governing Law</span>. "
|
|
340
|
+
"Governed by the laws of the State of Delaware.</p>"
|
|
341
|
+
"</body></html>"
|
|
342
|
+
)
|
|
343
|
+
text = ex._read_html(html)
|
|
344
|
+
result = ex.build_extraction(text, html.encode(), "html", "x.html")
|
|
345
|
+
canon = {c["canonical_title"] for c in result["clauses"] if c["mapped"]}
|
|
346
|
+
assert {"Confidentiality", "Payment", "Governing Law"} <= canon
|
|
347
|
+
|
|
348
|
+
|
|
330
349
|
def test_html_detected_by_content_sniff(tmp_path: Any) -> None:
|
|
331
350
|
# HTML masquerading as .txt (e.g. a SEC EDGAR full submission) is sniffed.
|
|
332
351
|
p = tmp_path / "exhibit.txt"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|