extract-cli 0.1.13__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. {extract_cli-0.1.13 → extract_cli-0.1.14}/CHANGELOG.md +14 -0
  2. {extract_cli-0.1.13 → extract_cli-0.1.14}/PKG-INFO +5 -4
  3. {extract_cli-0.1.13 → extract_cli-0.1.14}/README.md +4 -3
  4. {extract_cli-0.1.13 → extract_cli-0.1.14}/extract_cli.py +89 -14
  5. {extract_cli-0.1.13 → extract_cli-0.1.14}/pyproject.toml +1 -1
  6. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/employment_docx.docx.expected.json +1 -1
  7. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/heading_docx.docx.expected.json +1 -1
  8. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  9. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  10. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/nda_h2.md.expected.json +1 -1
  11. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
  12. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/scanned.pdf.expected.json +1 -1
  13. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/services_bold.txt.expected.json +1 -1
  14. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/services_html.html.expected.json +1 -1
  15. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_misc.py +19 -0
  16. {extract_cli-0.1.13 → extract_cli-0.1.14}/.gitignore +0 -0
  17. {extract_cli-0.1.13 → extract_cli-0.1.14}/AGENTS.md +0 -0
  18. {extract_cli-0.1.13 → extract_cli-0.1.14}/ARCHITECTURE.md +0 -0
  19. {extract_cli-0.1.13 → extract_cli-0.1.14}/CONTRIBUTING.md +0 -0
  20. {extract_cli-0.1.13 → extract_cli-0.1.14}/LICENSE +0 -0
  21. {extract_cli-0.1.13 → extract_cli-0.1.14}/Makefile +0 -0
  22. {extract_cli-0.1.13 → extract_cli-0.1.14}/config/llm.json.example +0 -0
  23. {extract_cli-0.1.13 → extract_cli-0.1.14}/docs/INTEROP.md +0 -0
  24. {extract_cli-0.1.13 → extract_cli-0.1.14}/docs/spec/extract-output.schema.json +0 -0
  25. {extract_cli-0.1.13 → extract_cli-0.1.14}/llms.txt +0 -0
  26. {extract_cli-0.1.13 → extract_cli-0.1.14}/scripts/release.py +0 -0
  27. {extract_cli-0.1.13 → extract_cli-0.1.14}/scripts/validate_against_spec.py +0 -0
  28. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/_fixtures_build.py +0 -0
  29. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/_make_goldens.py +0 -0
  30. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/_schema_validator.py +0 -0
  31. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/conftest.py +0 -0
  32. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/ATTRIBUTION.md +0 -0
  33. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/corpus/consulting_mtm.htm +0 -0
  34. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/corpus/emp_arcp.htm +0 -0
  35. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/corpus/emp_celsci.txt +0 -0
  36. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/corpus/emp_quadgraphics.htm +0 -0
  37. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/corpus/msa_kpmg.txt +0 -0
  38. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/corpus/services_visteon.txt +0 -0
  39. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/evaluate.py +0 -0
  40. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/eval/gold.json +0 -0
  41. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/employment_docx.docx +0 -0
  42. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/heading_docx.docx +0 -0
  43. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/lease_allcaps.txt +0 -0
  44. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/license_pdf.pdf +0 -0
  45. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/nda_h2.md +0 -0
  46. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/numbered_docx.docx +0 -0
  47. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/scanned.pdf +0 -0
  48. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/services_bold.txt +0 -0
  49. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/fixtures/services_html.html +0 -0
  50. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_clause_map.py +0 -0
  51. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_cli.py +0 -0
  52. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_coverage.py +0 -0
  53. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_deterministic.py +0 -0
  54. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_eval.py +0 -0
  55. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_llm.py +0 -0
  56. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_property.py +0 -0
  57. {extract_cli-0.1.13 → extract_cli-0.1.14}/tests/test_schema_conformance.py +0 -0
@@ -6,6 +6,19 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.14] - 2026-05-22
10
+
11
+ ### Improved
12
+ - **HTML clause detection now recognizes emphasis-marked headings.** Real HTML
13
+ contracts (e.g. SEC EDGAR exhibits) mark section headings with emphasis, not
14
+ `##`/numbers — a heading tag, `<b>`/`<strong>`/`<u>`, or **CSS**
15
+ (`font-weight:bold` / `text-decoration:underline`), often with a leading
16
+ `(g)` / `1.` token and the body run-in. The HTML reader now emits such blocks
17
+ as `## ` headings (splitting run-in title from body; a lone emphasized block
18
+ is treated as a title and left plain so numbered/ALL-CAPS sections still win).
19
+ On the accuracy benchmark this lifts **clause recall 0.45 → 0.86** (F1 0.62 →
20
+ 0.93), precision still 1.00. Residual misses are compound/combined headings.
21
+
9
22
  ## [0.1.13] - 2026-05-22
10
23
 
11
24
  ### Added
@@ -354,6 +367,7 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
354
367
  intentionally *not* governed by the output schema (the schema describes the
355
368
  full default output).
356
369
 
370
+ [0.1.14]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.14
357
371
  [0.1.13]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.13
358
372
  [0.1.12]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.12
359
373
  [0.1.11]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.11
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.13
3
+ Version: 0.1.14
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -269,10 +269,11 @@ Line coverage tells you the code runs; it doesn't tell you the extraction is
269
269
  | effective date | accuracy 1.00 |
270
270
  | governing law | accuracy 1.00 |
271
271
  | jurisdiction (normalized) | accuracy 1.00 |
272
- | clauses (recall on verified sections) | 0.45 |
272
+ | clauses (recall on verified sections) | 0.86 |
273
273
 
274
- Clause recall is the honest weak spot — heading detection on dense HTML
275
- exhibits still misses sections. A test (`tests/test_eval.py`) gates these so
274
+ Clause recall improved sharply once the HTML reader learned to treat
275
+ emphasis (heading tags, `<b>`/`<u>`, CSS font-weight/underline) as section
276
+ headings; the residual misses are compound/combined heading titles. A test (`tests/test_eval.py`) gates these so
276
277
  accuracy can't silently regress.
277
278
 
278
279
  ## Development
@@ -231,10 +231,11 @@ Line coverage tells you the code runs; it doesn't tell you the extraction is
231
231
  | effective date | accuracy 1.00 |
232
232
  | governing law | accuracy 1.00 |
233
233
  | jurisdiction (normalized) | accuracy 1.00 |
234
- | clauses (recall on verified sections) | 0.45 |
234
+ | clauses (recall on verified sections) | 0.86 |
235
235
 
236
- Clause recall is the honest weak spot — heading detection on dense HTML
237
- exhibits still misses sections. A test (`tests/test_eval.py`) gates these so
236
+ Clause recall improved sharply once the HTML reader learned to treat
237
+ emphasis (heading tags, `<b>`/`<u>`, CSS font-weight/underline) as section
238
+ headings; the residual misses are compound/combined heading titles. A test (`tests/test_eval.py`) gates these so
238
239
  accuracy can't silently regress.
239
240
 
240
241
  ## Development
@@ -43,11 +43,11 @@ import urllib.request
43
43
  from pathlib import Path
44
44
  from typing import Any, Dict, List, Optional, Tuple
45
45
 
46
- __version__ = "0.1.13"
46
+ __version__ = "0.1.14"
47
47
 
48
48
  # Bumped independently of the package version when the *extraction logic*
49
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
50
- EXTRACTOR_VERSION = "0.1.13"
50
+ EXTRACTOR_VERSION = "0.1.14"
51
51
 
52
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
53
53
  SCHEMA_VERSION = 1
@@ -1069,11 +1069,24 @@ def _detect_format(path: Path, raw: bytes) -> str:
1069
1069
  return base
1070
1070
 
1071
1071
 
1072
+ def _looks_like_heading_text(s: str) -> bool:
1073
+ """Lenient: short, few words, not a full sentence -- used to decide whether
1074
+ an *emphasized* HTML block is a clause heading."""
1075
+ s = s.strip().rstrip(".:;,")
1076
+ return 2 <= len(s) <= 90 and len(s.split()) <= 10
1077
+
1078
+
1072
1079
  class _HTMLTextExtractor(html.parser.HTMLParser):
1073
- """Stdlib HTML -> text: drops script/style, frames block elements with blank
1074
- lines (so clause-heading detection still works), and unescapes entities."""
1080
+ """Stdlib HTML -> text. Drops script/style, frames blocks with blank lines,
1081
+ unescapes entities, and -- crucially for clause detection -- emits blocks
1082
+ that are emphasized (a heading tag, or text wrapped in <b>/<strong>/<u>) as
1083
+ Markdown `## headings`. Real contracts (e.g. SEC HTML exhibits) mark section
1084
+ headings with emphasis, not `##`/numbers, so without this the cascade sees
1085
+ only plain lines. A run-in heading (emphasized lead + body in one block) is
1086
+ split into `## Title` + body."""
1075
1087
 
1076
1088
  _SKIP = {"script", "style", "head", "title", "meta", "link", "noscript"}
1089
+ _EMPH = {"b", "strong", "u", "h1", "h2", "h3", "h4", "h5", "h6"}
1077
1090
  _BLOCK = {
1078
1091
  "p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6",
1079
1092
  "section", "article", "table", "ul", "ol", "blockquote", "pre", "hr",
@@ -1082,32 +1095,94 @@ class _HTMLTextExtractor(html.parser.HTMLParser):
1082
1095
 
1083
1096
  def __init__(self) -> None:
1084
1097
  super().__init__(convert_charrefs=True)
1085
- self._parts: List[str] = []
1098
+ self._lines: List[str] = []
1099
+ self._runs: List[Tuple[bool, str]] = [] # (emphasized, text) for current block
1086
1100
  self._skip = 0
1101
+ self._emph = 0
1102
+ # Per-tag-name LIFO stack of "did this open tag add emphasis?", so an
1103
+ # emphasis opened by a CSS style (not just a <b>/<u> tag) is closed by
1104
+ # the right end tag even when many <font>/<span> nest.
1105
+ self._emph_stack: Dict[str, List[bool]] = {}
1106
+
1107
+ @staticmethod
1108
+ def _style_is_emph(attrs: Any) -> bool:
1109
+ for name, value in attrs:
1110
+ if name == "style" and value:
1111
+ v = value.lower()
1112
+ if ("font-weight:bold" in v.replace(" ", "") or "font-weight:700" in v.replace(" ", "")
1113
+ or "text-decoration:underline" in v.replace(" ", "")):
1114
+ return True
1115
+ return False
1116
+
1117
+ def _flush_block(self) -> None:
1118
+ runs, self._runs = self._runs, []
1119
+ full = re.sub(r"\s+", " ", "".join(t for _e, t in runs)).strip()
1120
+ if not full:
1121
+ self._lines.append("")
1122
+ return
1123
+ # Standalone emphasized block (a heading tag or fully <b>/<u>/styled text).
1124
+ if all(e for e, t in runs if t.strip()) and _looks_like_heading_text(_strip_clause_number(full)):
1125
+ self._lines.append("## " + _strip_clause_number(full))
1126
+ return
1127
+ # Run-in heading: an optional leading numbering token ("(g)", "1.") then
1128
+ # an emphasized title, then the body in the same block.
1129
+ i, saw_emph = 0, False
1130
+ while i < len(runs):
1131
+ emph, txt = runs[i]
1132
+ if not txt.strip():
1133
+ i += 1
1134
+ elif emph:
1135
+ saw_emph = True
1136
+ i += 1
1137
+ elif not saw_emph and re.fullmatch(r"\(?[0-9A-Za-z]{1,4}\)?[.)]?", txt.strip()):
1138
+ i += 1 # skip a clause-number/letter prefix
1139
+ else:
1140
+ break
1141
+ lead = _strip_clause_number(re.sub(r"\s+", " ", "".join(t for _e, t in runs[:i])).strip())
1142
+ rest = re.sub(r"\s+", " ", "".join(t for _e, t in runs[i:])).strip()
1143
+ if saw_emph and lead and rest and _looks_like_heading_text(lead):
1144
+ self._lines.append("## " + lead)
1145
+ self._lines.append(rest)
1146
+ else:
1147
+ self._lines.append(full)
1087
1148
 
1088
1149
  def handle_starttag(self, tag: str, attrs: Any) -> None:
1089
1150
  if tag in self._SKIP:
1090
1151
  self._skip += 1
1091
- elif tag in self._BLOCK:
1092
- self._parts.append("\n")
1152
+ return
1153
+ if tag in self._BLOCK:
1154
+ self._flush_block()
1155
+ added = tag in self._EMPH or self._style_is_emph(attrs)
1156
+ self._emph_stack.setdefault(tag, []).append(added)
1157
+ if added:
1158
+ self._emph += 1
1093
1159
 
1094
1160
  def handle_endtag(self, tag: str) -> None:
1095
1161
  if tag in self._SKIP and self._skip > 0:
1096
1162
  self._skip -= 1
1097
- elif tag in self._BLOCK:
1098
- self._parts.append("\n")
1163
+ return
1164
+ stack = self._emph_stack.get(tag)
1165
+ if stack:
1166
+ if stack.pop() and self._emph > 0:
1167
+ self._emph -= 1
1168
+ if tag in self._BLOCK:
1169
+ self._flush_block()
1099
1170
 
1100
1171
  def handle_data(self, data: str) -> None:
1101
1172
  if self._skip == 0:
1102
- self._parts.append(data)
1173
+ self._runs.append((self._emph > 0, data))
1103
1174
 
1104
1175
  def get_text(self) -> str:
1105
- # Strip each line; collapse runs of blank lines to a single blank line
1106
- # (gives ALL-CAPS / numbered headings their blank-line frame).
1107
- lines = [re.sub(r"[ \t]+", " ", ln).strip() for ln in "".join(self._parts).split("\n")]
1176
+ self._flush_block()
1177
+ # A lone emphasized heading is almost always the document title, not a
1178
+ # section scheme -- downgrade it to plain text so the numbered/ALL-CAPS
1179
+ # tiers can still detect the real sections (matches the >=2 threshold the
1180
+ # other fallback tiers use).
1181
+ if sum(1 for ln in self._lines if ln.startswith("## ")) < 2:
1182
+ self._lines = [ln[3:] if ln.startswith("## ") else ln for ln in self._lines]
1108
1183
  out: List[str] = []
1109
1184
  blank = False
1110
- for ln in lines:
1185
+ for ln in self._lines:
1111
1186
  if ln:
1112
1187
  out.append(ln)
1113
1188
  blank = False
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.13"
7
+ version = "0.1.14"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -151,7 +151,7 @@
151
151
  ],
152
152
  "signatories": [],
153
153
  "_meta": {
154
- "extractor_version": "0.1.13",
154
+ "extractor_version": "0.1.14",
155
155
  "tiers_used": [
156
156
  "deterministic"
157
157
  ],
@@ -140,7 +140,7 @@
140
140
  "amounts": [],
141
141
  "signatories": [],
142
142
  "_meta": {
143
- "extractor_version": "0.1.13",
143
+ "extractor_version": "0.1.14",
144
144
  "tiers_used": [
145
145
  "deterministic"
146
146
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.13",
149
+ "extractor_version": "0.1.14",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.13",
149
+ "extractor_version": "0.1.14",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -150,7 +150,7 @@
150
150
  "amounts": [],
151
151
  "signatories": [],
152
152
  "_meta": {
153
- "extractor_version": "0.1.13",
153
+ "extractor_version": "0.1.14",
154
154
  "tiers_used": [
155
155
  "deterministic"
156
156
  ],
@@ -140,7 +140,7 @@
140
140
  "amounts": [],
141
141
  "signatories": [],
142
142
  "_meta": {
143
- "extractor_version": "0.1.13",
143
+ "extractor_version": "0.1.14",
144
144
  "tiers_used": [
145
145
  "deterministic"
146
146
  ],
@@ -55,7 +55,7 @@
55
55
  "amounts": [],
56
56
  "signatories": [],
57
57
  "_meta": {
58
- "extractor_version": "0.1.13",
58
+ "extractor_version": "0.1.14",
59
59
  "tiers_used": [
60
60
  "deterministic"
61
61
  ],
@@ -146,7 +146,7 @@
146
146
  ],
147
147
  "signatories": [],
148
148
  "_meta": {
149
- "extractor_version": "0.1.13",
149
+ "extractor_version": "0.1.14",
150
150
  "tiers_used": [
151
151
  "deterministic"
152
152
  ],
@@ -161,7 +161,7 @@
161
161
  ],
162
162
  "signatories": [],
163
163
  "_meta": {
164
- "extractor_version": "0.1.13",
164
+ "extractor_version": "0.1.14",
165
165
  "tiers_used": [
166
166
  "deterministic"
167
167
  ],
@@ -327,6 +327,25 @@ def test_html_extraction() -> None:
327
327
  assert {"Payment", "Termination", "Confidentiality", "Governing Law"} <= canon
328
328
 
329
329
 
330
+ def test_html_emphasis_headings_become_clauses() -> None:
331
+ """Section headings marked by emphasis (heading tag, <b>/<u>, or CSS
332
+ font-weight/underline) -- with or without a leading '(a)'/'1.' token -- are
333
+ emitted as `## ` headings and detected as clauses."""
334
+ html = (
335
+ "<html><body>"
336
+ "<p><b>MASTER AGREEMENT</b></p>"
337
+ "<p>(a) <u>Confidentiality</u>. The parties keep information secret.</p>"
338
+ "<p><font style=\"font-weight:bold\">Payment</font>. Fees are due monthly.</p>"
339
+ "<p>(c) <span style=\"text-decoration:underline\">Governing Law</span>. "
340
+ "Governed by the laws of the State of Delaware.</p>"
341
+ "</body></html>"
342
+ )
343
+ text = ex._read_html(html)
344
+ result = ex.build_extraction(text, html.encode(), "html", "x.html")
345
+ canon = {c["canonical_title"] for c in result["clauses"] if c["mapped"]}
346
+ assert {"Confidentiality", "Payment", "Governing Law"} <= canon
347
+
348
+
330
349
  def test_html_detected_by_content_sniff(tmp_path: Any) -> None:
331
350
  # HTML masquerading as .txt (e.g. a SEC EDGAR full submission) is sniffed.
332
351
  p = tmp_path / "exhibit.txt"
File without changes
File without changes
File without changes
File without changes
File without changes