extract-cli 0.1.12__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. {extract_cli-0.1.12 → extract_cli-0.1.14}/CHANGELOG.md +36 -0
  2. {extract_cli-0.1.12 → extract_cli-0.1.14}/Makefile +3 -0
  3. {extract_cli-0.1.12 → extract_cli-0.1.14}/PKG-INFO +23 -2
  4. {extract_cli-0.1.12 → extract_cli-0.1.14}/README.md +22 -1
  5. {extract_cli-0.1.12 → extract_cli-0.1.14}/extract_cli.py +115 -22
  6. {extract_cli-0.1.12 → extract_cli-0.1.14}/pyproject.toml +1 -1
  7. extract_cli-0.1.14/tests/eval/ATTRIBUTION.md +20 -0
  8. extract_cli-0.1.14/tests/eval/corpus/consulting_mtm.htm +980 -0
  9. extract_cli-0.1.14/tests/eval/corpus/emp_arcp.htm +18 -0
  10. extract_cli-0.1.14/tests/eval/corpus/emp_celsci.txt +494 -0
  11. extract_cli-0.1.14/tests/eval/corpus/emp_quadgraphics.htm +1318 -0
  12. extract_cli-0.1.14/tests/eval/corpus/msa_kpmg.txt +754 -0
  13. extract_cli-0.1.14/tests/eval/corpus/services_visteon.txt +1054 -0
  14. extract_cli-0.1.14/tests/eval/evaluate.py +123 -0
  15. extract_cli-0.1.14/tests/eval/gold.json +51 -0
  16. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/employment_docx.docx.expected.json +1 -1
  17. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/heading_docx.docx.expected.json +1 -1
  18. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  19. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  20. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/nda_h2.md.expected.json +1 -1
  21. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
  22. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/scanned.pdf.expected.json +1 -1
  23. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/services_bold.txt.expected.json +1 -1
  24. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/services_html.html.expected.json +1 -1
  25. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_deterministic.py +17 -0
  26. extract_cli-0.1.14/tests/test_eval.py +26 -0
  27. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_misc.py +19 -0
  28. {extract_cli-0.1.12 → extract_cli-0.1.14}/.gitignore +0 -0
  29. {extract_cli-0.1.12 → extract_cli-0.1.14}/AGENTS.md +0 -0
  30. {extract_cli-0.1.12 → extract_cli-0.1.14}/ARCHITECTURE.md +0 -0
  31. {extract_cli-0.1.12 → extract_cli-0.1.14}/CONTRIBUTING.md +0 -0
  32. {extract_cli-0.1.12 → extract_cli-0.1.14}/LICENSE +0 -0
  33. {extract_cli-0.1.12 → extract_cli-0.1.14}/config/llm.json.example +0 -0
  34. {extract_cli-0.1.12 → extract_cli-0.1.14}/docs/INTEROP.md +0 -0
  35. {extract_cli-0.1.12 → extract_cli-0.1.14}/docs/spec/extract-output.schema.json +0 -0
  36. {extract_cli-0.1.12 → extract_cli-0.1.14}/llms.txt +0 -0
  37. {extract_cli-0.1.12 → extract_cli-0.1.14}/scripts/release.py +0 -0
  38. {extract_cli-0.1.12 → extract_cli-0.1.14}/scripts/validate_against_spec.py +0 -0
  39. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/_fixtures_build.py +0 -0
  40. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/_make_goldens.py +0 -0
  41. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/_schema_validator.py +0 -0
  42. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/conftest.py +0 -0
  43. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/employment_docx.docx +0 -0
  44. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/heading_docx.docx +0 -0
  45. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/lease_allcaps.txt +0 -0
  46. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/license_pdf.pdf +0 -0
  47. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/nda_h2.md +0 -0
  48. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/numbered_docx.docx +0 -0
  49. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/scanned.pdf +0 -0
  50. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/services_bold.txt +0 -0
  51. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/fixtures/services_html.html +0 -0
  52. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_clause_map.py +0 -0
  53. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_cli.py +0 -0
  54. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_coverage.py +0 -0
  55. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_llm.py +0 -0
  56. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_property.py +0 -0
  57. {extract_cli-0.1.12 → extract_cli-0.1.14}/tests/test_schema_conformance.py +0 -0
@@ -6,6 +6,40 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.14] - 2026-05-22
10
+
11
+ ### Improved
12
+ - **HTML clause detection now recognizes emphasis-marked headings.** Real HTML
13
+ contracts (e.g. SEC EDGAR exhibits) mark section headings with emphasis, not
14
+ `##`/numbers — a heading tag, `<b>`/`<strong>`/`<u>`, or **CSS**
15
+ (`font-weight:bold` / `text-decoration:underline`), often with a leading
16
+ `(g)` / `1.` token and the body run-in. The HTML reader now emits such blocks
17
+ as `## ` headings (splitting run-in title from body; a lone emphasized block
18
+ is treated as a title and left plain so numbered/ALL-CAPS sections still win).
19
+ On the accuracy benchmark this lifts **clause recall 0.45 → 0.86** (F1 0.62 →
20
+ 0.93), precision still 1.00. Residual misses are compound/combined headings.
21
+
22
+ ## [0.1.13] - 2026-05-22
23
+
24
+ ### Added
25
+ - **Accuracy benchmark** (`tests/eval/`, `make eval`). Scores the deterministic
26
+ tier against a small corpus of real, executed SEC-EDGAR contracts with
27
+ hand-verified ground truth, reporting precision/recall/F1 per field — turning
28
+ "best-effort" into a measured number. Current: parties F1 0.96, effective
29
+ date / governing law / jurisdiction 1.00, clause recall 0.45 (heading
30
+ detection on dense HTML is the known weak spot). `tests/test_eval.py` gates it
31
+ so accuracy can't silently regress.
32
+
33
+ ### Fixed / improved (surfaced by the benchmark)
34
+ - **Governing-law detection** now covers the common connector phrasings beyond
35
+ "governed by the laws of X": "governed by, **and enforced in accordance
36
+ with,** the laws of X", "**interpreted and enforced in accordance with** the
37
+ laws of X", "**construed under** the laws of X". (Benchmark: governing law
38
+ 0.67 → 1.00.)
39
+ - **Jurisdiction normalization** now maps **all 50 US states + DC** (plus more
40
+ Canadian provinces / UK nations / countries), not just a dozen. (Benchmark:
41
+ jurisdiction 0.67 → 1.00.)
42
+
9
43
  ## [0.1.12] - 2026-05-22
10
44
 
11
45
  ### Security
@@ -333,6 +367,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
333
367
  intentionally *not* governed by the output schema (the schema describes the
334
368
  full default output).
335
369
 
370
+ [0.1.14]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.14
371
+ [0.1.13]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.13
336
372
  [0.1.12]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.12
337
373
  [0.1.11]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.11
338
374
  [0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
@@ -40,6 +40,9 @@ coverage:
40
40
  typecheck:
41
41
  $(PYTHON) -m mypy --strict extract_cli.py
42
42
 
43
+ eval:
44
+ $(PYTHON) tests/eval/evaluate.py
45
+
43
46
  build: clean
44
47
  $(PYTHON) -m build
45
48
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.12
3
+ Version: 0.1.14
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -256,13 +256,34 @@ paths. Configure it once and every suite tool that adopts the same lookup gets
256
256
  LLM features for free. Without it, `--llm` just warns and returns the
257
257
  deterministic output.
258
258
 
259
+ ## Accuracy
260
+
261
+ Line coverage tells you the code runs; it doesn't tell you the extraction is
262
+ *correct*. `make eval` scores the deterministic tier against a small corpus of
263
+ **real, executed contracts** (SEC EDGAR filings) with hand-verified ground truth
264
+ ([`tests/eval/`](tests/eval/)), reporting precision/recall per field:
265
+
266
+ | Field | Score |
267
+ |---|---|
268
+ | parties | P 1.00 · R 0.92 · F1 0.96 |
269
+ | effective date | accuracy 1.00 |
270
+ | governing law | accuracy 1.00 |
271
+ | jurisdiction (normalized) | accuracy 1.00 |
272
+ | clauses (recall on verified sections) | 0.86 |
273
+
274
+ Clause recall improved sharply once the HTML reader learned to treat
275
+ emphasis (heading tags, `<b>`/`<u>`, CSS font-weight/underline) as section
276
+ headings; the residual misses are compound/combined heading titles. A test (`tests/test_eval.py`) gates these so
277
+ accuracy can't silently regress.
278
+
259
279
  ## Development
260
280
 
261
281
  ```bash
262
282
  make install # editable install with the [dev] extra
263
283
  make test # full suite
264
- make coverage # suite + coverage report
284
+ make coverage # suite + coverage report (installs extras; fails under 100%)
265
285
  make typecheck # mypy --strict
286
+ make eval # accuracy benchmark vs the labeled corpus
266
287
  make build # wheel + sdist
267
288
  make smoke # build, install the wheel in a clean venv, run it
268
289
  make spec-check # assert docs/spec schema == `extract schema`
@@ -218,13 +218,34 @@ paths. Configure it once and every suite tool that adopts the same lookup gets
218
218
  LLM features for free. Without it, `--llm` just warns and returns the
219
219
  deterministic output.
220
220
 
221
+ ## Accuracy
222
+
223
+ Line coverage tells you the code runs; it doesn't tell you the extraction is
224
+ *correct*. `make eval` scores the deterministic tier against a small corpus of
225
+ **real, executed contracts** (SEC EDGAR filings) with hand-verified ground truth
226
+ ([`tests/eval/`](tests/eval/)), reporting precision/recall per field:
227
+
228
+ | Field | Score |
229
+ |---|---|
230
+ | parties | P 1.00 · R 0.92 · F1 0.96 |
231
+ | effective date | accuracy 1.00 |
232
+ | governing law | accuracy 1.00 |
233
+ | jurisdiction (normalized) | accuracy 1.00 |
234
+ | clauses (recall on verified sections) | 0.86 |
235
+
236
+ Clause recall improved sharply once the HTML reader learned to treat
237
+ emphasis (heading tags, `<b>`/`<u>`, CSS font-weight/underline) as section
238
+ headings; the residual misses are compound/combined heading titles. A test (`tests/test_eval.py`) gates these so
239
+ accuracy can't silently regress.
240
+
221
241
  ## Development
222
242
 
223
243
  ```bash
224
244
  make install # editable install with the [dev] extra
225
245
  make test # full suite
226
- make coverage # suite + coverage report
246
+ make coverage # suite + coverage report (installs extras; fails under 100%)
227
247
  make typecheck # mypy --strict
248
+ make eval # accuracy benchmark vs the labeled corpus
228
249
  make build # wheel + sdist
229
250
  make smoke # build, install the wheel in a clean venv, run it
230
251
  make spec-check # assert docs/spec schema == `extract schema`
@@ -43,11 +43,11 @@ import urllib.request
43
43
  from pathlib import Path
44
44
  from typing import Any, Dict, List, Optional, Tuple
45
45
 
46
- __version__ = "0.1.12"
46
+ __version__ = "0.1.14"
47
47
 
48
48
  # Bumped independently of the package version when the *extraction logic*
49
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
50
- EXTRACTOR_VERSION = "0.1.12"
50
+ EXTRACTOR_VERSION = "0.1.14"
51
51
 
52
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
53
53
  SCHEMA_VERSION = 1
@@ -616,8 +616,11 @@ _ROLE_PAREN_RE = re.compile(
616
616
  # enforces a capitalized proper noun (a global re.IGNORECASE would defeat that
617
617
  # and over-capture trailing lowercase clauses like ", without regard to ...").
618
618
  _GOV_LAW_RE = re.compile(
619
- r"(?i:governed\s+by(?:\s+and\s+construed\s+in\s+accordance\s+with)?\s+"
620
- r"(?:the\s+)?laws?\s+of\s+(?:the\s+)?)"
619
+ # Allow a short same-sentence gap between the trigger verb (governed /
620
+ # construed / interpreted / enforced) and "laws of", covering connectors like
621
+ # "...and construed in accordance with...", "...and enforced in accordance
622
+ # with...", "the internal laws of" (bounded + lazy: stays within the clause).
623
+ r"(?i:(?:governed|construed|interpreted|enforced)\b[^.\n]{0,60}?\blaws?\s+of\s+(?:the\s+)?)"
621
624
  r"([A-Z][A-Za-z\.\- ]+?(?:,\s*[A-Z][A-Za-z\.\- ]+?)?)"
622
625
  r"(?=[\.,;\n)]|\s+and\b|\s+without\b|$)",
623
626
  )
@@ -889,16 +892,31 @@ def extract_signatories(text: str) -> List[JSON]:
889
892
  return out
890
893
 
891
894
 
892
- # Free-text jurisdiction -> a normalized ISO-ish code (best-effort, common only).
895
+ # Free-text jurisdiction -> a normalized ISO 3166-2 / ISO 3166-1 code. All 50 US
896
+ # states + DC, common Canadian provinces, UK nations, and frequent countries.
897
+ _US_STATES: Dict[str, str] = {
898
+ "alabama": "AL", "alaska": "AK", "arizona": "AZ", "arkansas": "AR",
899
+ "california": "CA", "colorado": "CO", "connecticut": "CT", "delaware": "DE",
900
+ "florida": "FL", "georgia": "GA", "hawaii": "HI", "idaho": "ID",
901
+ "illinois": "IL", "indiana": "IN", "iowa": "IA", "kansas": "KS",
902
+ "kentucky": "KY", "louisiana": "LA", "maine": "ME", "maryland": "MD",
903
+ "massachusetts": "MA", "michigan": "MI", "minnesota": "MN", "mississippi": "MS",
904
+ "missouri": "MO", "montana": "MT", "nebraska": "NE", "nevada": "NV",
905
+ "new hampshire": "NH", "new jersey": "NJ", "new mexico": "NM", "new york": "NY",
906
+ "north carolina": "NC", "north dakota": "ND", "ohio": "OH", "oklahoma": "OK",
907
+ "oregon": "OR", "pennsylvania": "PA", "rhode island": "RI", "south carolina": "SC",
908
+ "south dakota": "SD", "tennessee": "TN", "texas": "TX", "utah": "UT",
909
+ "vermont": "VT", "virginia": "VA", "washington": "WA", "west virginia": "WV",
910
+ "wisconsin": "WI", "wyoming": "WY", "district of columbia": "DC",
911
+ }
893
912
  _JURISDICTION_CODES: Dict[str, str] = {
894
- "delaware": "US-DE", "new york": "US-NY", "california": "US-CA",
895
- "texas": "US-TX", "illinois": "US-IL", "massachusetts": "US-MA",
896
- "washington": "US-WA", "florida": "US-FL", "nevada": "US-NV",
897
- "new jersey": "US-NJ", "pennsylvania": "US-PA", "michigan": "US-MI",
913
+ **{name: f"US-{code}" for name, code in _US_STATES.items()},
898
914
  "ontario": "CA-ON", "quebec": "CA-QC", "british columbia": "CA-BC",
899
- "england and wales": "GB-EAW", "england": "GB-ENG", "scotland": "GB-SCT",
915
+ "alberta": "CA-AB", "england and wales": "GB-EAW", "england": "GB-ENG",
916
+ "scotland": "GB-SCT", "wales": "GB-WLS", "northern ireland": "GB-NIR",
900
917
  "united kingdom": "GB", "france": "FR", "germany": "DE", "ireland": "IE",
901
918
  "singapore": "SG", "australia": "AU", "india": "IN", "netherlands": "NL",
919
+ "switzerland": "CH", "japan": "JP",
902
920
  }
903
921
 
904
922
 
@@ -1051,11 +1069,24 @@ def _detect_format(path: Path, raw: bytes) -> str:
1051
1069
  return base
1052
1070
 
1053
1071
 
1072
+ def _looks_like_heading_text(s: str) -> bool:
1073
+ """Lenient: short, few words, not a full sentence -- used to decide whether
1074
+ an *emphasized* HTML block is a clause heading."""
1075
+ s = s.strip().rstrip(".:;,")
1076
+ return 2 <= len(s) <= 90 and len(s.split()) <= 10
1077
+
1078
+
1054
1079
  class _HTMLTextExtractor(html.parser.HTMLParser):
1055
- """Stdlib HTML -> text: drops script/style, frames block elements with blank
1056
- lines (so clause-heading detection still works), and unescapes entities."""
1080
+ """Stdlib HTML -> text. Drops script/style, frames blocks with blank lines,
1081
+ unescapes entities, and -- crucially for clause detection -- emits blocks
1082
+ that are emphasized (a heading tag, or text wrapped in <b>/<strong>/<u>) as
1083
+ Markdown `## headings`. Real contracts (e.g. SEC HTML exhibits) mark section
1084
+ headings with emphasis, not `##`/numbers, so without this the cascade sees
1085
+ only plain lines. A run-in heading (emphasized lead + body in one block) is
1086
+ split into `## Title` + body."""
1057
1087
 
1058
1088
  _SKIP = {"script", "style", "head", "title", "meta", "link", "noscript"}
1089
+ _EMPH = {"b", "strong", "u", "h1", "h2", "h3", "h4", "h5", "h6"}
1059
1090
  _BLOCK = {
1060
1091
  "p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6",
1061
1092
  "section", "article", "table", "ul", "ol", "blockquote", "pre", "hr",
@@ -1064,32 +1095,94 @@ class _HTMLTextExtractor(html.parser.HTMLParser):
1064
1095
 
1065
1096
  def __init__(self) -> None:
1066
1097
  super().__init__(convert_charrefs=True)
1067
- self._parts: List[str] = []
1098
+ self._lines: List[str] = []
1099
+ self._runs: List[Tuple[bool, str]] = [] # (emphasized, text) for current block
1068
1100
  self._skip = 0
1101
+ self._emph = 0
1102
+ # Per-tag-name LIFO stack of "did this open tag add emphasis?", so an
1103
+ # emphasis opened by a CSS style (not just a <b>/<u> tag) is closed by
1104
+ # the right end tag even when many <font>/<span> nest.
1105
+ self._emph_stack: Dict[str, List[bool]] = {}
1106
+
1107
+ @staticmethod
1108
+ def _style_is_emph(attrs: Any) -> bool:
1109
+ for name, value in attrs:
1110
+ if name == "style" and value:
1111
+ v = value.lower()
1112
+ if ("font-weight:bold" in v.replace(" ", "") or "font-weight:700" in v.replace(" ", "")
1113
+ or "text-decoration:underline" in v.replace(" ", "")):
1114
+ return True
1115
+ return False
1116
+
1117
+ def _flush_block(self) -> None:
1118
+ runs, self._runs = self._runs, []
1119
+ full = re.sub(r"\s+", " ", "".join(t for _e, t in runs)).strip()
1120
+ if not full:
1121
+ self._lines.append("")
1122
+ return
1123
+ # Standalone emphasized block (a heading tag or fully <b>/<u>/styled text).
1124
+ if all(e for e, t in runs if t.strip()) and _looks_like_heading_text(_strip_clause_number(full)):
1125
+ self._lines.append("## " + _strip_clause_number(full))
1126
+ return
1127
+ # Run-in heading: an optional leading numbering token ("(g)", "1.") then
1128
+ # an emphasized title, then the body in the same block.
1129
+ i, saw_emph = 0, False
1130
+ while i < len(runs):
1131
+ emph, txt = runs[i]
1132
+ if not txt.strip():
1133
+ i += 1
1134
+ elif emph:
1135
+ saw_emph = True
1136
+ i += 1
1137
+ elif not saw_emph and re.fullmatch(r"\(?[0-9A-Za-z]{1,4}\)?[.)]?", txt.strip()):
1138
+ i += 1 # skip a clause-number/letter prefix
1139
+ else:
1140
+ break
1141
+ lead = _strip_clause_number(re.sub(r"\s+", " ", "".join(t for _e, t in runs[:i])).strip())
1142
+ rest = re.sub(r"\s+", " ", "".join(t for _e, t in runs[i:])).strip()
1143
+ if saw_emph and lead and rest and _looks_like_heading_text(lead):
1144
+ self._lines.append("## " + lead)
1145
+ self._lines.append(rest)
1146
+ else:
1147
+ self._lines.append(full)
1069
1148
 
1070
1149
  def handle_starttag(self, tag: str, attrs: Any) -> None:
1071
1150
  if tag in self._SKIP:
1072
1151
  self._skip += 1
1073
- elif tag in self._BLOCK:
1074
- self._parts.append("\n")
1152
+ return
1153
+ if tag in self._BLOCK:
1154
+ self._flush_block()
1155
+ added = tag in self._EMPH or self._style_is_emph(attrs)
1156
+ self._emph_stack.setdefault(tag, []).append(added)
1157
+ if added:
1158
+ self._emph += 1
1075
1159
 
1076
1160
  def handle_endtag(self, tag: str) -> None:
1077
1161
  if tag in self._SKIP and self._skip > 0:
1078
1162
  self._skip -= 1
1079
- elif tag in self._BLOCK:
1080
- self._parts.append("\n")
1163
+ return
1164
+ stack = self._emph_stack.get(tag)
1165
+ if stack:
1166
+ if stack.pop() and self._emph > 0:
1167
+ self._emph -= 1
1168
+ if tag in self._BLOCK:
1169
+ self._flush_block()
1081
1170
 
1082
1171
  def handle_data(self, data: str) -> None:
1083
1172
  if self._skip == 0:
1084
- self._parts.append(data)
1173
+ self._runs.append((self._emph > 0, data))
1085
1174
 
1086
1175
  def get_text(self) -> str:
1087
- # Strip each line; collapse runs of blank lines to a single blank line
1088
- # (gives ALL-CAPS / numbered headings their blank-line frame).
1089
- lines = [re.sub(r"[ \t]+", " ", ln).strip() for ln in "".join(self._parts).split("\n")]
1176
+ self._flush_block()
1177
+ # A lone emphasized heading is almost always the document title, not a
1178
+ # section scheme -- downgrade it to plain text so the numbered/ALL-CAPS
1179
+ # tiers can still detect the real sections (matches the >=2 threshold the
1180
+ # other fallback tiers use).
1181
+ if sum(1 for ln in self._lines if ln.startswith("## ")) < 2:
1182
+ self._lines = [ln[3:] if ln.startswith("## ") else ln for ln in self._lines]
1090
1183
  out: List[str] = []
1091
1184
  blank = False
1092
- for ln in lines:
1185
+ for ln in self._lines:
1093
1186
  if ln:
1094
1187
  out.append(ln)
1095
1188
  blank = False
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.12"
7
+ version = "0.1.14"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -0,0 +1,20 @@
1
+ # Benchmark corpus — sources & licensing
2
+
3
+ The accuracy benchmark (`tests/eval/`) scores extract-cli against a small set of
4
+ **real, executed contracts** filed publicly with the U.S. Securities and
5
+ Exchange Commission (SEC EDGAR). SEC filings are public records; these exhibits
6
+ are reproduced here, unmodified, solely as a regression/accuracy test fixture.
7
+
8
+ | File | Source (SEC EDGAR) |
9
+ |---|---|
10
+ | `emp_celsci.txt` | CEL-SCI Corporation — Exhibit 10(ooo), employment agreement |
11
+ | `msa_kpmg.txt` | Blade Internet Ventures / KPMG Consulting — master services agreement |
12
+ | `services_visteon.txt` | Visteon Corporation — salaried employee lease agreement |
13
+ | `consulting_mtm.htm` | MTM Technologies — consulting agreement |
14
+ | `emp_arcp.htm` | American Realty Capital Properties — employment agreement |
15
+ | `emp_quadgraphics.htm` | Quad/Graphics, Inc. — employment agreement |
16
+
17
+ Ground truth (`gold.json`) was hand-verified against each document's text — the
18
+ parties, effective date, governing law, normalized jurisdiction, and a
19
+ verified subset of section headings. It is intentionally independent of what the
20
+ extractor currently produces.