extract-cli 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. {extract_cli-0.1.0 → extract_cli-0.1.1}/CHANGELOG.md +26 -0
  2. {extract_cli-0.1.0 → extract_cli-0.1.1}/PKG-INFO +1 -1
  3. {extract_cli-0.1.0 → extract_cli-0.1.1}/docs/spec/extract-output.schema.json +1 -0
  4. {extract_cli-0.1.0 → extract_cli-0.1.1}/extract_cli.py +119 -27
  5. {extract_cli-0.1.0 → extract_cli-0.1.1}/pyproject.toml +1 -1
  6. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/employment_docx.docx.expected.json +1 -1
  7. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  8. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  9. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/nda_h2.md.expected.json +1 -1
  10. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/scanned.pdf.expected.json +1 -1
  11. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/services_bold.txt.expected.json +1 -1
  12. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_clause_map.py +44 -0
  13. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_cli.py +1 -1
  14. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_deterministic.py +19 -0
  15. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_misc.py +13 -0
  16. {extract_cli-0.1.0 → extract_cli-0.1.1}/.gitignore +0 -0
  17. {extract_cli-0.1.0 → extract_cli-0.1.1}/ARCHITECTURE.md +0 -0
  18. {extract_cli-0.1.0 → extract_cli-0.1.1}/CONTRIBUTING.md +0 -0
  19. {extract_cli-0.1.0 → extract_cli-0.1.1}/LICENSE +0 -0
  20. {extract_cli-0.1.0 → extract_cli-0.1.1}/Makefile +0 -0
  21. {extract_cli-0.1.0 → extract_cli-0.1.1}/README.md +0 -0
  22. {extract_cli-0.1.0 → extract_cli-0.1.1}/config/llm.json.example +0 -0
  23. {extract_cli-0.1.0 → extract_cli-0.1.1}/docs/INTEROP.md +0 -0
  24. {extract_cli-0.1.0 → extract_cli-0.1.1}/scripts/release.py +0 -0
  25. {extract_cli-0.1.0 → extract_cli-0.1.1}/scripts/validate_against_spec.py +0 -0
  26. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/_fixtures_build.py +0 -0
  27. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/_make_goldens.py +0 -0
  28. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/_schema_validator.py +0 -0
  29. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/conftest.py +0 -0
  30. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/employment_docx.docx +0 -0
  31. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/lease_allcaps.txt +0 -0
  32. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/license_pdf.pdf +0 -0
  33. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/nda_h2.md +0 -0
  34. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/scanned.pdf +0 -0
  35. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/services_bold.txt +0 -0
  36. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_llm.py +0 -0
  37. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_property.py +0 -0
  38. {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_schema_conformance.py +0 -0
@@ -6,6 +6,31 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.1] - 2026-05-21
10
+
11
+ Real-world hardening, driven by testing against a SEC EDGAR employment
12
+ agreement and the Common Paper Mutual NDA (PDF/DOCX).
13
+
14
+ ### Added
15
+ - **`numbered` clause-detection tier** for plain numbered headings
16
+ (`1. Termination`, `Section 3. Payment`, `Article IV. …`) — the dominant
17
+ format in foreign paper, missed by the H2/bold/ALL-CAPS tiers. A title-case
18
+ heuristic rejects numbered sentences and list items. The output schema's
19
+ clause `tier` enum gains `numbered` (a backward-compatible widening).
20
+
21
+ ### Fixed
22
+ - **PDF reader** now extracts text only from inside `BT … ET` text objects, so
23
+ embedded fonts, digital-signature blobs, and metadata streams no longer leak
24
+ binary noise (a real signed PDF dropped from ~188 KB of garbage to ~8.7 KB of
25
+ clean text). Added a printable-ratio backstop.
26
+ - **Effective date**: anchor on `(the "Effective Date")` and a bare
27
+ `as of <date>` cue; handle dates that wrap across a line break.
28
+ - **Term length**: require a real number, dropping false positives such as
29
+ `…consecutive days`.
30
+ - **Title**: skip SGML/XML wrapper lines (e.g. SEC EDGAR `<DOCUMENT>` headers).
31
+ - Strip trailing punctuation from clause titles (`Other Benefits.` →
32
+ `Other Benefits`).
33
+
9
34
  ## [0.1.0] - 2026-05-21
10
35
 
11
36
  Initial release — the open-loop front door of the contract-ops CLI suite.
@@ -57,4 +82,5 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
57
82
  intentionally *not* governed by the output schema (the schema describes the
58
83
  full default output).
59
84
 
85
+ [0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
60
86
  [0.1.0]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -183,6 +183,7 @@
183
183
  "enum": [
184
184
  "h2",
185
185
  "bold-numbered",
186
+ "numbered",
186
187
  "all-caps",
187
188
  "explicit",
188
189
  "llm"
@@ -42,11 +42,11 @@ import urllib.request
42
42
  from pathlib import Path
43
43
  from typing import Any, Dict, List, Optional, Tuple
44
44
 
45
- __version__ = "0.1.0"
45
+ __version__ = "0.1.1"
46
46
 
47
47
  # Bumped independently of the package version when the *extraction logic*
48
48
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
49
- EXTRACTOR_VERSION = "0.1.0"
49
+ EXTRACTOR_VERSION = "0.1.1"
50
50
 
51
51
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
52
52
  SCHEMA_VERSION = 1
@@ -214,6 +214,49 @@ def _qualifies_as_all_caps_heading(title: str) -> bool:
214
214
  return sum(1 for ch in title if "A" <= ch <= "Z") >= 4
215
215
 
216
216
 
217
+ # Tier between bold-numbered and ALL-CAPS: plain numbered headings on their own
218
+ # line -- "1. Termination", "5. Wage Compensation", "Section 3. Payment",
219
+ # "Article IV. Confidentiality". These are the dominant real-world format in
220
+ # foreign paper (and aren't caught by H2, **bold**, or ALL-CAPS). A title-case
221
+ # heuristic distinguishes a heading from a numbered *sentence* or list item.
222
+ _NUMBERED_HEADING_RE = re.compile(
223
+ r"^[ \t]*"
224
+ r"(?:(?:Article|Section|ARTICLE|SECTION)[ \t]+)?"
225
+ r"(?:" + _ROMAN_RE + r"|\d{1,2})\.?"
226
+ r"[ \t]+"
227
+ r"([A-Z][A-Za-z][^\n]{0,58})"
228
+ r"[ \t]*$",
229
+ re.MULTILINE,
230
+ )
231
+
232
+ # Lowercase words allowed inside an otherwise Title-Cased heading.
233
+ _HEADING_STOPWORDS = {
234
+ "a", "an", "the", "and", "or", "of", "to", "for", "in", "on", "with",
235
+ "by", "at", "as", "per", "from", "into", "nor", "but",
236
+ }
237
+
238
+
239
+ def _qualifies_as_numbered_heading(title: str) -> bool:
240
+ """A numbered line qualifies as a heading only if its title looks like a
241
+ heading: 1-9 words, Title-Cased (every word starts uppercase or is a short
242
+ lowercase connector), no sentence-y lowercase verbs. A single word must be
243
+ >= 4 letters. Rejects 'The parties agree as follows' but accepts 'Wage
244
+ Compensation' and 'Term And Nature Of Employment'."""
245
+ t = title.strip().rstrip(".").strip()
246
+ words = t.split()
247
+ if not (1 <= len(words) <= 9):
248
+ return False
249
+ if len(words) == 1:
250
+ return sum(1 for ch in words[0] if ch.isalpha()) >= 4 and words[0][:1].isupper()
251
+ for w in words:
252
+ if w[:1].isupper() or not w[:1].isalpha():
253
+ continue # capitalized word, or punctuation/number token
254
+ if w.lower() in _HEADING_STOPWORDS:
255
+ continue # allowed connector
256
+ return False # a lowercase content word => this is a sentence, not a heading
257
+ return True
258
+
259
+
217
260
  def detect_clauses(text: str) -> List[JSON]:
218
261
  """Run the three-tier cascade and return clauses with their detection tier.
219
262
 
@@ -227,6 +270,12 @@ def detect_clauses(text: str) -> List[JSON]:
227
270
  bold = list(_BOLD_HEADING_RE.finditer(text))
228
271
  if len(bold) >= 2:
229
272
  return _matches_to_clauses(text, bold, group=1, tier="bold-numbered")
273
+ numbered = [
274
+ m for m in _NUMBERED_HEADING_RE.finditer(text)
275
+ if _qualifies_as_numbered_heading(m.group(1))
276
+ ]
277
+ if len(numbered) >= 2:
278
+ return _matches_to_clauses(text, numbered, group=1, tier="numbered")
230
279
  caps = [
231
280
  m for m in _ALL_CAPS_HEADING_RE.finditer(text)
232
281
  if _qualifies_as_all_caps_heading(m.group(1))
@@ -266,8 +315,9 @@ def _matches_to_clauses(text: str, matches: List["re.Match[str]"], group: int,
266
315
 
267
316
 
268
317
  def _norm_clause_key(s: str) -> str:
269
- """Normalize a clause title/alias for matching (number-stripped, lowercased)."""
270
- return _strip_clause_number(s).strip().lower()
318
+ """Normalize a clause title/alias for matching (number-stripped, trailing
319
+ punctuation removed, lowercased)."""
320
+ return _strip_clause_number(s).strip().lower().rstrip(" .:;,")
271
321
 
272
322
 
273
323
  # ---------------------------------------------------------------------------
@@ -366,7 +416,7 @@ def _canonicalize_clause(detected_title: str) -> Tuple[Optional[str], bool]:
366
416
  best, best_len = canonical, len(alias_key)
367
417
  if best is not None:
368
418
  return best, True
369
- return _titlecase(detected_title), False
419
+ return _titlecase(detected_title.strip().rstrip(" .:;,")), False
370
420
 
371
421
 
372
422
  # ---------------------------------------------------------------------------
@@ -421,11 +471,17 @@ _DATE_PAT = (
421
471
  )
422
472
  _DATE_RE = re.compile(_DATE_PAT, re.IGNORECASE)
423
473
 
474
+ # Highest-confidence: a date explicitly labeled "(the "Effective Date")".
475
+ _EFFDATE_LABEL_RE = re.compile(
476
+ r"(" + _DATE_PAT + r")\s*\(\s*(?:the\s+)?[\"“]?\s*Effective\s+Date",
477
+ re.IGNORECASE,
478
+ )
424
479
  _EFFECTIVE_RE = re.compile(
425
480
  r"(?:effective(?:\s+date)?(?:\s+(?:as\s+of|date|on))?|"
426
481
  r"dated(?:\s+as\s+of)?|"
427
482
  r"made(?:\s+and\s+entered\s+into)?(?:\s+as\s+of|\s+on)?|"
428
- r"entered\s+into(?:\s+as\s+of|\s+on)?)"
483
+ r"entered\s+into(?:\s+as\s+of|\s+on)?|"
484
+ r"as\s+of)"
429
485
  r"[\s:,]+(?:the\s+)?(" + _DATE_PAT + r")",
430
486
  re.IGNORECASE,
431
487
  )
@@ -534,14 +590,18 @@ def _parse_date_to_iso(s: str) -> Optional[str]:
534
590
  return None
535
591
 
536
592
 
593
def _date_field_from_str(raw: str, base_conf: float) -> JSON:
    """Build a date field from a raw matched date string.

    Runs of whitespace (including a line break inside a wrapped date) are
    collapsed to single spaces before parsing.

    Args:
        raw: The date text as captured from the document.
        base_conf: Confidence to report when the date parses to ISO form.

    Returns:
        ``_field(iso, base_conf)`` when ``_parse_date_to_iso`` succeeds;
        otherwise ``_field`` of the whitespace-normalized raw text with the
        confidence lowered by 0.3 (never below 0.0).
    """
    raw = re.sub(r"\s+", " ", raw.strip())
    iso = _parse_date_to_iso(raw)
    if iso is not None:
        return _field(iso, base_conf)
    # Round the penalized confidence: binary floats make 0.9 - 0.3 evaluate to
    # 0.6000000000000001, which would otherwise leak into the emitted output.
    return _field(raw, round(max(0.0, base_conf - 0.3), 2))
599
+
600
+
537
601
  def _date_field(match: Optional["re.Match[str]"]) -> JSON:
538
602
  if match is None:
539
603
  return _none_field()
540
- raw = match.group(1).strip()
541
- iso = _parse_date_to_iso(raw)
542
- if iso is not None:
543
- return _field(iso, 0.85)
544
- return _field(raw, 0.55)
604
+ return _date_field_from_str(match.group(1), 0.85)
545
605
 
546
606
 
547
607
  def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
@@ -578,10 +638,12 @@ def extract_parties(text: str) -> List[JSON]:
578
638
 
579
639
 
580
640
  def extract_dates(text: str) -> JSON:
581
- return {
582
- "effective": _date_field(_EFFECTIVE_RE.search(text)),
583
- "expiration": _date_field(_EXPIRE_RE.search(text)),
584
- }
641
+ label = _EFFDATE_LABEL_RE.search(text)
642
+ if label is not None:
643
+ effective = _date_field_from_str(label.group(1), 0.9)
644
+ else:
645
+ effective = _date_field(_EFFECTIVE_RE.search(text))
646
+ return {"effective": effective, "expiration": _date_field(_EXPIRE_RE.search(text))}
585
647
 
586
648
 
587
649
  def extract_governing_law(text: str) -> JSON:
@@ -600,10 +662,10 @@ def extract_term(text: str) -> JSON:
600
662
  if m:
601
663
  num = _word_to_int(m.group(1))
602
664
  unit = m.group(2).lower().rstrip("s")
665
+ # Only emit when the captured token is a real number; otherwise the
666
+ # match was a coincidence ("...consecutive days") -> leave as not-found.
603
667
  if num is not None:
604
668
  length = _field(f"{num} {unit}{'s' if num != 1 else ''}", 0.7)
605
- else:
606
- length = _field(f"{m.group(1)} {m.group(2)}".strip(), 0.5)
607
669
 
608
670
  notice = _none_field()
609
671
  nm = _NOTICE_RE.search(text)
@@ -649,7 +711,8 @@ def extract_clauses(text: str) -> List[JSON]:
649
711
  for c in detect_clauses(text):
650
712
  canonical, mapped = _canonicalize_clause(c["title"])
651
713
  tier = c["tier"]
652
- base = {"h2": 0.95, "bold-numbered": 0.85, "all-caps": 0.75, "explicit": 0.95}.get(tier, 0.7)
714
+ base = {"h2": 0.95, "bold-numbered": 0.85, "numbered": 0.8,
715
+ "all-caps": 0.75, "explicit": 0.95}.get(tier, 0.7)
653
716
  conf = round(base * (1.0 if mapped else 0.75), 2)
654
717
  out.append({
655
718
  "canonical_title": canonical,
@@ -669,10 +732,14 @@ def extract_title(text: str, path: Optional[Path], fmt: str) -> Optional[str]:
669
732
  return m.group(1).strip()
670
733
  for line in text.splitlines():
671
734
  ls = line.strip().lstrip("#").strip()
672
- if ls:
673
- if len(ls) <= 90:
674
- return ls
675
- break
735
+ if not ls:
736
+ continue
737
+ # Skip SGML/XML wrapper lines (e.g. SEC EDGAR "<DOCUMENT>", "<TYPE>...").
738
+ if ls.startswith("<"):
739
+ continue
740
+ if len(ls) <= 90:
741
+ return ls
742
+ break
676
743
  if path is not None:
677
744
  return _titlecase(path.stem.replace("_", " ").replace("-", " "))
678
745
  return None
@@ -834,9 +901,15 @@ def _pdf_unescape(s: str) -> str:
834
901
 
835
902
 
836
903
  def _pdf_text_from_content(content: bytes) -> str:
904
+ """Pull text strings from a PDF content stream, but ONLY from inside text
905
+ objects (`BT` ... `ET`). Real text lives there; embedded fonts, images,
906
+ digital-signature blobs and metadata streams have no BT/ET, so gating on it
907
+ keeps their binary bytes (which often contain stray `(...)` sequences) out
908
+ of the output -- essential for real signed/font-embedded PDFs."""
837
909
  s = content.decode("latin-1", "replace")
838
910
  lines: List[str] = []
839
911
  cur: List[str] = []
912
+ in_text = False
840
913
 
841
914
  def flush() -> None:
842
915
  if cur:
@@ -845,17 +918,34 @@ def _pdf_text_from_content(content: bytes) -> str:
845
918
 
846
919
  for m in _PDF_TOKEN_RE.finditer(s):
847
920
  tok = m.group(0)
848
- if tok.startswith("("):
921
+ if tok == "BT":
922
+ flush()
923
+ in_text = True
924
+ elif tok == "ET":
925
+ flush()
926
+ in_text = False
927
+ elif not in_text:
928
+ continue
929
+ elif tok.startswith("("):
849
930
  cur.append(_pdf_unescape(tok[1:-1]))
850
931
  elif tok.startswith("["):
851
932
  for sm in re.finditer(r"\((?:\\.|[^\\()])*\)", tok):
852
933
  cur.append(_pdf_unescape(sm.group(0)[1:-1]))
853
- elif tok in ("Td", "TD", "T*", "'", '"', "BT", "ET"):
934
+ elif tok in ("Td", "TD", "T*", "'", '"'):
854
935
  flush()
855
936
  flush()
856
937
  return "\n".join(lines)
857
938
 
858
939
 
940
+ def _mostly_printable(s: str) -> bool:
941
+ """True if `s` is overwhelmingly printable text (backstop against a
942
+ malformed stream slipping binary through the BT/ET gate)."""
943
+ if not s:
944
+ return False
945
+ printable = sum(1 for ch in s if ch in "\n\t" or 32 <= ord(ch) < 127 or ord(ch) > 160)
946
+ return printable / len(s) >= 0.85
947
+
948
+
859
949
  def _read_pdf_stdlib(raw: bytes) -> str:
860
950
  import zlib
861
951
 
@@ -873,9 +963,11 @@ def _read_pdf_stdlib(raw: bytes) -> str:
873
963
  content = zlib.decompress(body)
874
964
  except Exception:
875
965
  content = body
876
- chunks.append(_pdf_text_from_content(content))
966
+ piece = _pdf_text_from_content(content)
967
+ if piece.strip() and _mostly_printable(piece):
968
+ chunks.append(piece)
877
969
  idx = e + len(b"endstream")
878
- return "\n".join(c for c in chunks if c.strip())
970
+ return "\n".join(chunks)
879
971
 
880
972
 
881
973
  def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, str, List[str]]:
@@ -1293,7 +1385,7 @@ def output_schema() -> JSON:
1293
1385
  "properties": {
1294
1386
  "canonical_title": {"type": ["string", "null"]},
1295
1387
  "detected_title": {"type": "string"},
1296
- "tier": {"enum": ["h2", "bold-numbered", "all-caps", "explicit", "llm"]},
1388
+ "tier": {"enum": ["h2", "bold-numbered", "numbered", "all-caps", "explicit", "llm"]},
1297
1389
  "span": {
1298
1390
  "type": "object",
1299
1391
  "required": ["start", "end"],
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.0"
7
+ version = "0.1.1"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -138,7 +138,7 @@
138
138
  "source": "deterministic"
139
139
  },
140
140
  "_meta": {
141
- "extractor_version": "0.1.0",
141
+ "extractor_version": "0.1.1",
142
142
  "tiers_used": [
143
143
  "deterministic"
144
144
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.0",
136
+ "extractor_version": "0.1.1",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.0",
136
+ "extractor_version": "0.1.1",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -138,7 +138,7 @@
138
138
  "source": "none"
139
139
  },
140
140
  "_meta": {
141
- "extractor_version": "0.1.0",
141
+ "extractor_version": "0.1.1",
142
142
  "tiers_used": [
143
143
  "deterministic"
144
144
  ],
@@ -48,7 +48,7 @@
48
48
  "source": "none"
49
49
  },
50
50
  "_meta": {
51
- "extractor_version": "0.1.0",
51
+ "extractor_version": "0.1.1",
52
52
  "tiers_used": [
53
53
  "deterministic"
54
54
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.0",
136
+ "extractor_version": "0.1.1",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -25,6 +25,50 @@ def test_tier3_all_caps() -> None:
25
25
  assert [c["tier"] for c in clauses] == ["all-caps", "all-caps"]
26
26
 
27
27
 
28
def test_tier_numbered_plain_headings() -> None:
    """Plain numbered, mixed-case, unbolded headings land in the `numbered` tier."""
    # Real-world dominant format: each section is a numbered heading + body.
    sections = [
        "1. Term And Nature Of Employment\n\nbody about term",
        "2. Wage Compensation\n\nbody about wages",
        "5. Termination\n\nbody about termination",
    ]
    found = ex.detect_clauses("\n\n".join(sections))
    tiers = [item["tier"] for item in found]
    assert tiers == ["numbered", "numbered", "numbered"]
    assert found[0]["title"] == "Term And Nature Of Employment"
    assert found[2]["title"] == "Termination"
37
+
38
+
39
def test_numbered_heading_rejects_sentences() -> None:
    """Heading-like titles qualify; numbered sentences and too-short words do not."""
    # "1. The Company shall pay..." is a numbered sentence, not a heading.
    headings = (
        "Wage Compensation",
        "Term And Nature Of Employment",
        "Termination",
    )
    non_headings = (
        "The Company shall pay the Employee monthly",
        "Fee",  # single word < 4 letters
        "EMPLOYEE shall be compensated on the basis of an annual salary",
    )
    for candidate in headings:
        assert ex._qualifies_as_numbered_heading(candidate)
    for candidate in non_headings:
        assert not ex._qualifies_as_numbered_heading(candidate)
48
+
49
+
50
def test_numbered_section_article_prefixes() -> None:
    """`Section N.` and `Article <roman>.` prefixes are detected as numbered headings."""
    paragraphs = [
        "Section 1. Definitions", "x",
        "Section 2. Confidentiality", "y",
        "Article IV. Governing Law", "z",
    ]
    found = ex.detect_clauses("\n\n".join(paragraphs))
    for item in found:
        assert item["tier"] == "numbered"
    assert found[0]["title"] == "Definitions"
    assert found[2]["title"] == "Governing Law"
57
+
58
+
59
def test_numbered_does_not_shadow_bold() -> None:
    """Bold-numbered must win over plain-numbered when both could match."""
    text = "**1. Purpose**\n\nx\n\n**2. Scope**\n\ny"
    tiers = [c["tier"] for c in ex.detect_clauses(text)]
    # Guard against a vacuous pass: all() over an empty result is True, which
    # would hide a regression where no clauses are detected at all.
    assert tiers
    assert all(tier == "bold-numbered" for tier in tiers)
63
+
64
+
65
def test_trailing_period_stripped_from_titles() -> None:
    """A trailing period is dropped from the canonical title, mapped or not."""
    canonical, _mapped = ex._canonicalize_clause("Other Benefits.")
    assert canonical == "Other Benefits"
    # And a mapped clause with a trailing period still maps.
    survival = ex._canonicalize_clause("Survival.")
    assert survival == ("Survival", True)
70
+
71
+
28
72
  def test_cascade_priority_h2_wins() -> None:
29
73
  # An H2 present means the bold/all-caps fallbacks must not fire.
30
74
  text = "## Real Heading\n\n**1. Not A Heading**\n\nALSO NOT A HEADING\n\nbody"
@@ -22,7 +22,7 @@ def test_version(capsys: pytest.CaptureFixture[str]) -> None:
22
22
  with pytest.raises(SystemExit) as exc:
23
23
  ex.main(["--version"])
24
24
  assert exc.value.code == 0
25
- assert "extract-cli 0.1.0" in capsys.readouterr().out
25
+ assert f"extract-cli {ex.__version__}" in capsys.readouterr().out
26
26
 
27
27
 
28
28
  def test_demo_runs(capsys: pytest.CaptureFixture[str]) -> None:
@@ -39,6 +39,25 @@ def test_dates_iso_normalization() -> None:
39
39
  assert out["source"] == "deterministic"
40
40
 
41
41
 
42
def test_dates_effective_date_label_and_as_of() -> None:
    """Effective date is found via the labeled anchor and via a bare `as of` cue."""
    # The "(the "Effective Date")" anchor, with the date wrapping a newline.
    wrapped = 'between A and B as of August\n31, 2016 (the "Effective Date").'
    labeled = ex.extract_dates(wrapped)["effective"]
    assert labeled["value"] == "2016-08-31"
    # Bare "as of <date>" cue.
    bare = ex.extract_dates("dated as of June 1, 2023")["effective"]
    assert bare["value"] == "2023-06-01"
48
+
49
+
50
def test_term_length_rejects_non_number() -> None:
    """A non-numeric token before a unit must not produce a term length."""
    # "...for consecutive days" must NOT be reported as a term length.
    sentence = "the Employment Period shall run for consecutive days as scheduled"
    length = ex.extract_term(sentence)["length"]
    assert length["source"] == "none"
54
+
55
+
56
def test_title_skips_sgml_wrapper() -> None:
    """Leading SGML/XML wrapper lines are skipped when picking the title."""
    lines = ["<DOCUMENT>", "<TYPE>EX-10", "<TEXT>", "", "EMPLOYMENT AGREEMENT", "", "body"]
    title = ex.extract_title("\n".join(lines), None, "text")
    assert title == "EMPLOYMENT AGREEMENT"
59
+
60
+
42
61
  def test_dates_missing() -> None:
43
62
  out = ex.extract_dates("no dates in here")
44
63
  assert out["effective"] == ex._none_field()
@@ -142,6 +142,19 @@ def test_pdf_unescape() -> None:
142
142
  assert ex._pdf_unescape(r"\101\102") == "AB" # octal escapes
143
143
 
144
144
 
145
def test_pdf_text_only_inside_bt_et() -> None:
    """Only strings inside a BT ... ET text object contribute to the output."""
    # Strings outside BT/ET (font/signature/metadata stream bytes that happen
    # to contain parentheses) must be ignored.
    stream = b"(garbage outside) /Font << >> BT (real text) Tj ET (more garbage)"
    extracted = ex._pdf_text_from_content(stream)
    assert extracted == "real text"
150
+
151
+
152
def test_pdf_mostly_printable_backstop() -> None:
    """Plain text passes the printable check; control bytes and empty input fail."""
    clean = "Hello, world"
    binary = "\x00\x01\x02\x03\x04\x05\x06\x07"
    empty = ""
    assert ex._mostly_printable(clean)
    assert not ex._mostly_printable(binary)
    assert not ex._mostly_printable(empty)
156
+
157
+
145
158
  def test_extract_json_object_from_noise() -> None:
146
159
  assert ex._extract_json_object('prefix {"a": 1} suffix') == {"a": 1}
147
160
  assert ex._extract_json_object("no json here") is None
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes