extract-cli 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.0 → extract_cli-0.1.1}/CHANGELOG.md +26 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/PKG-INFO +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.1}/docs/spec/extract-output.schema.json +1 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/extract_cli.py +119 -27
- {extract_cli-0.1.0 → extract_cli-0.1.1}/pyproject.toml +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/employment_docx.docx.expected.json +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/nda_h2.md.expected.json +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/services_bold.txt.expected.json +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_clause_map.py +44 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_cli.py +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_deterministic.py +19 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_misc.py +13 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/.gitignore +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/ARCHITECTURE.md +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/LICENSE +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/Makefile +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/README.md +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/config/llm.json.example +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/docs/INTEROP.md +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/scripts/release.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/_fixtures_build.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/_make_goldens.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/conftest.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_llm.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_property.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_schema_conformance.py +0 -0
|
@@ -6,6 +6,31 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.1] - 2026-05-21
|
|
10
|
+
|
|
11
|
+
Real-world hardening, driven by testing against a SEC EDGAR employment
|
|
12
|
+
agreement and the Common Paper Mutual NDA (PDF/DOCX).
|
|
13
|
+
|
|
14
|
+
### Added
|
|
15
|
+
- **`numbered` clause-detection tier** for plain numbered headings
|
|
16
|
+
(`1. Termination`, `Section 3. Payment`, `Article IV. …`) — the dominant
|
|
17
|
+
format in foreign paper, missed by the H2/bold/ALL-CAPS tiers. A title-case
|
|
18
|
+
heuristic rejects numbered sentences and list items. The output schema's
|
|
19
|
+
clause `tier` enum gains `numbered` (a backward-compatible widening).
|
|
20
|
+
|
|
21
|
+
### Fixed
|
|
22
|
+
- **PDF reader** now extracts text only from inside `BT … ET` text objects, so
|
|
23
|
+
embedded fonts, digital-signature blobs, and metadata streams no longer leak
|
|
24
|
+
binary noise (a real signed PDF dropped from ~188 KB of garbage to ~8.7 KB of
|
|
25
|
+
clean text). Added a printable-ratio backstop.
|
|
26
|
+
- **Effective date**: anchor on `(the "Effective Date")` and a bare
|
|
27
|
+
`as of <date>` cue; handle dates that wrap across a line break.
|
|
28
|
+
- **Term length**: require a real number, dropping false positives such as
|
|
29
|
+
`…consecutive days`.
|
|
30
|
+
- **Title**: skip SGML/XML wrapper lines (e.g. SEC EDGAR `<DOCUMENT>` headers).
|
|
31
|
+
- Strip trailing punctuation from clause titles (`Other Benefits.` →
|
|
32
|
+
`Other Benefits`).
|
|
33
|
+
|
|
9
34
|
## [0.1.0] - 2026-05-21
|
|
10
35
|
|
|
11
36
|
Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
@@ -57,4 +82,5 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
57
82
|
intentionally *not* governed by the output schema (the schema describes the
|
|
58
83
|
full default output).
|
|
59
84
|
|
|
85
|
+
[0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
|
|
60
86
|
[0.1.0]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -42,11 +42,11 @@ import urllib.request
|
|
|
42
42
|
from pathlib import Path
|
|
43
43
|
from typing import Any, Dict, List, Optional, Tuple
|
|
44
44
|
|
|
45
|
-
__version__ = "0.1.0"
|
|
45
|
+
__version__ = "0.1.1"
|
|
46
46
|
|
|
47
47
|
# Bumped independently of the package version when the *extraction logic*
|
|
48
48
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
49
|
-
EXTRACTOR_VERSION = "0.1.0"
|
|
49
|
+
EXTRACTOR_VERSION = "0.1.1"
|
|
50
50
|
|
|
51
51
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
52
52
|
SCHEMA_VERSION = 1
|
|
@@ -214,6 +214,49 @@ def _qualifies_as_all_caps_heading(title: str) -> bool:
|
|
|
214
214
|
return sum(1 for ch in title if "A" <= ch <= "Z") >= 4
|
|
215
215
|
|
|
216
216
|
|
|
217
|
+
# Tier between bold-numbered and ALL-CAPS: plain numbered headings on their own
|
|
218
|
+
# line -- "1. Termination", "5. Wage Compensation", "Section 3. Payment",
|
|
219
|
+
# "Article IV. Confidentiality". These are the dominant real-world format in
|
|
220
|
+
# foreign paper (and aren't caught by H2, **bold**, or ALL-CAPS). A title-case
|
|
221
|
+
# heuristic distinguishes a heading from a numbered *sentence* or list item.
|
|
222
|
+
_NUMBERED_HEADING_RE = re.compile(
|
|
223
|
+
r"^[ \t]*"
|
|
224
|
+
r"(?:(?:Article|Section|ARTICLE|SECTION)[ \t]+)?"
|
|
225
|
+
r"(?:" + _ROMAN_RE + r"|\d{1,2})\.?"
|
|
226
|
+
r"[ \t]+"
|
|
227
|
+
r"([A-Z][A-Za-z][^\n]{0,58})"
|
|
228
|
+
r"[ \t]*$",
|
|
229
|
+
re.MULTILINE,
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
# Lowercase words allowed inside an otherwise Title-Cased heading.
|
|
233
|
+
_HEADING_STOPWORDS = {
|
|
234
|
+
"a", "an", "the", "and", "or", "of", "to", "for", "in", "on", "with",
|
|
235
|
+
"by", "at", "as", "per", "from", "into", "nor", "but",
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _qualifies_as_numbered_heading(title: str) -> bool:
|
|
240
|
+
"""A numbered line qualifies as a heading only if its title looks like a
|
|
241
|
+
heading: 1-9 words, Title-Cased (every word starts uppercase or is a short
|
|
242
|
+
lowercase connector), no sentence-y lowercase verbs. A single word must be
|
|
243
|
+
>= 4 letters. Rejects 'The parties agree as follows' but accepts 'Wage
|
|
244
|
+
Compensation' and 'Term And Nature Of Employment'."""
|
|
245
|
+
t = title.strip().rstrip(".").strip()
|
|
246
|
+
words = t.split()
|
|
247
|
+
if not (1 <= len(words) <= 9):
|
|
248
|
+
return False
|
|
249
|
+
if len(words) == 1:
|
|
250
|
+
return sum(1 for ch in words[0] if ch.isalpha()) >= 4 and words[0][:1].isupper()
|
|
251
|
+
for w in words:
|
|
252
|
+
if w[:1].isupper() or not w[:1].isalpha():
|
|
253
|
+
continue # capitalized word, or punctuation/number token
|
|
254
|
+
if w.lower() in _HEADING_STOPWORDS:
|
|
255
|
+
continue # allowed connector
|
|
256
|
+
return False # a lowercase content word => this is a sentence, not a heading
|
|
257
|
+
return True
|
|
258
|
+
|
|
259
|
+
|
|
217
260
|
def detect_clauses(text: str) -> List[JSON]:
|
|
218
261
|
"""Run the three-tier cascade and return clauses with their detection tier.
|
|
219
262
|
|
|
@@ -227,6 +270,12 @@ def detect_clauses(text: str) -> List[JSON]:
|
|
|
227
270
|
bold = list(_BOLD_HEADING_RE.finditer(text))
|
|
228
271
|
if len(bold) >= 2:
|
|
229
272
|
return _matches_to_clauses(text, bold, group=1, tier="bold-numbered")
|
|
273
|
+
numbered = [
|
|
274
|
+
m for m in _NUMBERED_HEADING_RE.finditer(text)
|
|
275
|
+
if _qualifies_as_numbered_heading(m.group(1))
|
|
276
|
+
]
|
|
277
|
+
if len(numbered) >= 2:
|
|
278
|
+
return _matches_to_clauses(text, numbered, group=1, tier="numbered")
|
|
230
279
|
caps = [
|
|
231
280
|
m for m in _ALL_CAPS_HEADING_RE.finditer(text)
|
|
232
281
|
if _qualifies_as_all_caps_heading(m.group(1))
|
|
@@ -266,8 +315,9 @@ def _matches_to_clauses(text: str, matches: List["re.Match[str]"], group: int,
|
|
|
266
315
|
|
|
267
316
|
|
|
268
317
|
def _norm_clause_key(s: str) -> str:
|
|
269
|
-
"""Normalize a clause title/alias for matching (number-stripped,
|
|
270
|
-
|
|
318
|
+
"""Normalize a clause title/alias for matching (number-stripped, trailing
|
|
319
|
+
punctuation removed, lowercased)."""
|
|
320
|
+
return _strip_clause_number(s).strip().lower().rstrip(" .:;,")
|
|
271
321
|
|
|
272
322
|
|
|
273
323
|
# ---------------------------------------------------------------------------
|
|
@@ -366,7 +416,7 @@ def _canonicalize_clause(detected_title: str) -> Tuple[Optional[str], bool]:
|
|
|
366
416
|
best, best_len = canonical, len(alias_key)
|
|
367
417
|
if best is not None:
|
|
368
418
|
return best, True
|
|
369
|
-
return _titlecase(detected_title), False
|
|
419
|
+
return _titlecase(detected_title.strip().rstrip(" .:;,")), False
|
|
370
420
|
|
|
371
421
|
|
|
372
422
|
# ---------------------------------------------------------------------------
|
|
@@ -421,11 +471,17 @@ _DATE_PAT = (
|
|
|
421
471
|
)
|
|
422
472
|
_DATE_RE = re.compile(_DATE_PAT, re.IGNORECASE)
|
|
423
473
|
|
|
474
|
+
# Highest-confidence: a date explicitly labeled "(the "Effective Date")".
|
|
475
|
+
_EFFDATE_LABEL_RE = re.compile(
|
|
476
|
+
r"(" + _DATE_PAT + r")\s*\(\s*(?:the\s+)?[\"“]?\s*Effective\s+Date",
|
|
477
|
+
re.IGNORECASE,
|
|
478
|
+
)
|
|
424
479
|
_EFFECTIVE_RE = re.compile(
|
|
425
480
|
r"(?:effective(?:\s+date)?(?:\s+(?:as\s+of|date|on))?|"
|
|
426
481
|
r"dated(?:\s+as\s+of)?|"
|
|
427
482
|
r"made(?:\s+and\s+entered\s+into)?(?:\s+as\s+of|\s+on)?|"
|
|
428
|
-
r"entered\s+into(?:\s+as\s+of|\s+on))"
|
|
483
|
+
r"entered\s+into(?:\s+as\s+of|\s+on)?|"
|
|
484
|
+
r"as\s+of)"
|
|
429
485
|
r"[\s:,]+(?:the\s+)?(" + _DATE_PAT + r")",
|
|
430
486
|
re.IGNORECASE,
|
|
431
487
|
)
|
|
@@ -534,14 +590,18 @@ def _parse_date_to_iso(s: str) -> Optional[str]:
|
|
|
534
590
|
return None
|
|
535
591
|
|
|
536
592
|
|
|
593
|
+
def _date_field_from_str(raw: str, base_conf: float) -> JSON:
|
|
594
|
+
raw = re.sub(r"\s+", " ", raw.strip())
|
|
595
|
+
iso = _parse_date_to_iso(raw)
|
|
596
|
+
if iso is not None:
|
|
597
|
+
return _field(iso, base_conf)
|
|
598
|
+
return _field(raw, max(0.0, base_conf - 0.3))
|
|
599
|
+
|
|
600
|
+
|
|
537
601
|
def _date_field(match: Optional["re.Match[str]"]) -> JSON:
|
|
538
602
|
if match is None:
|
|
539
603
|
return _none_field()
|
|
540
|
-
|
|
541
|
-
iso = _parse_date_to_iso(raw)
|
|
542
|
-
if iso is not None:
|
|
543
|
-
return _field(iso, 0.85)
|
|
544
|
-
return _field(raw, 0.55)
|
|
604
|
+
return _date_field_from_str(match.group(1), 0.85)
|
|
545
605
|
|
|
546
606
|
|
|
547
607
|
def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
|
|
@@ -578,10 +638,12 @@ def extract_parties(text: str) -> List[JSON]:
|
|
|
578
638
|
|
|
579
639
|
|
|
580
640
|
def extract_dates(text: str) -> JSON:
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
641
|
+
label = _EFFDATE_LABEL_RE.search(text)
|
|
642
|
+
if label is not None:
|
|
643
|
+
effective = _date_field_from_str(label.group(1), 0.9)
|
|
644
|
+
else:
|
|
645
|
+
effective = _date_field(_EFFECTIVE_RE.search(text))
|
|
646
|
+
return {"effective": effective, "expiration": _date_field(_EXPIRE_RE.search(text))}
|
|
585
647
|
|
|
586
648
|
|
|
587
649
|
def extract_governing_law(text: str) -> JSON:
|
|
@@ -600,10 +662,10 @@ def extract_term(text: str) -> JSON:
|
|
|
600
662
|
if m:
|
|
601
663
|
num = _word_to_int(m.group(1))
|
|
602
664
|
unit = m.group(2).lower().rstrip("s")
|
|
665
|
+
# Only emit when the captured token is a real number; otherwise the
|
|
666
|
+
# match was a coincidence ("...consecutive days") -> leave as not-found.
|
|
603
667
|
if num is not None:
|
|
604
668
|
length = _field(f"{num} {unit}{'s' if num != 1 else ''}", 0.7)
|
|
605
|
-
else:
|
|
606
|
-
length = _field(f"{m.group(1)} {m.group(2)}".strip(), 0.5)
|
|
607
669
|
|
|
608
670
|
notice = _none_field()
|
|
609
671
|
nm = _NOTICE_RE.search(text)
|
|
@@ -649,7 +711,8 @@ def extract_clauses(text: str) -> List[JSON]:
|
|
|
649
711
|
for c in detect_clauses(text):
|
|
650
712
|
canonical, mapped = _canonicalize_clause(c["title"])
|
|
651
713
|
tier = c["tier"]
|
|
652
|
-
base = {"h2": 0.95, "bold-numbered": 0.85, "all-caps": 0.75, "explicit": 0.95}.get(tier, 0.7)
|
|
714
|
+
base = {"h2": 0.95, "bold-numbered": 0.85, "numbered": 0.8,
|
|
715
|
+
"all-caps": 0.75, "explicit": 0.95}.get(tier, 0.7)
|
|
653
716
|
conf = round(base * (1.0 if mapped else 0.75), 2)
|
|
654
717
|
out.append({
|
|
655
718
|
"canonical_title": canonical,
|
|
@@ -669,10 +732,14 @@ def extract_title(text: str, path: Optional[Path], fmt: str) -> Optional[str]:
|
|
|
669
732
|
return m.group(1).strip()
|
|
670
733
|
for line in text.splitlines():
|
|
671
734
|
ls = line.strip().lstrip("#").strip()
|
|
672
|
-
if ls:
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
735
|
+
if not ls:
|
|
736
|
+
continue
|
|
737
|
+
# Skip SGML/XML wrapper lines (e.g. SEC EDGAR "<DOCUMENT>", "<TYPE>...").
|
|
738
|
+
if ls.startswith("<"):
|
|
739
|
+
continue
|
|
740
|
+
if len(ls) <= 90:
|
|
741
|
+
return ls
|
|
742
|
+
break
|
|
676
743
|
if path is not None:
|
|
677
744
|
return _titlecase(path.stem.replace("_", " ").replace("-", " "))
|
|
678
745
|
return None
|
|
@@ -834,9 +901,15 @@ def _pdf_unescape(s: str) -> str:
|
|
|
834
901
|
|
|
835
902
|
|
|
836
903
|
def _pdf_text_from_content(content: bytes) -> str:
|
|
904
|
+
"""Pull text strings from a PDF content stream, but ONLY from inside text
|
|
905
|
+
objects (`BT` ... `ET`). Real text lives there; embedded fonts, images,
|
|
906
|
+
digital-signature blobs and metadata streams have no BT/ET, so gating on it
|
|
907
|
+
keeps their binary bytes (which often contain stray `(...)` sequences) out
|
|
908
|
+
of the output -- essential for real signed/font-embedded PDFs."""
|
|
837
909
|
s = content.decode("latin-1", "replace")
|
|
838
910
|
lines: List[str] = []
|
|
839
911
|
cur: List[str] = []
|
|
912
|
+
in_text = False
|
|
840
913
|
|
|
841
914
|
def flush() -> None:
|
|
842
915
|
if cur:
|
|
@@ -845,17 +918,34 @@ def _pdf_text_from_content(content: bytes) -> str:
|
|
|
845
918
|
|
|
846
919
|
for m in _PDF_TOKEN_RE.finditer(s):
|
|
847
920
|
tok = m.group(0)
|
|
848
|
-
if tok.startswith("("):
|
|
921
|
+
if tok == "BT":
|
|
922
|
+
flush()
|
|
923
|
+
in_text = True
|
|
924
|
+
elif tok == "ET":
|
|
925
|
+
flush()
|
|
926
|
+
in_text = False
|
|
927
|
+
elif not in_text:
|
|
928
|
+
continue
|
|
929
|
+
elif tok.startswith("("):
|
|
849
930
|
cur.append(_pdf_unescape(tok[1:-1]))
|
|
850
931
|
elif tok.startswith("["):
|
|
851
932
|
for sm in re.finditer(r"\((?:\\.|[^\\()])*\)", tok):
|
|
852
933
|
cur.append(_pdf_unescape(sm.group(0)[1:-1]))
|
|
853
|
-
elif tok in ("Td", "TD", "T*", "'", '"'
|
|
934
|
+
elif tok in ("Td", "TD", "T*", "'", '"'):
|
|
854
935
|
flush()
|
|
855
936
|
flush()
|
|
856
937
|
return "\n".join(lines)
|
|
857
938
|
|
|
858
939
|
|
|
940
|
+
def _mostly_printable(s: str) -> bool:
|
|
941
|
+
"""True if `s` is overwhelmingly printable text (backstop against a
|
|
942
|
+
malformed stream slipping binary through the BT/ET gate)."""
|
|
943
|
+
if not s:
|
|
944
|
+
return False
|
|
945
|
+
printable = sum(1 for ch in s if ch in "\n\t" or 32 <= ord(ch) < 127 or ord(ch) > 160)
|
|
946
|
+
return printable / len(s) >= 0.85
|
|
947
|
+
|
|
948
|
+
|
|
859
949
|
def _read_pdf_stdlib(raw: bytes) -> str:
|
|
860
950
|
import zlib
|
|
861
951
|
|
|
@@ -873,9 +963,11 @@ def _read_pdf_stdlib(raw: bytes) -> str:
|
|
|
873
963
|
content = zlib.decompress(body)
|
|
874
964
|
except Exception:
|
|
875
965
|
content = body
|
|
876
|
-
|
|
966
|
+
piece = _pdf_text_from_content(content)
|
|
967
|
+
if piece.strip() and _mostly_printable(piece):
|
|
968
|
+
chunks.append(piece)
|
|
877
969
|
idx = e + len(b"endstream")
|
|
878
|
-
return "\n".join(
|
|
970
|
+
return "\n".join(chunks)
|
|
879
971
|
|
|
880
972
|
|
|
881
973
|
def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, str, List[str]]:
|
|
@@ -1293,7 +1385,7 @@ def output_schema() -> JSON:
|
|
|
1293
1385
|
"properties": {
|
|
1294
1386
|
"canonical_title": {"type": ["string", "null"]},
|
|
1295
1387
|
"detected_title": {"type": "string"},
|
|
1296
|
-
"tier": {"enum": ["h2", "bold-numbered", "all-caps", "explicit", "llm"]},
|
|
1388
|
+
"tier": {"enum": ["h2", "bold-numbered", "numbered", "all-caps", "explicit", "llm"]},
|
|
1297
1389
|
"span": {
|
|
1298
1390
|
"type": "object",
|
|
1299
1391
|
"required": ["start", "end"],
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.1.1"
|
|
8
8
|
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -25,6 +25,50 @@ def test_tier3_all_caps() -> None:
|
|
|
25
25
|
assert [c["tier"] for c in clauses] == ["all-caps", "all-caps"]
|
|
26
26
|
|
|
27
27
|
|
|
28
|
+
def test_tier_numbered_plain_headings() -> None:
|
|
29
|
+
# Real-world dominant format: plain numbered, mixed-case, unbolded headings.
|
|
30
|
+
text = ("1. Term And Nature Of Employment\n\nbody about term\n\n"
|
|
31
|
+
"2. Wage Compensation\n\nbody about wages\n\n"
|
|
32
|
+
"5. Termination\n\nbody about termination")
|
|
33
|
+
clauses = ex.detect_clauses(text)
|
|
34
|
+
assert [c["tier"] for c in clauses] == ["numbered", "numbered", "numbered"]
|
|
35
|
+
assert clauses[0]["title"] == "Term And Nature Of Employment"
|
|
36
|
+
assert clauses[2]["title"] == "Termination"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_numbered_heading_rejects_sentences() -> None:
|
|
40
|
+
# "1. The Company shall pay..." is a numbered sentence, not a heading.
|
|
41
|
+
assert ex._qualifies_as_numbered_heading("Wage Compensation")
|
|
42
|
+
assert ex._qualifies_as_numbered_heading("Term And Nature Of Employment")
|
|
43
|
+
assert ex._qualifies_as_numbered_heading("Termination")
|
|
44
|
+
assert not ex._qualifies_as_numbered_heading("The Company shall pay the Employee monthly")
|
|
45
|
+
assert not ex._qualifies_as_numbered_heading("Fee") # single word < 4 letters
|
|
46
|
+
assert not ex._qualifies_as_numbered_heading(
|
|
47
|
+
"EMPLOYEE shall be compensated on the basis of an annual salary")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_numbered_section_article_prefixes() -> None:
|
|
51
|
+
text = ("Section 1. Definitions\n\nx\n\nSection 2. Confidentiality\n\ny\n\n"
|
|
52
|
+
"Article IV. Governing Law\n\nz")
|
|
53
|
+
clauses = ex.detect_clauses(text)
|
|
54
|
+
assert all(c["tier"] == "numbered" for c in clauses)
|
|
55
|
+
assert clauses[0]["title"] == "Definitions"
|
|
56
|
+
assert clauses[2]["title"] == "Governing Law"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_numbered_does_not_shadow_bold() -> None:
|
|
60
|
+
# Bold-numbered must win over plain-numbered when both could match.
|
|
61
|
+
text = "**1. Purpose**\n\nx\n\n**2. Scope**\n\ny"
|
|
62
|
+
assert all(c["tier"] == "bold-numbered" for c in ex.detect_clauses(text))
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def test_trailing_period_stripped_from_titles() -> None:
|
|
66
|
+
canon, mapped = ex._canonicalize_clause("Other Benefits.")
|
|
67
|
+
assert canon == "Other Benefits"
|
|
68
|
+
# And a mapped clause with a trailing period still maps.
|
|
69
|
+
assert ex._canonicalize_clause("Survival.") == ("Survival", True)
|
|
70
|
+
|
|
71
|
+
|
|
28
72
|
def test_cascade_priority_h2_wins() -> None:
|
|
29
73
|
# An H2 present means the bold/all-caps fallbacks must not fire.
|
|
30
74
|
text = "## Real Heading\n\n**1. Not A Heading**\n\nALSO NOT A HEADING\n\nbody"
|
|
@@ -22,7 +22,7 @@ def test_version(capsys: pytest.CaptureFixture[str]) -> None:
|
|
|
22
22
|
with pytest.raises(SystemExit) as exc:
|
|
23
23
|
ex.main(["--version"])
|
|
24
24
|
assert exc.value.code == 0
|
|
25
|
-
assert "extract-cli 0.1.0" in capsys.readouterr().out
|
|
25
|
+
assert f"extract-cli {ex.__version__}" in capsys.readouterr().out
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def test_demo_runs(capsys: pytest.CaptureFixture[str]) -> None:
|
|
@@ -39,6 +39,25 @@ def test_dates_iso_normalization() -> None:
|
|
|
39
39
|
assert out["source"] == "deterministic"
|
|
40
40
|
|
|
41
41
|
|
|
42
|
+
def test_dates_effective_date_label_and_as_of() -> None:
|
|
43
|
+
# The "(the "Effective Date")" anchor, with the date wrapping a newline.
|
|
44
|
+
text = 'between A and B as of August\n31, 2016 (the "Effective Date").'
|
|
45
|
+
assert ex.extract_dates(text)["effective"]["value"] == "2016-08-31"
|
|
46
|
+
# Bare "as of <date>" cue.
|
|
47
|
+
assert ex.extract_dates("dated as of June 1, 2023")["effective"]["value"] == "2023-06-01"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_term_length_rejects_non_number() -> None:
|
|
51
|
+
# "...for consecutive days" must NOT be reported as a term length.
|
|
52
|
+
text = "the Employment Period shall run for consecutive days as scheduled"
|
|
53
|
+
assert ex.extract_term(text)["length"]["source"] == "none"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_title_skips_sgml_wrapper() -> None:
|
|
57
|
+
text = "<DOCUMENT>\n<TYPE>EX-10\n<TEXT>\n\nEMPLOYMENT AGREEMENT\n\nbody"
|
|
58
|
+
assert ex.extract_title(text, None, "text") == "EMPLOYMENT AGREEMENT"
|
|
59
|
+
|
|
60
|
+
|
|
42
61
|
def test_dates_missing() -> None:
|
|
43
62
|
out = ex.extract_dates("no dates in here")
|
|
44
63
|
assert out["effective"] == ex._none_field()
|
|
@@ -142,6 +142,19 @@ def test_pdf_unescape() -> None:
|
|
|
142
142
|
assert ex._pdf_unescape(r"\101\102") == "AB" # octal escapes
|
|
143
143
|
|
|
144
144
|
|
|
145
|
+
def test_pdf_text_only_inside_bt_et() -> None:
|
|
146
|
+
# Strings outside BT/ET (font/signature/metadata stream bytes that happen to
|
|
147
|
+
# contain parentheses) must be ignored; only text objects yield text.
|
|
148
|
+
content = b"(garbage outside) /Font << >> BT (real text) Tj ET (more garbage)"
|
|
149
|
+
assert ex._pdf_text_from_content(content) == "real text"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def test_pdf_mostly_printable_backstop() -> None:
|
|
153
|
+
assert ex._mostly_printable("Hello, world")
|
|
154
|
+
assert not ex._mostly_printable("\x00\x01\x02\x03\x04\x05\x06\x07")
|
|
155
|
+
assert not ex._mostly_printable("")
|
|
156
|
+
|
|
157
|
+
|
|
145
158
|
def test_extract_json_object_from_noise() -> None:
|
|
146
159
|
assert ex._extract_json_object('prefix {"a": 1} suffix') == {"a": 1}
|
|
147
160
|
assert ex._extract_json_object("no json here") is None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|