extract-cli 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {extract_cli-0.1.2 → extract_cli-0.1.3}/CHANGELOG.md +26 -0
  2. {extract_cli-0.1.2 → extract_cli-0.1.3}/PKG-INFO +1 -1
  3. {extract_cli-0.1.2 → extract_cli-0.1.3}/extract_cli.py +47 -4
  4. {extract_cli-0.1.2 → extract_cli-0.1.3}/pyproject.toml +1 -1
  5. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/employment_docx.docx.expected.json +1 -1
  6. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  7. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  8. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/nda_h2.md.expected.json +1 -1
  9. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/scanned.pdf.expected.json +1 -1
  10. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/services_bold.txt.expected.json +1 -1
  11. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/services_html.html.expected.json +1 -1
  12. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/test_clause_map.py +25 -0
  13. {extract_cli-0.1.2 → extract_cli-0.1.3}/.gitignore +0 -0
  14. {extract_cli-0.1.2 → extract_cli-0.1.3}/ARCHITECTURE.md +0 -0
  15. {extract_cli-0.1.2 → extract_cli-0.1.3}/CONTRIBUTING.md +0 -0
  16. {extract_cli-0.1.2 → extract_cli-0.1.3}/LICENSE +0 -0
  17. {extract_cli-0.1.2 → extract_cli-0.1.3}/Makefile +0 -0
  18. {extract_cli-0.1.2 → extract_cli-0.1.3}/README.md +0 -0
  19. {extract_cli-0.1.2 → extract_cli-0.1.3}/config/llm.json.example +0 -0
  20. {extract_cli-0.1.2 → extract_cli-0.1.3}/docs/INTEROP.md +0 -0
  21. {extract_cli-0.1.2 → extract_cli-0.1.3}/docs/spec/extract-output.schema.json +0 -0
  22. {extract_cli-0.1.2 → extract_cli-0.1.3}/scripts/release.py +0 -0
  23. {extract_cli-0.1.2 → extract_cli-0.1.3}/scripts/validate_against_spec.py +0 -0
  24. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/_fixtures_build.py +0 -0
  25. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/_make_goldens.py +0 -0
  26. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/_schema_validator.py +0 -0
  27. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/conftest.py +0 -0
  28. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/employment_docx.docx +0 -0
  29. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/lease_allcaps.txt +0 -0
  30. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/license_pdf.pdf +0 -0
  31. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/nda_h2.md +0 -0
  32. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/scanned.pdf +0 -0
  33. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/services_bold.txt +0 -0
  34. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/fixtures/services_html.html +0 -0
  35. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/test_cli.py +0 -0
  36. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/test_deterministic.py +0 -0
  37. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/test_llm.py +0 -0
  38. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/test_misc.py +0 -0
  39. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/test_property.py +0 -0
  40. {extract_cli-0.1.2 → extract_cli-0.1.3}/tests/test_schema_conformance.py +0 -0
@@ -6,6 +6,31 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.3] - 2026-05-21
10
+
11
+ Clause-map de-noising and party cleanup, driven by testing against 10 more
12
+ contracts (SEC EDGAR credit, loan, employment, lease, asset-purchase, and
13
+ consulting HTML exhibits; Apache PDFs).
14
+
15
+ ### Fixed
16
+ - **Clause map drops structural noise** common in dense real documents:
17
+ a heading whose title repeats 3+ times is treated as a running header/footer
18
+ (one lease's `Ks 112708-2` page code went from 44 "clauses" to 0), and
19
+ front/back-matter (`Table of Contents`, `Exhibit B`, `Schedule 2.1`) and
20
+ document codes/page numbers (4+ consecutive digits) are filtered out.
21
+ - **Party-name cleanup** extended: trailing `together with …`, `, as
22
+ administrative agent`, and a dangling unclosed parenthetical
23
+ (`(each of them being`) are trimmed.
24
+
25
+ ### Notes
26
+ - On dense documents the deterministic clause map can still surface a few
27
+ non-clause headings (e.g. address lines in a notices block); consumers
28
+ wanting only suite-vocabulary clauses should filter on `mapped == true`,
29
+ which isolates the real clauses (the noise is always `mapped == false`).
30
+ - Known best-effort edge cases on varied real paper: a bare role word as a
31
+ party name ("Landlord"), and a middle-initial period truncating a personal
32
+ name ("John C." → "John C"). Best-effort fields carry confidence/source.
33
+
9
34
  ## [0.1.2] - 2026-05-21
10
35
 
11
36
  More real-world hardening, driven by testing against five additional contracts
@@ -115,6 +140,7 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
115
140
  intentionally *not* governed by the output schema (the schema describes the
116
141
  full default output).
117
142
 
143
+ [0.1.3]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.3
118
144
  [0.1.2]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.2
119
145
  [0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
120
146
  [0.1.0]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -43,11 +43,11 @@ import urllib.request
43
43
  from pathlib import Path
44
44
  from typing import Any, Dict, List, Optional, Tuple
45
45
 
46
- __version__ = "0.1.2"
46
+ __version__ = "0.1.3"
47
47
 
48
48
  # Bumped independently of the package version when the *extraction logic*
49
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
50
- EXTRACTOR_VERSION = "0.1.2"
50
+ EXTRACTOR_VERSION = "0.1.3"
51
51
 
52
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
53
53
  SCHEMA_VERSION = 1
@@ -629,18 +629,25 @@ _PARTY_CUT_MARKERS: Tuple[str, ...] = (
629
629
  r"\s+organized\b",
630
630
  r"\s+incorporated\b",
631
631
  r"\s+whose\b",
632
+ r"\s+together\b",
633
+ r",\s+as\s+\w", # ", as administrative agent"
632
634
  r"\s+(?:as\s+of|dated|effective)\b",
633
635
  )
634
636
 
635
637
 
636
638
  def _clean_party_name(s: str) -> str:
637
639
  """Trim a captured party name down to the entity name, dropping trailing
638
- descriptors ('a Delaware corporation', 'd/b/a ...', 'as of ...')."""
640
+ descriptors ('a Delaware corporation', 'd/b/a ...', 'together with ...',
641
+ 'as of ...') and any dangling unclosed parenthetical ('(each of them ...')."""
639
642
  s = re.sub(r"\s+", " ", s).strip().strip(",").strip()
640
643
  for pat in _PARTY_CUT_MARKERS:
641
644
  m = re.search(pat, s, re.IGNORECASE)
642
645
  if m:
643
646
  s = s[: m.start()].strip().strip(",").strip()
647
+ # Drop a trailing parenthetical that was opened but never closed (the close
648
+ # fell outside the captured span), e.g. "Glenn Rufrano (each of them being".
649
+ if "(" in s and ")" not in s:
650
+ s = s[: s.index("(")].strip().strip(",").strip()
644
651
  return s.strip("\"“”").strip()
645
652
 
646
653
 
@@ -741,9 +748,45 @@ def extract_defined_terms(text: str) -> List[JSON]:
741
748
  return [{"term": t, "confidence": 0.6, "source": "deterministic"} for t in seen]
742
749
 
743
750
 
751
+ # Detected-heading titles that are almost never real clauses: front/back-matter,
752
+ # page/document codes, exhibit & schedule references.
753
+ _NOISE_TITLE_PREFIX_RE = re.compile(
754
+ r"^(?:table\s+of\s+contents|exhibit|schedule|annex|appendix|attachment|"
755
+ r"signature\s+page|page)\b",
756
+ re.IGNORECASE,
757
+ )
758
+
759
+
760
+ def _is_noise_clause_title(title: str) -> bool:
761
+ """True for detected 'headings' that are structural noise rather than
762
+ clauses -- document codes/page numbers (4+ consecutive digits, e.g.
763
+ 'Ks 112708-2'), and front/back-matter like 'Table of Contents' or
764
+ 'Exhibit B'. Safe filters only; kept conservative to avoid dropping real
765
+ clauses."""
766
+ t = title.strip()
767
+ if re.search(r"\d{4,}", t):
768
+ return True
769
+ if _NOISE_TITLE_PREFIX_RE.match(t):
770
+ return True
771
+ return False
772
+
773
+
744
774
  def extract_clauses(text: str) -> List[JSON]:
775
+ detected = detect_clauses(text)
776
+ # A heading whose title repeats 3+ times across the document is almost
777
+ # always a running header/footer (e.g. a page code), not that many distinct
778
+ # clauses -- drop every occurrence. (Counted on the normalized title.)
779
+ counts: Dict[str, int] = {}
780
+ for c in detected:
781
+ k = _norm_clause_key(c["title"])
782
+ counts[k] = counts.get(k, 0) + 1
783
+
745
784
  out: List[JSON] = []
746
- for c in detect_clauses(text):
785
+ for c in detected:
786
+ if counts[_norm_clause_key(c["title"])] >= 3:
787
+ continue
788
+ if _is_noise_clause_title(c["title"]):
789
+ continue
747
790
  canonical, mapped = _canonicalize_clause(c["title"])
748
791
  tier = c["tier"]
749
792
  base = {"h2": 0.95, "bold-numbered": 0.85, "numbered": 0.8,
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.2"
7
+ version = "0.1.3"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -138,7 +138,7 @@
138
138
  "source": "deterministic"
139
139
  },
140
140
  "_meta": {
141
- "extractor_version": "0.1.2",
141
+ "extractor_version": "0.1.3",
142
142
  "tiers_used": [
143
143
  "deterministic"
144
144
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.2",
136
+ "extractor_version": "0.1.3",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.2",
136
+ "extractor_version": "0.1.3",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -143,7 +143,7 @@
143
143
  "source": "none"
144
144
  },
145
145
  "_meta": {
146
- "extractor_version": "0.1.2",
146
+ "extractor_version": "0.1.3",
147
147
  "tiers_used": [
148
148
  "deterministic"
149
149
  ],
@@ -48,7 +48,7 @@
48
48
  "source": "none"
49
49
  },
50
50
  "_meta": {
51
- "extractor_version": "0.1.2",
51
+ "extractor_version": "0.1.3",
52
52
  "tiers_used": [
53
53
  "deterministic"
54
54
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.2",
136
+ "extractor_version": "0.1.3",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -148,7 +148,7 @@
148
148
  "source": "deterministic"
149
149
  },
150
150
  "_meta": {
151
- "extractor_version": "0.1.2",
151
+ "extractor_version": "0.1.3",
152
152
  "tiers_used": [
153
153
  "deterministic"
154
154
  ],
@@ -69,6 +69,31 @@ def test_trailing_period_stripped_from_titles() -> None:
69
69
  assert ex._canonicalize_clause("Survival.") == ("Survival", True)
70
70
 
71
71
 
72
+ def test_repeated_heading_treated_as_boilerplate() -> None:
73
+ # A "heading" that repeats 3+ times is a running header/footer, not clauses.
74
+ body = "\n\n".join("## Ks 99-2\n\nfoo" for _ in range(4))
75
+ text = "## Confidentiality\n\nreal body\n\n" + body
76
+ clauses = ex.extract_clauses(text)
77
+ titles = [c["canonical_title"] for c in clauses]
78
+ assert "Confidentiality" in titles
79
+ assert not any("Ks" in (t or "") for t in titles)
80
+
81
+
82
+ def test_noise_clause_titles_filtered() -> None:
83
+ assert ex._is_noise_clause_title("Ks 112708-2") # 4+ digit code
84
+ assert ex._is_noise_clause_title("Table of Contents")
85
+ assert ex._is_noise_clause_title("Exhibit B")
86
+ assert ex._is_noise_clause_title("Schedule 2.1")
87
+ assert not ex._is_noise_clause_title("Confidentiality")
88
+ assert not ex._is_noise_clause_title("Term and Termination")
89
+
90
+
91
+ def test_party_cuts_together_as_agent_and_unclosed_paren() -> None:
92
+ assert ex._clean_party_name("Foo LLC, together with its affiliates") == "Foo LLC"
93
+ assert ex._clean_party_name("GE Capital Corporation, as administrative agent") == "GE Capital Corporation"
94
+ assert ex._clean_party_name("Glenn Rufrano (each of them being") == "Glenn Rufrano"
95
+
96
+
72
97
  def test_cascade_priority_h2_wins() -> None:
73
98
  # An H2 present means the bold/all-caps fallbacks must not fire.
74
99
  text = "## Real Heading\n\n**1. Not A Heading**\n\nALSO NOT A HEADING\n\nbody"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes