extract-cli 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.3 → extract_cli-0.1.4}/CHANGELOG.md +27 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/PKG-INFO +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.4}/extract_cli.py +47 -2
- {extract_cli-0.1.3 → extract_cli-0.1.4}/pyproject.toml +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/_fixtures_build.py +45 -7
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/_make_goldens.py +2 -2
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/conftest.py +1 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/employment_docx.docx.expected.json +2 -2
- extract_cli-0.1.4/tests/fixtures/heading_docx.docx +0 -0
- extract_cli-0.1.4/tests/fixtures/heading_docx.docx.expected.json +142 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/nda_h2.md.expected.json +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/services_bold.txt.expected.json +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/services_html.html.expected.json +1 -1
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/test_misc.py +31 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/.gitignore +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/ARCHITECTURE.md +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/LICENSE +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/Makefile +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/README.md +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/config/llm.json.example +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/docs/INTEROP.md +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/docs/spec/extract-output.schema.json +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/scripts/release.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/test_clause_map.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/test_cli.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/test_deterministic.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/test_llm.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/test_property.py +0 -0
- {extract_cli-0.1.3 → extract_cli-0.1.4}/tests/test_schema_conformance.py +0 -0
|
@@ -6,6 +6,32 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.4] - 2026-05-21
|
|
10
|
+
|
|
11
|
+
DOCX clause detection, driven by testing against 20 real `.docx` contracts
|
|
12
|
+
(Common Paper / Bonterms / YC templates via open-agreements, plus government
|
|
13
|
+
samples) — the format we expect most.
|
|
14
|
+
|
|
15
|
+
### Fixed
|
|
16
|
+
- **The DOCX reader now honors Word heading styles.** Real Word contracts carry
|
|
17
|
+
their clause structure in `Heading1`–`Heading9`/`Title` paragraph styles with
|
|
18
|
+
*auto-generated* numbers (absent from the raw text), so the prior cascade
|
|
19
|
+
found almost no clauses. Heading-styled paragraphs are now emitted as `##`
|
|
20
|
+
headings (detected by the strongest tier); run-in headings
|
|
21
|
+
(`Payment. Customer will pay …`) are split into title + body, and a full
|
|
22
|
+
sentence that merely carries a heading style is rejected (not a clause).
|
|
23
|
+
Across the 20-doc sample this took heading-styled agreements from ~0 clauses
|
|
24
|
+
to a clean 14–21 distinct suite-vocabulary clauses each.
|
|
25
|
+
- Binary DOCX test fixtures are now generated deterministically (fixed zip
|
|
26
|
+
timestamp) so their sha256 — and the goldens — are stable across regenerations.
|
|
27
|
+
|
|
28
|
+
### Known limitations (documented)
|
|
29
|
+
- DOCX that auto-number clauses via `numbering.xml` with **no heading style and
|
|
30
|
+
no bold lead** (some Bonterms/older templates use a flat `Plain`/`ListParagraph`
|
|
31
|
+
style) still yield no clause map: the heading text carries no detectable
|
|
32
|
+
signal without reconstructing Word's numbering counters. Parties/dates/
|
|
33
|
+
governing-law still extract.
|
|
34
|
+
|
|
9
35
|
## [0.1.3] - 2026-05-21
|
|
10
36
|
|
|
11
37
|
Clause-map de-noising and party cleanup, driven by testing against 10 more
|
|
@@ -140,6 +166,7 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
140
166
|
intentionally *not* governed by the output schema (the schema describes the
|
|
141
167
|
full default output).
|
|
142
168
|
|
|
169
|
+
[0.1.4]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.4
|
|
143
170
|
[0.1.3]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.3
|
|
144
171
|
[0.1.2]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.2
|
|
145
172
|
[0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -43,11 +43,11 @@ import urllib.request
|
|
|
43
43
|
from pathlib import Path
|
|
44
44
|
from typing import Any, Dict, List, Optional, Tuple
|
|
45
45
|
|
|
46
|
-
__version__ = "0.1.3"
|
|
46
|
+
__version__ = "0.1.4"
|
|
47
47
|
|
|
48
48
|
# Bumped independently of the package version when the *extraction logic*
|
|
49
49
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
50
|
-
EXTRACTOR_VERSION = "0.1.3"
|
|
50
|
+
EXTRACTOR_VERSION = "0.1.4"
|
|
51
51
|
|
|
52
52
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
53
53
|
SCHEMA_VERSION = 1
|
|
@@ -950,6 +950,39 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
|
|
|
950
950
|
return "", warnings
|
|
951
951
|
|
|
952
952
|
|
|
953
|
+
def _docx_paragraph_style(ppr: Any, w: str) -> Optional[str]:
|
|
954
|
+
if ppr is None:
|
|
955
|
+
return None
|
|
956
|
+
st = ppr.find(w + "pStyle")
|
|
957
|
+
return st.get(w + "val") if st is not None else None
|
|
958
|
+
|
|
959
|
+
|
|
960
|
+
def _is_heading_style(style: Optional[str]) -> bool:
|
|
961
|
+
"""True for Word built-in heading/title styles (Heading1-9, Title, and the
|
|
962
|
+
'H1'/'H2' shorthands). These mark clause headings whose visible numbers are
|
|
963
|
+
auto-generated and absent from the raw text."""
|
|
964
|
+
if not style:
|
|
965
|
+
return False
|
|
966
|
+
s = style.lower()
|
|
967
|
+
return "heading" in s or s == "title" or bool(re.fullmatch(r"h[1-9]", s))
|
|
968
|
+
|
|
969
|
+
|
|
970
|
+
def _docx_heading_title(text: str) -> Optional[str]:
|
|
971
|
+
"""Pull the clause title out of a heading paragraph. Many contracts use a
|
|
972
|
+
run-in heading -- 'Performing Services. Contractor will ...' -- where the
|
|
973
|
+
title is the lead before the first sentence break; a standalone header
|
|
974
|
+
('Services & Restrictions') has no such break and is used whole.
|
|
975
|
+
|
|
976
|
+
Returns None when the paragraph is really a full sentence that merely
|
|
977
|
+
carries a heading style (no run-in title) -- those would otherwise become
|
|
978
|
+
garbage clause titles and mis-map under substring matching."""
|
|
979
|
+
m = re.match(r"\s*(.{2,80}?)[.:]\s+[A-Z(\"“]", text)
|
|
980
|
+
title = m.group(1).strip() if m else text.strip()
|
|
981
|
+
if len(title) > 70 or len(title.split()) > 9:
|
|
982
|
+
return None
|
|
983
|
+
return title
|
|
984
|
+
|
|
985
|
+
|
|
953
986
|
def _read_docx_stdlib(raw: bytes) -> str:
|
|
954
987
|
import io
|
|
955
988
|
import zipfile
|
|
@@ -962,6 +995,7 @@ def _read_docx_stdlib(raw: bytes) -> str:
|
|
|
962
995
|
paras: List[str] = []
|
|
963
996
|
# iter over w:p in document order (includes paragraphs inside table cells).
|
|
964
997
|
for p in root.iter(w + "p"):
|
|
998
|
+
style = _docx_paragraph_style(p.find(w + "pPr"), w)
|
|
965
999
|
run_texts: List[str] = []
|
|
966
1000
|
any_text = False
|
|
967
1001
|
all_bold = True
|
|
@@ -978,6 +1012,17 @@ def _read_docx_stdlib(raw: bytes) -> str:
|
|
|
978
1012
|
if not line:
|
|
979
1013
|
paras.append("")
|
|
980
1014
|
continue
|
|
1015
|
+
# Word heading styles carry the clause structure (their numbers are
|
|
1016
|
+
# auto-generated, so absent from text). Emit them as H2 so the clause
|
|
1017
|
+
# cascade's strongest tier detects them; keep any run-in body too.
|
|
1018
|
+
if _is_heading_style(style):
|
|
1019
|
+
title = _docx_heading_title(line)
|
|
1020
|
+
if title is not None:
|
|
1021
|
+
paras.append(f"## {title}")
|
|
1022
|
+
if len(title) < len(line):
|
|
1023
|
+
paras.append(line[len(title):].lstrip(" .:\t"))
|
|
1024
|
+
continue
|
|
1025
|
+
# Sentence carrying a heading style -> treat as ordinary body text.
|
|
981
1026
|
if any_text and all_bold:
|
|
982
1027
|
line = f"**{line}**"
|
|
983
1028
|
paras.append(line)
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.3"
|
|
7
|
+
version = "0.1.4"
|
|
8
8
|
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -42,14 +42,36 @@ _DOCX_PARAS = [
|
|
|
42
42
|
_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
def _docx_paragraph(text: str, bold: bool) -> str:
|
|
45
|
+
def _docx_paragraph(text: str, bold: bool = False, style: str = "") -> str:
|
|
46
|
+
ppr = f'<w:pPr><w:pStyle w:val="{style}"/></w:pPr>' if style else ""
|
|
46
47
|
rpr = "<w:rPr><w:b/></w:rPr>" if bold else ""
|
|
47
|
-
return (f"<w:p><w:r>{rpr}"
|
|
48
|
+
return (f"<w:p>{ppr}<w:r>{rpr}"
|
|
48
49
|
f'<w:t xml:space="preserve">{escape(text)}</w:t></w:r></w:p>')
|
|
49
50
|
|
|
50
51
|
|
|
51
|
-
|
|
52
|
-
|
|
52
|
+
# A Word-styled agreement: clause structure carried by Heading1 styles (their
|
|
53
|
+
# numbers are auto-generated, absent from text), including a run-in heading and
|
|
54
|
+
# a full sentence that merely carries the heading style (must be rejected).
|
|
55
|
+
_HEADING_DOCX_PARAS = [
|
|
56
|
+
('Cloud Service Agreement', False, "Title"),
|
|
57
|
+
('This Cloud Service Agreement is entered into as of April 4, 2024, by and '
|
|
58
|
+
'between Initech Software, Inc. (the "Provider") and Globex Corporation '
|
|
59
|
+
'(the "Customer").', False, ""),
|
|
60
|
+
('Confidentiality', False, "Heading1"),
|
|
61
|
+
('Each party will protect the other party’s Confidential Information.', False, ""),
|
|
62
|
+
('Payment. Customer will pay the fees set out in the Order Form within '
|
|
63
|
+
'thirty (30) days.', False, "Heading1"),
|
|
64
|
+
('Term & Termination', False, "Heading1"),
|
|
65
|
+
('The term of this Agreement is two (2) years and will automatically renew '
|
|
66
|
+
'for successive one-year terms.', False, ""),
|
|
67
|
+
('Either party may terminate this Agreement upon material breach that '
|
|
68
|
+
'remains uncured for thirty days after written notice.', False, "Heading1"),
|
|
69
|
+
('Governing Law', False, "Heading1"),
|
|
70
|
+
('This Agreement is governed by the laws of the State of New York.', False, ""),
|
|
71
|
+
]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _docx_package(body: str) -> bytes:
|
|
53
75
|
document = (
|
|
54
76
|
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
|
|
55
77
|
f'<w:document xmlns:w="{_W}"><w:body>{body}<w:sectPr/></w:body></w:document>'
|
|
@@ -70,14 +92,29 @@ def build_docx() -> bytes:
|
|
|
70
92
|
'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" '
|
|
71
93
|
'Target="word/document.xml"/></Relationships>'
|
|
72
94
|
)
|
|
95
|
+
# Deterministic: a fixed timestamp on every entry so regenerating the
|
|
96
|
+
# fixture produces byte-identical output (stable sha256 -> stable goldens).
|
|
73
97
|
buf = io.BytesIO()
|
|
74
98
|
with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z:
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
99
|
+
for name, data in (("[Content_Types].xml", content_types),
|
|
100
|
+
("_rels/.rels", rels),
|
|
101
|
+
("word/document.xml", document)):
|
|
102
|
+
info = zipfile.ZipInfo(name, date_time=(1980, 1, 1, 0, 0, 0))
|
|
103
|
+
info.compress_type = zipfile.ZIP_DEFLATED
|
|
104
|
+
z.writestr(info, data)
|
|
78
105
|
return buf.getvalue()
|
|
79
106
|
|
|
80
107
|
|
|
108
|
+
def build_docx() -> bytes:
|
|
109
|
+
return _docx_package("".join(_docx_paragraph(t, b) for t, b in _DOCX_PARAS))
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def build_heading_docx() -> bytes:
|
|
113
|
+
return _docx_package(
|
|
114
|
+
"".join(_docx_paragraph(t, b, style=s) for t, b, s in _HEADING_DOCX_PARAS)
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
|
|
81
118
|
# --- PDF: a software license with ALL-CAPS headings (Tier 3) ----------------
|
|
82
119
|
|
|
83
120
|
_PDF_TEXT = """SOFTWARE LICENSE AGREEMENT
|
|
@@ -156,6 +193,7 @@ def build_scanned_pdf() -> bytes:
|
|
|
156
193
|
|
|
157
194
|
_BINARY_FIXTURES = {
|
|
158
195
|
"employment_docx.docx": build_docx,
|
|
196
|
+
"heading_docx.docx": build_heading_docx,
|
|
159
197
|
"license_pdf.pdf": build_pdf,
|
|
160
198
|
"scanned.pdf": build_scanned_pdf,
|
|
161
199
|
}
|
|
@@ -20,8 +20,8 @@ from tests._fixtures_build import ensure_binary_fixtures # noqa: E402
|
|
|
20
20
|
FIXTURES = Path(__file__).resolve().parent / "fixtures"
|
|
21
21
|
|
|
22
22
|
DOCS = ["nda_h2.md", "services_bold.txt", "lease_allcaps.txt",
|
|
23
|
-
"employment_docx.docx", "license_pdf.pdf", "services_html.html",
|
|
24
|
-
"scanned.pdf"]
|
|
23
|
+
"employment_docx.docx", "heading_docx.docx", "license_pdf.pdf",
|
|
24
|
+
"services_html.html", "scanned.pdf"]
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def golden_for(name: str) -> dict:
|
|
@@ -25,6 +25,7 @@ CORPUS: Tuple[Tuple[str, str, str], ...] = (
|
|
|
25
25
|
("services_bold.txt", "bold-numbered", "text"),
|
|
26
26
|
("lease_allcaps.txt", "all-caps", "text"),
|
|
27
27
|
("employment_docx.docx", "bold-numbered", "docx"),
|
|
28
|
+
("heading_docx.docx", "h2", "docx"),
|
|
28
29
|
("license_pdf.pdf", "all-caps", "pdf"),
|
|
29
30
|
("services_html.html", "numbered", "html"),
|
|
30
31
|
)
|
|
Binary file
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"document": {
|
|
3
3
|
"title": "EMPLOYMENT AGREEMENT",
|
|
4
4
|
"format": "docx",
|
|
5
|
-
"sha256": "
|
|
5
|
+
"sha256": "f50e4b9b0cb77250280eb4c26225009de063b5f4a2318e9e53784d3730d20bd1",
|
|
6
6
|
"source_path": "employment_docx.docx"
|
|
7
7
|
},
|
|
8
8
|
"parties": [
|
|
@@ -138,7 +138,7 @@
|
|
|
138
138
|
"source": "deterministic"
|
|
139
139
|
},
|
|
140
140
|
"_meta": {
|
|
141
|
-
"extractor_version": "0.1.3",
|
|
141
|
+
"extractor_version": "0.1.4",
|
|
142
142
|
"tiers_used": [
|
|
143
143
|
"deterministic"
|
|
144
144
|
],
|
|
Binary file
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
{
|
|
2
|
+
"document": {
|
|
3
|
+
"title": "Cloud Service Agreement",
|
|
4
|
+
"format": "docx",
|
|
5
|
+
"sha256": "23a3b14196cdca6b58d14c7a6836fe28ff6d2be6c2fd852badb03ab6b6e84056",
|
|
6
|
+
"source_path": "heading_docx.docx"
|
|
7
|
+
},
|
|
8
|
+
"parties": [
|
|
9
|
+
{
|
|
10
|
+
"name": "Initech Software, Inc.",
|
|
11
|
+
"confidence": 0.9,
|
|
12
|
+
"source": "deterministic",
|
|
13
|
+
"role": "Provider"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"name": "Globex Corporation",
|
|
17
|
+
"confidence": 0.9,
|
|
18
|
+
"source": "deterministic",
|
|
19
|
+
"role": "Customer"
|
|
20
|
+
}
|
|
21
|
+
],
|
|
22
|
+
"dates": {
|
|
23
|
+
"effective": {
|
|
24
|
+
"value": "2024-04-04",
|
|
25
|
+
"confidence": 0.85,
|
|
26
|
+
"source": "deterministic"
|
|
27
|
+
},
|
|
28
|
+
"expiration": {
|
|
29
|
+
"value": null,
|
|
30
|
+
"confidence": 0.0,
|
|
31
|
+
"source": "none"
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"term": {
|
|
35
|
+
"length": {
|
|
36
|
+
"value": "2 years",
|
|
37
|
+
"confidence": 0.7,
|
|
38
|
+
"source": "deterministic"
|
|
39
|
+
},
|
|
40
|
+
"auto_renew": {
|
|
41
|
+
"value": true,
|
|
42
|
+
"confidence": 0.65,
|
|
43
|
+
"source": "deterministic"
|
|
44
|
+
},
|
|
45
|
+
"notice_period_days": {
|
|
46
|
+
"value": null,
|
|
47
|
+
"confidence": 0.0,
|
|
48
|
+
"source": "none"
|
|
49
|
+
}
|
|
50
|
+
},
|
|
51
|
+
"governing_law": {
|
|
52
|
+
"value": "State of New York",
|
|
53
|
+
"confidence": 0.85,
|
|
54
|
+
"source": "deterministic"
|
|
55
|
+
},
|
|
56
|
+
"clauses": [
|
|
57
|
+
{
|
|
58
|
+
"canonical_title": "Cloud Service Agreement",
|
|
59
|
+
"detected_title": "## Cloud Service Agreement",
|
|
60
|
+
"tier": "h2",
|
|
61
|
+
"span": {
|
|
62
|
+
"start": 0,
|
|
63
|
+
"end": 191
|
|
64
|
+
},
|
|
65
|
+
"confidence": 0.71,
|
|
66
|
+
"source": "deterministic",
|
|
67
|
+
"mapped": false
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
"canonical_title": "Confidentiality",
|
|
71
|
+
"detected_title": "## Confidentiality",
|
|
72
|
+
"tier": "h2",
|
|
73
|
+
"span": {
|
|
74
|
+
"start": 191,
|
|
75
|
+
"end": 280
|
|
76
|
+
},
|
|
77
|
+
"confidence": 0.95,
|
|
78
|
+
"source": "deterministic",
|
|
79
|
+
"mapped": true
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"canonical_title": "Payment",
|
|
83
|
+
"detected_title": "## Payment",
|
|
84
|
+
"tier": "h2",
|
|
85
|
+
"span": {
|
|
86
|
+
"start": 280,
|
|
87
|
+
"end": 371
|
|
88
|
+
},
|
|
89
|
+
"confidence": 0.95,
|
|
90
|
+
"source": "deterministic",
|
|
91
|
+
"mapped": true
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
"canonical_title": "Termination",
|
|
95
|
+
"detected_title": "## Term & Termination",
|
|
96
|
+
"tier": "h2",
|
|
97
|
+
"span": {
|
|
98
|
+
"start": 371,
|
|
99
|
+
"end": 622
|
|
100
|
+
},
|
|
101
|
+
"confidence": 0.95,
|
|
102
|
+
"source": "deterministic",
|
|
103
|
+
"mapped": true
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"canonical_title": "Governing Law",
|
|
107
|
+
"detected_title": "## Governing Law",
|
|
108
|
+
"tier": "h2",
|
|
109
|
+
"span": {
|
|
110
|
+
"start": 622,
|
|
111
|
+
"end": 704
|
|
112
|
+
},
|
|
113
|
+
"confidence": 0.95,
|
|
114
|
+
"source": "deterministic",
|
|
115
|
+
"mapped": true
|
|
116
|
+
}
|
|
117
|
+
],
|
|
118
|
+
"defined_terms": [
|
|
119
|
+
{
|
|
120
|
+
"term": "Provider",
|
|
121
|
+
"confidence": 0.6,
|
|
122
|
+
"source": "deterministic"
|
|
123
|
+
},
|
|
124
|
+
{
|
|
125
|
+
"term": "Customer",
|
|
126
|
+
"confidence": 0.6,
|
|
127
|
+
"source": "deterministic"
|
|
128
|
+
}
|
|
129
|
+
],
|
|
130
|
+
"value": {
|
|
131
|
+
"value": null,
|
|
132
|
+
"confidence": 0.0,
|
|
133
|
+
"source": "none"
|
|
134
|
+
},
|
|
135
|
+
"_meta": {
|
|
136
|
+
"extractor_version": "0.1.4",
|
|
137
|
+
"tiers_used": [
|
|
138
|
+
"deterministic"
|
|
139
|
+
],
|
|
140
|
+
"llm_used": false
|
|
141
|
+
}
|
|
142
|
+
}
|
|
@@ -142,6 +142,37 @@ def test_pdf_unescape() -> None:
|
|
|
142
142
|
assert ex._pdf_unescape(r"\101\102") == "AB" # octal escapes
|
|
143
143
|
|
|
144
144
|
|
|
145
|
+
def test_docx_heading_style_helpers() -> None:
|
|
146
|
+
assert ex._is_heading_style("Heading1")
|
|
147
|
+
assert ex._is_heading_style("Heading 2".replace(" ", ""))
|
|
148
|
+
assert ex._is_heading_style("Title")
|
|
149
|
+
assert ex._is_heading_style("h3")
|
|
150
|
+
assert not ex._is_heading_style("Plain")
|
|
151
|
+
assert not ex._is_heading_style(None)
|
|
152
|
+
# Run-in heading: title is the lead before the sentence body.
|
|
153
|
+
assert ex._docx_heading_title("Payment. Customer will pay the fees.") == "Payment"
|
|
154
|
+
assert ex._docx_heading_title("Governing Law") == "Governing Law"
|
|
155
|
+
# A full sentence carrying a heading style is rejected (not a clause title).
|
|
156
|
+
assert ex._docx_heading_title(
|
|
157
|
+
"Either party may terminate this Agreement upon material breach that "
|
|
158
|
+
"remains uncured for thirty days.") is None
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def test_docx_heading_styles_drive_clause_map() -> None:
|
|
162
|
+
"""The Word-styled fixture's clauses come from Heading1 styles (their
|
|
163
|
+
numbers are auto-generated), detected via the H2 tier; the sentence that
|
|
164
|
+
merely carries a heading style is not a clause."""
|
|
165
|
+
raw, text, fmt, _w = ex.load_source(FIXTURES / "heading_docx.docx", prefer_optional=False)
|
|
166
|
+
result = ex.build_extraction(text, raw, fmt, "heading_docx.docx")
|
|
167
|
+
assert result["clauses"], "heading-styled docx should yield clauses"
|
|
168
|
+
canon = {c["canonical_title"] for c in result["clauses"]}
|
|
169
|
+
assert {"Confidentiality", "Payment", "Governing Law"} <= canon
|
|
170
|
+
assert all(c["tier"] == "h2" for c in result["clauses"])
|
|
171
|
+
# The full-sentence "Either party may terminate ..." must not appear.
|
|
172
|
+
assert not any("terminate this Agreement" in c["detected_title"] for c in result["clauses"])
|
|
173
|
+
assert [p["name"] for p in result["parties"]] == ["Initech Software, Inc.", "Globex Corporation"]
|
|
174
|
+
|
|
175
|
+
|
|
145
176
|
def test_html_extraction() -> None:
|
|
146
177
|
raw, text, fmt, _w = ex.load_source(FIXTURES / "services_html.html")
|
|
147
178
|
assert fmt == "html"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|