extract-cli 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.0 → extract_cli-0.1.2}/ARCHITECTURE.md +3 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/CHANGELOG.md +60 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/PKG-INFO +8 -7
- {extract_cli-0.1.0 → extract_cli-0.1.2}/README.md +6 -5
- {extract_cli-0.1.0 → extract_cli-0.1.2}/docs/spec/extract-output.schema.json +3 -1
- {extract_cli-0.1.0 → extract_cli-0.1.2}/extract_cli.py +260 -54
- {extract_cli-0.1.0 → extract_cli-0.1.2}/pyproject.toml +2 -2
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/_make_goldens.py +2 -1
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/conftest.py +1 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/employment_docx.docx.expected.json +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/nda_h2.md.expected.json +6 -1
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/services_bold.txt.expected.json +1 -1
- extract_cli-0.1.2/tests/fixtures/services_html.html +35 -0
- extract_cli-0.1.2/tests/fixtures/services_html.html.expected.json +157 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_clause_map.py +44 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_cli.py +1 -1
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_deterministic.py +54 -2
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_misc.py +39 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/.gitignore +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/LICENSE +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/Makefile +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/config/llm.json.example +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/docs/INTEROP.md +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/scripts/release.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/_fixtures_build.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_llm.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_property.py +0 -0
- {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_schema_conformance.py +0 -0
|
@@ -8,11 +8,14 @@ map.
|
|
|
8
8
|
```
|
|
9
9
|
load_source(path) extension/content sniff → reader
|
|
10
10
|
├─ .md/.txt → utf-8 decode
|
|
11
|
+
├─ .html → stdlib html.parser reader (also auto-detected inside .txt)
|
|
11
12
|
├─ .docx → python-docx (if [docx]) else stdlib zipfile/XML reader
|
|
12
13
|
└─ .pdf → pypdf (if [pdf]) else stdlib zlib + text-operator reader
|
|
13
14
|
│
|
|
14
15
|
▼ (raw_bytes, text, format, warnings)
|
|
15
16
|
build_extraction(text, raw, fmt, src) the DETERMINISTIC tier (always on)
|
|
17
|
+
│ field extractors run on a whitespace-FLATTENED copy (so values that wrap
|
|
18
|
+
│ across a line are matched whole); clause detection keeps the original text
|
|
16
19
|
├─ extract_parties "between X and Y", with role parentheticals
|
|
17
20
|
├─ extract_dates effective / expiration, ISO-normalized
|
|
18
21
|
├─ extract_term length / auto_renew / notice_period_days
|
|
@@ -6,6 +6,64 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.2] - 2026-05-21
|
|
10
|
+
|
|
11
|
+
More real-world hardening, driven by testing against five additional contracts
|
|
12
|
+
(SEC EDGAR consulting/MSA, lease, and Visteon services agreements; Common Paper
|
|
13
|
+
and Perigon Cloud Service Agreements).
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
- **HTML input** (`.html`/`.htm`, and HTML auto-detected inside `.txt` such as
|
|
17
|
+
SEC EDGAR full submissions). Stdlib `html.parser`-based reader strips
|
|
18
|
+
script/style, frames block elements so heading detection still works, and
|
|
19
|
+
unescapes entities. `document.format` enum gains `html` (backward-compatible
|
|
20
|
+
widening). This turns the large class of HTML contracts (SEC exhibits, web
|
|
21
|
+
ToS) from garbage into structured output.
|
|
22
|
+
|
|
23
|
+
### Fixed
|
|
24
|
+
- **Field extraction now runs on whitespace-flattened text**, so values that
|
|
25
|
+
wrap across a line break are matched whole — e.g. governing law
|
|
26
|
+
`the laws of the Province\nof Ontario` now yields `Province of Ontario`, and
|
|
27
|
+
line-wrapped party names/defined terms are captured.
|
|
28
|
+
- **Party extraction** (continues issue #2): names are trimmed of trailing
|
|
29
|
+
descriptors (`, a Delaware corporation`, `doing business as …`,
|
|
30
|
+
`having its offices at …`, `as of …`), and each party must begin with a
|
|
31
|
+
capital so an `and` *inside* a party's own description no longer splits the
|
|
32
|
+
parties (`…V6E 3S7 and doing business as …` → real parties recovered).
|
|
33
|
+
|
|
34
|
+
### Known limitations (documented, not bugs)
|
|
35
|
+
- The stdlib PDF reader cannot decode PDFs that use embedded subset fonts with
|
|
36
|
+
hex-encoded glyph strings (common in professionally-typeset PDFs); these
|
|
37
|
+
degrade gracefully to a low-signal warning. Install the `[pdf]` extra (pypdf)
|
|
38
|
+
for them — verified to recover full text and clause structure.
|
|
39
|
+
- Two-line `ARTICLE N` / title headings (number on one line, title on the next)
|
|
40
|
+
are not yet detected.
|
|
41
|
+
|
|
42
|
+
## [0.1.1] - 2026-05-21
|
|
43
|
+
|
|
44
|
+
Real-world hardening, driven by testing against a SEC EDGAR employment
|
|
45
|
+
agreement and the Common Paper Mutual NDA (PDF/DOCX).
|
|
46
|
+
|
|
47
|
+
### Added
|
|
48
|
+
- **`numbered` clause-detection tier** for plain numbered headings
|
|
49
|
+
(`1. Termination`, `Section 3. Payment`, `Article IV. …`) — the dominant
|
|
50
|
+
format in foreign paper, missed by the H2/bold/ALL-CAPS tiers. A title-case
|
|
51
|
+
heuristic rejects numbered sentences and list items. The output schema's
|
|
52
|
+
clause `tier` enum gains `numbered` (a backward-compatible widening).
|
|
53
|
+
|
|
54
|
+
### Fixed
|
|
55
|
+
- **PDF reader** now extracts text only from inside `BT … ET` text objects, so
|
|
56
|
+
embedded fonts, digital-signature blobs, and metadata streams no longer leak
|
|
57
|
+
binary noise (a real signed PDF dropped from ~188 KB of garbage to ~8.7 KB of
|
|
58
|
+
clean text). Added a printable-ratio backstop.
|
|
59
|
+
- **Effective date**: anchor on `(the "Effective Date")` and a bare
|
|
60
|
+
`as of <date>` cue; handle dates that wrap across a line break.
|
|
61
|
+
- **Term length**: require a real number, dropping false positives such as
|
|
62
|
+
`…consecutive days`.
|
|
63
|
+
- **Title**: skip SGML/XML wrapper lines (e.g. SEC EDGAR `<DOCUMENT>` headers).
|
|
64
|
+
- Strip trailing punctuation from clause titles (`Other Benefits.` →
|
|
65
|
+
`Other Benefits`).
|
|
66
|
+
|
|
9
67
|
## [0.1.0] - 2026-05-21
|
|
10
68
|
|
|
11
69
|
Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
@@ -57,4 +115,6 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
57
115
|
intentionally *not* governed by the output schema (the schema describes the
|
|
58
116
|
full default output).
|
|
59
117
|
|
|
118
|
+
[0.1.2]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.2
|
|
119
|
+
[0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
|
|
60
120
|
[0.1.0]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.0
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.0
|
|
4
|
-
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON.
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
7
7
|
Project-URL: Suite interop, https://github.com/DrBaher/extract-cli/blob/main/docs/INTEROP.md
|
|
@@ -63,8 +63,8 @@ ingest (extract) → review → diff → convert → sign
|
|
|
63
63
|
|
|
64
64
|
## What it does
|
|
65
65
|
|
|
66
|
-
Give it a contract in **`.md` / `.txt`** (native), **`.docx`**, or
|
|
67
|
-
**`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
|
|
66
|
+
Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
|
|
67
|
+
**`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
|
|
68
68
|
**clause map** normalized onto the suite's canonical clause vocabulary, a
|
|
69
69
|
defined-term inventory, and a headline value. Every field carries a
|
|
70
70
|
`confidence` and a `source` so downstream tools **verify, don't trust**.
|
|
@@ -75,14 +75,15 @@ daemon, no network in the default path.
|
|
|
75
75
|
## Install
|
|
76
76
|
|
|
77
77
|
```bash
|
|
78
|
-
pip install extract-cli # core: .md/.txt + best-effort .docx/.pdf
|
|
78
|
+
pip install extract-cli # core: .md/.txt/.html + best-effort .docx/.pdf
|
|
79
79
|
pip install "extract-cli[docx]" # higher-fidelity .docx (python-docx)
|
|
80
80
|
pip install "extract-cli[pdf]" # higher-fidelity .pdf (pypdf)
|
|
81
81
|
pip install "extract-cli[docx,pdf]" # both
|
|
82
82
|
```
|
|
83
83
|
|
|
84
|
-
The core has **zero runtime dependencies** and is fully functional on
|
|
85
|
-
with no extras
|
|
84
|
+
The core has **zero runtime dependencies** and is fully functional on
|
|
85
|
+
`.md`/`.txt`/`.html` with no extras (HTML is also auto-detected when it hides
|
|
86
|
+
inside a `.txt`, e.g. SEC EDGAR filings). `.docx` and `.pdf` work out of the box via stdlib readers; the
|
|
86
87
|
`[docx]`/`[pdf]` extras improve fidelity on complex documents (see
|
|
87
88
|
[ARCHITECTURE.md](ARCHITECTURE.md)).
|
|
88
89
|
|
|
@@ -25,8 +25,8 @@ ingest (extract) → review → diff → convert → sign
|
|
|
25
25
|
|
|
26
26
|
## What it does
|
|
27
27
|
|
|
28
|
-
Give it a contract in **`.md` / `.txt`** (native), **`.docx`**, or
|
|
29
|
-
**`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
|
|
28
|
+
Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
|
|
29
|
+
**`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
|
|
30
30
|
**clause map** normalized onto the suite's canonical clause vocabulary, a
|
|
31
31
|
defined-term inventory, and a headline value. Every field carries a
|
|
32
32
|
`confidence` and a `source` so downstream tools **verify, don't trust**.
|
|
@@ -37,14 +37,15 @@ daemon, no network in the default path.
|
|
|
37
37
|
## Install
|
|
38
38
|
|
|
39
39
|
```bash
|
|
40
|
-
pip install extract-cli # core: .md/.txt + best-effort .docx/.pdf
|
|
40
|
+
pip install extract-cli # core: .md/.txt/.html + best-effort .docx/.pdf
|
|
41
41
|
pip install "extract-cli[docx]" # higher-fidelity .docx (python-docx)
|
|
42
42
|
pip install "extract-cli[pdf]" # higher-fidelity .pdf (pypdf)
|
|
43
43
|
pip install "extract-cli[docx,pdf]" # both
|
|
44
44
|
```
|
|
45
45
|
|
|
46
|
-
The core has **zero runtime dependencies** and is fully functional on
|
|
47
|
-
with no extras
|
|
46
|
+
The core has **zero runtime dependencies** and is fully functional on
|
|
47
|
+
`.md`/`.txt`/`.html` with no extras (HTML is also auto-detected when it hides
|
|
48
|
+
inside a `.txt`, e.g. SEC EDGAR filings). `.docx` and `.pdf` work out of the box via stdlib readers; the
|
|
48
49
|
`[docx]`/`[pdf]` extras improve fidelity on complex documents (see
|
|
49
50
|
[ARCHITECTURE.md](ARCHITECTURE.md)).
|
|
50
51
|
|
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
The suite is a contract lifecycle (store -> draft -> review -> diff -> convert
|
|
5
5
|
-> sign) that, until now, only handled documents it authored from its own
|
|
6
6
|
templates. `extract-cli` is "passport control": it ingests ANY document --
|
|
7
|
-
yours or a counterparty's foreign paper -- in .md/.txt (natively), .docx,
|
|
8
|
-
.pdf, and emits a structured JSON representation that the rest of the suite
|
|
7
|
+
yours or a counterparty's foreign paper -- in .md/.txt/.html (natively), .docx,
|
|
8
|
+
or .pdf, and emits a structured JSON representation that the rest of the suite
|
|
9
9
|
(nda-review-cli, compare-cli, contract-vault) consumes.
|
|
10
10
|
|
|
11
11
|
Two extraction tiers:
|
|
@@ -32,6 +32,7 @@ from __future__ import annotations
|
|
|
32
32
|
import argparse
|
|
33
33
|
import datetime as _dt
|
|
34
34
|
import hashlib
|
|
35
|
+
import html.parser
|
|
35
36
|
import importlib.util
|
|
36
37
|
import json
|
|
37
38
|
import os
|
|
@@ -42,11 +43,11 @@ import urllib.request
|
|
|
42
43
|
from pathlib import Path
|
|
43
44
|
from typing import Any, Dict, List, Optional, Tuple
|
|
44
45
|
|
|
45
|
-
__version__ = "0.1.
|
|
46
|
+
__version__ = "0.1.2"
|
|
46
47
|
|
|
47
48
|
# Bumped independently of the package version when the *extraction logic*
|
|
48
49
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
49
|
-
EXTRACTOR_VERSION = "0.1.
|
|
50
|
+
EXTRACTOR_VERSION = "0.1.2"
|
|
50
51
|
|
|
51
52
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
52
53
|
SCHEMA_VERSION = 1
|
|
@@ -214,6 +215,49 @@ def _qualifies_as_all_caps_heading(title: str) -> bool:
|
|
|
214
215
|
return sum(1 for ch in title if "A" <= ch <= "Z") >= 4
|
|
215
216
|
|
|
216
217
|
|
|
218
|
+
# Tier between bold-numbered and ALL-CAPS: plain numbered headings on their own
|
|
219
|
+
# line -- "1. Termination", "5. Wage Compensation", "Section 3. Payment",
|
|
220
|
+
# "Article IV. Confidentiality". These are the dominant real-world format in
|
|
221
|
+
# foreign paper (and aren't caught by H2, **bold**, or ALL-CAPS). A title-case
|
|
222
|
+
# heuristic distinguishes a heading from a numbered *sentence* or list item.
|
|
223
|
+
_NUMBERED_HEADING_RE = re.compile(
|
|
224
|
+
r"^[ \t]*"
|
|
225
|
+
r"(?:(?:Article|Section|ARTICLE|SECTION)[ \t]+)?"
|
|
226
|
+
r"(?:" + _ROMAN_RE + r"|\d{1,2})\.?"
|
|
227
|
+
r"[ \t]+"
|
|
228
|
+
r"([A-Z][A-Za-z][^\n]{0,58})"
|
|
229
|
+
r"[ \t]*$",
|
|
230
|
+
re.MULTILINE,
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
# Lowercase words allowed inside an otherwise Title-Cased heading.
|
|
234
|
+
_HEADING_STOPWORDS = {
|
|
235
|
+
"a", "an", "the", "and", "or", "of", "to", "for", "in", "on", "with",
|
|
236
|
+
"by", "at", "as", "per", "from", "into", "nor", "but",
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _qualifies_as_numbered_heading(title: str) -> bool:
|
|
241
|
+
"""A numbered line qualifies as a heading only if its title looks like a
|
|
242
|
+
heading: 1-9 words, Title-Cased (every word starts uppercase or is a short
|
|
243
|
+
lowercase connector), no sentence-y lowercase verbs. A single word must be
|
|
244
|
+
>= 4 letters. Rejects 'The parties agree as follows' but accepts 'Wage
|
|
245
|
+
Compensation' and 'Term And Nature Of Employment'."""
|
|
246
|
+
t = title.strip().rstrip(".").strip()
|
|
247
|
+
words = t.split()
|
|
248
|
+
if not (1 <= len(words) <= 9):
|
|
249
|
+
return False
|
|
250
|
+
if len(words) == 1:
|
|
251
|
+
return sum(1 for ch in words[0] if ch.isalpha()) >= 4 and words[0][:1].isupper()
|
|
252
|
+
for w in words:
|
|
253
|
+
if w[:1].isupper() or not w[:1].isalpha():
|
|
254
|
+
continue # capitalized word, or punctuation/number token
|
|
255
|
+
if w.lower() in _HEADING_STOPWORDS:
|
|
256
|
+
continue # allowed connector
|
|
257
|
+
return False # a lowercase content word => this is a sentence, not a heading
|
|
258
|
+
return True
|
|
259
|
+
|
|
260
|
+
|
|
217
261
|
def detect_clauses(text: str) -> List[JSON]:
|
|
218
262
|
"""Run the three-tier cascade and return clauses with their detection tier.
|
|
219
263
|
|
|
@@ -227,6 +271,12 @@ def detect_clauses(text: str) -> List[JSON]:
|
|
|
227
271
|
bold = list(_BOLD_HEADING_RE.finditer(text))
|
|
228
272
|
if len(bold) >= 2:
|
|
229
273
|
return _matches_to_clauses(text, bold, group=1, tier="bold-numbered")
|
|
274
|
+
numbered = [
|
|
275
|
+
m for m in _NUMBERED_HEADING_RE.finditer(text)
|
|
276
|
+
if _qualifies_as_numbered_heading(m.group(1))
|
|
277
|
+
]
|
|
278
|
+
if len(numbered) >= 2:
|
|
279
|
+
return _matches_to_clauses(text, numbered, group=1, tier="numbered")
|
|
230
280
|
caps = [
|
|
231
281
|
m for m in _ALL_CAPS_HEADING_RE.finditer(text)
|
|
232
282
|
if _qualifies_as_all_caps_heading(m.group(1))
|
|
@@ -266,8 +316,9 @@ def _matches_to_clauses(text: str, matches: List["re.Match[str]"], group: int,
|
|
|
266
316
|
|
|
267
317
|
|
|
268
318
|
def _norm_clause_key(s: str) -> str:
|
|
269
|
-
"""Normalize a clause title/alias for matching (number-stripped,
|
|
270
|
-
|
|
319
|
+
"""Normalize a clause title/alias for matching (number-stripped, trailing
|
|
320
|
+
punctuation removed, lowercased)."""
|
|
321
|
+
return _strip_clause_number(s).strip().lower().rstrip(" .:;,")
|
|
271
322
|
|
|
272
323
|
|
|
273
324
|
# ---------------------------------------------------------------------------
|
|
@@ -366,7 +417,7 @@ def _canonicalize_clause(detected_title: str) -> Tuple[Optional[str], bool]:
|
|
|
366
417
|
best, best_len = canonical, len(alias_key)
|
|
367
418
|
if best is not None:
|
|
368
419
|
return best, True
|
|
369
|
-
return _titlecase(detected_title), False
|
|
420
|
+
return _titlecase(detected_title.strip().rstrip(" .:;,")), False
|
|
370
421
|
|
|
371
422
|
|
|
372
423
|
# ---------------------------------------------------------------------------
|
|
@@ -421,11 +472,17 @@ _DATE_PAT = (
|
|
|
421
472
|
)
|
|
422
473
|
_DATE_RE = re.compile(_DATE_PAT, re.IGNORECASE)
|
|
423
474
|
|
|
475
|
+
# Highest-confidence: a date explicitly labeled "(the "Effective Date")".
|
|
476
|
+
_EFFDATE_LABEL_RE = re.compile(
|
|
477
|
+
r"(" + _DATE_PAT + r")\s*\(\s*(?:the\s+)?[\"“]?\s*Effective\s+Date",
|
|
478
|
+
re.IGNORECASE,
|
|
479
|
+
)
|
|
424
480
|
_EFFECTIVE_RE = re.compile(
|
|
425
481
|
r"(?:effective(?:\s+date)?(?:\s+(?:as\s+of|date|on))?|"
|
|
426
482
|
r"dated(?:\s+as\s+of)?|"
|
|
427
483
|
r"made(?:\s+and\s+entered\s+into)?(?:\s+as\s+of|\s+on)?|"
|
|
428
|
-
r"entered\s+into(?:\s+as\s+of|\s+on)
|
|
484
|
+
r"entered\s+into(?:\s+as\s+of|\s+on)?|"
|
|
485
|
+
r"as\s+of)"
|
|
429
486
|
r"[\s:,]+(?:the\s+)?(" + _DATE_PAT + r")",
|
|
430
487
|
re.IGNORECASE,
|
|
431
488
|
)
|
|
@@ -436,10 +493,17 @@ _EXPIRE_RE = re.compile(
|
|
|
436
493
|
re.IGNORECASE,
|
|
437
494
|
)
|
|
438
495
|
|
|
496
|
+
# Each party must start with a capital letter (optionally "the X"), a quote, or
|
|
497
|
+
# a paren. This is case-sensitive on purpose (no global IGNORECASE -- only the
|
|
498
|
+
# keywords are): it lets the engine skip an "and" that sits INSIDE a party's own
|
|
499
|
+
# description ("...V6E 3S7 and doing business as ...", where the right side
|
|
500
|
+
# starts lowercase) and find the real "and" before the second named entity.
|
|
501
|
+
_PARTY_START = r"(?:(?:[Tt]he|its)\s+)?[A-Z\"“(]"
|
|
439
502
|
_PARTY_BLOCK_RE = re.compile(
|
|
440
|
-
r"
|
|
441
|
-
r"(
|
|
442
|
-
|
|
503
|
+
r"(?i:\b(?:by\s+and\s+between|between)\s+)"
|
|
504
|
+
r"(" + _PARTY_START + r"[^\n]{1,200}?)\s+and\s+"
|
|
505
|
+
r"(" + _PARTY_START + r"[^\n]{1,200}?)"
|
|
506
|
+
r"(?=[\.;\n]|(?i:\bwhereas\b|\beffective\b|\bdated\b|\bas\s+of\b|\bwitnesseth\b)|$)",
|
|
443
507
|
)
|
|
444
508
|
_ROLE_PAREN_RE = re.compile(
|
|
445
509
|
r"\(\s*(?:the\s+)?[\"“]?([^\"”()]+?)[\"”]?\s*\)"
|
|
@@ -534,18 +598,54 @@ def _parse_date_to_iso(s: str) -> Optional[str]:
|
|
|
534
598
|
return None
|
|
535
599
|
|
|
536
600
|
|
|
601
|
+
def _date_field_from_str(raw: str, base_conf: float) -> JSON:
    """Build a date field from a raw matched date string.

    Whitespace runs (including line breaks inside a wrapped date) are collapsed
    to single spaces before parsing. When the date parses, the ISO form is
    emitted at `base_conf`; otherwise the cleaned raw text is emitted with the
    confidence lowered by 0.3 (never below 0.0).
    """
    collapsed = re.sub(r"\s+", " ", raw.strip())
    normalized = _parse_date_to_iso(collapsed)
    if normalized is None:
        # Unparseable: keep what was seen, but signal the lower trust.
        return _field(collapsed, max(0.0, base_conf - 0.3))
    return _field(normalized, base_conf)
|
|
607
|
+
|
|
608
|
+
|
|
537
609
|
def _date_field(match: Optional["re.Match[str]"]) -> JSON:
|
|
538
610
|
if match is None:
|
|
539
611
|
return _none_field()
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
612
|
+
return _date_field_from_str(match.group(1), 0.85)
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
# Trailing descriptors that follow a party's actual name and should be dropped
|
|
616
|
+
# ("Acme Corp., a Delaware corporation", "... doing business as Foo", "... as of
|
|
617
|
+
# March 1", "... having its offices at ..."). Each is matched and everything from
|
|
618
|
+
# it onward is cut.
|
|
619
|
+
_PARTY_CUT_MARKERS: Tuple[str, ...] = (
|
|
620
|
+
r",\s+an?\s+\w", # ", a Delaware ..." / ", an Ohio ..."
|
|
621
|
+
r"\s+doing\s+business\s+as\b",
|
|
622
|
+
r"\s+d/?b/?a\b",
|
|
623
|
+
r"\s+f/?k/?a\b",
|
|
624
|
+
r"\s+a[n]?\s+\w+\s+(?:corporation|company|partnership|limited)\b",
|
|
625
|
+
r"\s+having\b",
|
|
626
|
+
r"\s+with\s+(?:its\s+)?(?:offices|principal|a\s)\b",
|
|
627
|
+
r"\s+with\s+offices\b",
|
|
628
|
+
r"\s+located\b",
|
|
629
|
+
r"\s+organized\b",
|
|
630
|
+
r"\s+incorporated\b",
|
|
631
|
+
r"\s+whose\b",
|
|
632
|
+
r"\s+(?:as\s+of|dated|effective)\b",
|
|
633
|
+
)
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
def _clean_party_name(s: str) -> str:
|
|
637
|
+
"""Trim a captured party name down to the entity name, dropping trailing
|
|
638
|
+
descriptors ('a Delaware corporation', 'd/b/a ...', 'as of ...')."""
|
|
639
|
+
s = re.sub(r"\s+", " ", s).strip().strip(",").strip()
|
|
640
|
+
for pat in _PARTY_CUT_MARKERS:
|
|
641
|
+
m = re.search(pat, s, re.IGNORECASE)
|
|
642
|
+
if m:
|
|
643
|
+
s = s[: m.start()].strip().strip(",").strip()
|
|
644
|
+
return s.strip("\"“”").strip()
|
|
545
645
|
|
|
546
646
|
|
|
547
647
|
def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
|
|
548
|
-
s = s.strip().strip(",").strip()
|
|
648
|
+
s = re.sub(r"\s+", " ", s).strip().strip(",").strip()
|
|
549
649
|
role: Optional[str] = None
|
|
550
650
|
m = _ROLE_PAREN_RE.search(s)
|
|
551
651
|
if m:
|
|
@@ -554,9 +654,7 @@ def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
|
|
|
554
654
|
if len(candidate) <= 40 and candidate.lower() not in ("a", "an", "the"):
|
|
555
655
|
role = candidate
|
|
556
656
|
s = (s[: m.start()] + s[m.end():]).strip().rstrip(",").strip()
|
|
557
|
-
|
|
558
|
-
s = re.sub(r"\s+", " ", s)
|
|
559
|
-
return s, role
|
|
657
|
+
return _clean_party_name(s), role
|
|
560
658
|
|
|
561
659
|
|
|
562
660
|
def extract_parties(text: str) -> List[JSON]:
|
|
@@ -565,9 +663,6 @@ def extract_parties(text: str) -> List[JSON]:
|
|
|
565
663
|
return []
|
|
566
664
|
out: List[JSON] = []
|
|
567
665
|
for raw in (m.group(1), m.group(2)):
|
|
568
|
-
# Party names can wrap across lines ("...(the \"Disclosing\nParty\")");
|
|
569
|
-
# collapse whitespace rather than truncating at the first newline.
|
|
570
|
-
raw = re.sub(r"\s+", " ", raw).strip()
|
|
571
666
|
name, role = _split_name_role(raw)
|
|
572
667
|
if not name or len(name) < 2 or len(name) > 120:
|
|
573
668
|
continue
|
|
@@ -578,10 +673,12 @@ def extract_parties(text: str) -> List[JSON]:
|
|
|
578
673
|
|
|
579
674
|
|
|
580
675
|
def extract_dates(text: str) -> JSON:
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
676
|
+
label = _EFFDATE_LABEL_RE.search(text)
|
|
677
|
+
if label is not None:
|
|
678
|
+
effective = _date_field_from_str(label.group(1), 0.9)
|
|
679
|
+
else:
|
|
680
|
+
effective = _date_field(_EFFECTIVE_RE.search(text))
|
|
681
|
+
return {"effective": effective, "expiration": _date_field(_EXPIRE_RE.search(text))}
|
|
585
682
|
|
|
586
683
|
|
|
587
684
|
def extract_governing_law(text: str) -> JSON:
|
|
@@ -600,10 +697,10 @@ def extract_term(text: str) -> JSON:
|
|
|
600
697
|
if m:
|
|
601
698
|
num = _word_to_int(m.group(1))
|
|
602
699
|
unit = m.group(2).lower().rstrip("s")
|
|
700
|
+
# Only emit when the captured token is a real number; otherwise the
|
|
701
|
+
# match was a coincidence ("...consecutive days") -> leave as not-found.
|
|
603
702
|
if num is not None:
|
|
604
703
|
length = _field(f"{num} {unit}{'s' if num != 1 else ''}", 0.7)
|
|
605
|
-
else:
|
|
606
|
-
length = _field(f"{m.group(1)} {m.group(2)}".strip(), 0.5)
|
|
607
704
|
|
|
608
705
|
notice = _none_field()
|
|
609
706
|
nm = _NOTICE_RE.search(text)
|
|
@@ -649,7 +746,8 @@ def extract_clauses(text: str) -> List[JSON]:
|
|
|
649
746
|
for c in detect_clauses(text):
|
|
650
747
|
canonical, mapped = _canonicalize_clause(c["title"])
|
|
651
748
|
tier = c["tier"]
|
|
652
|
-
base = {"h2": 0.95, "bold-numbered": 0.85, "
|
|
749
|
+
base = {"h2": 0.95, "bold-numbered": 0.85, "numbered": 0.8,
|
|
750
|
+
"all-caps": 0.75, "explicit": 0.95}.get(tier, 0.7)
|
|
653
751
|
conf = round(base * (1.0 if mapped else 0.75), 2)
|
|
654
752
|
out.append({
|
|
655
753
|
"canonical_title": canonical,
|
|
@@ -669,10 +767,14 @@ def extract_title(text: str, path: Optional[Path], fmt: str) -> Optional[str]:
|
|
|
669
767
|
return m.group(1).strip()
|
|
670
768
|
for line in text.splitlines():
|
|
671
769
|
ls = line.strip().lstrip("#").strip()
|
|
672
|
-
if ls:
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
770
|
+
if not ls:
|
|
771
|
+
continue
|
|
772
|
+
# Skip SGML/XML wrapper lines (e.g. SEC EDGAR "<DOCUMENT>", "<TYPE>...").
|
|
773
|
+
if ls.startswith("<"):
|
|
774
|
+
continue
|
|
775
|
+
if len(ls) <= 90:
|
|
776
|
+
return ls
|
|
777
|
+
break
|
|
676
778
|
if path is not None:
|
|
677
779
|
return _titlecase(path.stem.replace("_", " ").replace("-", " "))
|
|
678
780
|
return None
|
|
@@ -683,21 +785,91 @@ def extract_title(text: str, path: Optional[Path], fmt: str) -> Optional[str]:
|
|
|
683
785
|
# ---------------------------------------------------------------------------
|
|
684
786
|
|
|
685
787
|
|
|
788
|
+
def _looks_like_html(head: str) -> bool:
|
|
789
|
+
"""Heuristic: does this text look like HTML? Catches HTML masquerading as
|
|
790
|
+
.txt (e.g. SEC EDGAR full submissions wrap HTML exhibits in a .txt)."""
|
|
791
|
+
low = head.lower()
|
|
792
|
+
if "<!doctype html" in low or "<html" in low or "<body" in low:
|
|
793
|
+
return True
|
|
794
|
+
return len(re.findall(r"</?(?:p|div|table|tr|td|span|br|h[1-6]|font|b|i)\b", low)) >= 6
|
|
795
|
+
|
|
796
|
+
|
|
686
797
|
def _detect_format(path: Path, raw: bytes) -> str:
|
|
687
798
|
ext = path.suffix.lower()
|
|
688
|
-
if ext in (".
|
|
689
|
-
return "
|
|
690
|
-
if ext == ".txt":
|
|
691
|
-
return "text"
|
|
799
|
+
if ext in (".htm", ".html", ".xhtml"):
|
|
800
|
+
return "html"
|
|
692
801
|
if ext == ".docx":
|
|
693
802
|
return "docx"
|
|
694
803
|
if ext == ".pdf":
|
|
695
804
|
return "pdf"
|
|
696
805
|
if raw[:4] == b"%PDF":
|
|
697
806
|
return "pdf"
|
|
698
|
-
if raw[:2] == b"PK":
|
|
807
|
+
if raw[:2] == b"PK" and ext not in (".md", ".markdown", ".txt"):
|
|
699
808
|
return "docx"
|
|
700
|
-
|
|
809
|
+
base = "markdown" if ext in (".md", ".markdown") else "text"
|
|
810
|
+
# Content sniff: HTML hiding inside a .txt/.md (or extensionless) file.
|
|
811
|
+
if _looks_like_html(raw[:4096].decode("utf-8", "replace")):
|
|
812
|
+
return "html"
|
|
813
|
+
return base
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
class _HTMLTextExtractor(html.parser.HTMLParser):
|
|
817
|
+
"""Stdlib HTML -> text: drops script/style, frames block elements with blank
|
|
818
|
+
lines (so clause-heading detection still works), and unescapes entities."""
|
|
819
|
+
|
|
820
|
+
_SKIP = {"script", "style", "head", "title", "meta", "link", "noscript"}
|
|
821
|
+
_BLOCK = {
|
|
822
|
+
"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6",
|
|
823
|
+
"section", "article", "table", "ul", "ol", "blockquote", "pre", "hr",
|
|
824
|
+
"thead", "tbody", "header", "footer", "main",
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
def __init__(self) -> None:
|
|
828
|
+
super().__init__(convert_charrefs=True)
|
|
829
|
+
self._parts: List[str] = []
|
|
830
|
+
self._skip = 0
|
|
831
|
+
|
|
832
|
+
def handle_starttag(self, tag: str, attrs: Any) -> None:
|
|
833
|
+
if tag in self._SKIP:
|
|
834
|
+
self._skip += 1
|
|
835
|
+
elif tag in self._BLOCK:
|
|
836
|
+
self._parts.append("\n")
|
|
837
|
+
|
|
838
|
+
def handle_endtag(self, tag: str) -> None:
|
|
839
|
+
if tag in self._SKIP and self._skip > 0:
|
|
840
|
+
self._skip -= 1
|
|
841
|
+
elif tag in self._BLOCK:
|
|
842
|
+
self._parts.append("\n")
|
|
843
|
+
|
|
844
|
+
def handle_data(self, data: str) -> None:
|
|
845
|
+
if self._skip == 0:
|
|
846
|
+
self._parts.append(data)
|
|
847
|
+
|
|
848
|
+
def get_text(self) -> str:
|
|
849
|
+
# Strip each line; collapse runs of blank lines to a single blank line
|
|
850
|
+
# (gives ALL-CAPS / numbered headings their blank-line frame).
|
|
851
|
+
lines = [re.sub(r"[ \t]+", " ", ln).strip() for ln in "".join(self._parts).split("\n")]
|
|
852
|
+
out: List[str] = []
|
|
853
|
+
blank = False
|
|
854
|
+
for ln in lines:
|
|
855
|
+
if ln:
|
|
856
|
+
out.append(ln)
|
|
857
|
+
blank = False
|
|
858
|
+
elif not blank:
|
|
859
|
+
out.append("")
|
|
860
|
+
blank = True
|
|
861
|
+
return "\n".join(out).strip()
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
def _read_html(raw_text: str) -> str:
    """Convert HTML markup to plain text with `_HTMLTextExtractor`.

    Parsing is best-effort: if the parser raises on malformed markup, the
    function falls back to replacing every tag with a space instead of failing.
    """
    extractor = _HTMLTextExtractor()
    try:
        extractor.feed(raw_text)
        extractor.close()
    except Exception:
        # Never crash on malformed markup; fall back to a crude tag strip.
        return re.sub(r"<[^>]+>", " ", raw_text)
    else:
        return extractor.get_text()
|
|
701
873
|
|
|
702
874
|
|
|
703
875
|
def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
|
|
@@ -834,9 +1006,15 @@ def _pdf_unescape(s: str) -> str:
|
|
|
834
1006
|
|
|
835
1007
|
|
|
836
1008
|
def _pdf_text_from_content(content: bytes) -> str:
|
|
1009
|
+
"""Pull text strings from a PDF content stream, but ONLY from inside text
|
|
1010
|
+
objects (`BT` ... `ET`). Real text lives there; embedded fonts, images,
|
|
1011
|
+
digital-signature blobs and metadata streams have no BT/ET, so gating on it
|
|
1012
|
+
keeps their binary bytes (which often contain stray `(...)` sequences) out
|
|
1013
|
+
of the output -- essential for real signed/font-embedded PDFs."""
|
|
837
1014
|
s = content.decode("latin-1", "replace")
|
|
838
1015
|
lines: List[str] = []
|
|
839
1016
|
cur: List[str] = []
|
|
1017
|
+
in_text = False
|
|
840
1018
|
|
|
841
1019
|
def flush() -> None:
|
|
842
1020
|
if cur:
|
|
@@ -845,17 +1023,34 @@ def _pdf_text_from_content(content: bytes) -> str:
|
|
|
845
1023
|
|
|
846
1024
|
for m in _PDF_TOKEN_RE.finditer(s):
|
|
847
1025
|
tok = m.group(0)
|
|
848
|
-
if tok
|
|
1026
|
+
if tok == "BT":
|
|
1027
|
+
flush()
|
|
1028
|
+
in_text = True
|
|
1029
|
+
elif tok == "ET":
|
|
1030
|
+
flush()
|
|
1031
|
+
in_text = False
|
|
1032
|
+
elif not in_text:
|
|
1033
|
+
continue
|
|
1034
|
+
elif tok.startswith("("):
|
|
849
1035
|
cur.append(_pdf_unescape(tok[1:-1]))
|
|
850
1036
|
elif tok.startswith("["):
|
|
851
1037
|
for sm in re.finditer(r"\((?:\\.|[^\\()])*\)", tok):
|
|
852
1038
|
cur.append(_pdf_unescape(sm.group(0)[1:-1]))
|
|
853
|
-
elif tok in ("Td", "TD", "T*", "'", '"'
|
|
1039
|
+
elif tok in ("Td", "TD", "T*", "'", '"'):
|
|
854
1040
|
flush()
|
|
855
1041
|
flush()
|
|
856
1042
|
return "\n".join(lines)
|
|
857
1043
|
|
|
858
1044
|
|
|
1045
|
+
def _mostly_printable(s: str) -> bool:
|
|
1046
|
+
"""True if `s` is overwhelmingly printable text (backstop against a
|
|
1047
|
+
malformed stream slipping binary through the BT/ET gate)."""
|
|
1048
|
+
if not s:
|
|
1049
|
+
return False
|
|
1050
|
+
printable = sum(1 for ch in s if ch in "\n\t" or 32 <= ord(ch) < 127 or ord(ch) > 160)
|
|
1051
|
+
return printable / len(s) >= 0.85
|
|
1052
|
+
|
|
1053
|
+
|
|
859
1054
|
def _read_pdf_stdlib(raw: bytes) -> str:
|
|
860
1055
|
import zlib
|
|
861
1056
|
|
|
@@ -873,9 +1068,11 @@ def _read_pdf_stdlib(raw: bytes) -> str:
|
|
|
873
1068
|
content = zlib.decompress(body)
|
|
874
1069
|
except Exception:
|
|
875
1070
|
content = body
|
|
876
|
-
|
|
1071
|
+
piece = _pdf_text_from_content(content)
|
|
1072
|
+
if piece.strip() and _mostly_printable(piece):
|
|
1073
|
+
chunks.append(piece)
|
|
877
1074
|
idx = e + len(b"endstream")
|
|
878
|
-
return "\n".join(
|
|
1075
|
+
return "\n".join(chunks)
|
|
879
1076
|
|
|
880
1077
|
|
|
881
1078
|
def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, str, List[str]]:
|
|
@@ -894,6 +1091,8 @@ def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, s
|
|
|
894
1091
|
warnings: List[str] = []
|
|
895
1092
|
if fmt in ("markdown", "text"):
|
|
896
1093
|
text = raw.decode("utf-8", "replace")
|
|
1094
|
+
elif fmt == "html":
|
|
1095
|
+
text = _read_html(raw.decode("utf-8", "replace"))
|
|
897
1096
|
elif fmt == "docx":
|
|
898
1097
|
text, w = _read_docx(path, raw, prefer_optional)
|
|
899
1098
|
warnings += w
|
|
@@ -919,6 +1118,13 @@ def build_extraction(text: str, raw: bytes, fmt: str,
|
|
|
919
1118
|
source_path: Optional[str]) -> JSON:
|
|
920
1119
|
"""Run the deterministic tier and assemble the output contract object."""
|
|
921
1120
|
sha = hashlib.sha256(raw).hexdigest()
|
|
1121
|
+
# Field extractors (parties, dates, governing law, term, value, defined
|
|
1122
|
+
# terms) run on a whitespace-flattened copy so values that wrap across a
|
|
1123
|
+
# line break in the source -- "...laws of the Province\nof Ontario", a party
|
|
1124
|
+
# name split mid-line -- are matched whole. Clause detection and the title
|
|
1125
|
+
# keep the original text, which depends on line structure.
|
|
1126
|
+
flat = re.sub(r"[ \t\r\f\v]*\n[ \t\r\f\v]*", " ", text)
|
|
1127
|
+
flat = re.sub(r"[ \t]+", " ", flat)
|
|
922
1128
|
return {
|
|
923
1129
|
"document": {
|
|
924
1130
|
"title": extract_title(text, Path(source_path) if source_path else None, fmt),
|
|
@@ -926,13 +1132,13 @@ def build_extraction(text: str, raw: bytes, fmt: str,
|
|
|
926
1132
|
"sha256": sha,
|
|
927
1133
|
"source_path": source_path,
|
|
928
1134
|
},
|
|
929
|
-
"parties": extract_parties(
|
|
930
|
-
"dates": extract_dates(
|
|
931
|
-
"term": extract_term(
|
|
932
|
-
"governing_law": extract_governing_law(
|
|
1135
|
+
"parties": extract_parties(flat),
|
|
1136
|
+
"dates": extract_dates(flat),
|
|
1137
|
+
"term": extract_term(flat),
|
|
1138
|
+
"governing_law": extract_governing_law(flat),
|
|
933
1139
|
"clauses": extract_clauses(text),
|
|
934
|
-
"defined_terms": extract_defined_terms(
|
|
935
|
-
"value": extract_value(
|
|
1140
|
+
"defined_terms": extract_defined_terms(flat),
|
|
1141
|
+
"value": extract_value(flat),
|
|
936
1142
|
"_meta": {
|
|
937
1143
|
"extractor_version": EXTRACTOR_VERSION,
|
|
938
1144
|
"tiers_used": ["deterministic"],
|
|
@@ -1244,7 +1450,7 @@ def output_schema() -> JSON:
|
|
|
1244
1450
|
"required": ["title", "format", "sha256", "source_path"],
|
|
1245
1451
|
"properties": {
|
|
1246
1452
|
"title": {"type": ["string", "null"]},
|
|
1247
|
-
"format": {"enum": ["markdown", "text", "docx", "pdf"]},
|
|
1453
|
+
"format": {"enum": ["markdown", "text", "docx", "pdf", "html"]},
|
|
1248
1454
|
"sha256": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
|
|
1249
1455
|
"source_path": {"type": ["string", "null"]},
|
|
1250
1456
|
},
|
|
@@ -1293,7 +1499,7 @@ def output_schema() -> JSON:
|
|
|
1293
1499
|
"properties": {
|
|
1294
1500
|
"canonical_title": {"type": ["string", "null"]},
|
|
1295
1501
|
"detected_title": {"type": "string"},
|
|
1296
|
-
"tier": {"enum": ["h2", "bold-numbered", "all-caps", "explicit", "llm"]},
|
|
1502
|
+
"tier": {"enum": ["h2", "bold-numbered", "numbered", "all-caps", "explicit", "llm"]},
|
|
1297
1503
|
"span": {
|
|
1298
1504
|
"type": "object",
|
|
1299
1505
|
"required": ["start", "end"],
|
|
@@ -1595,7 +1801,7 @@ def _add_common_output_flags(p: argparse.ArgumentParser) -> None:
|
|
|
1595
1801
|
def build_parser() -> argparse.ArgumentParser:
|
|
1596
1802
|
parser = argparse.ArgumentParser(
|
|
1597
1803
|
prog="extract",
|
|
1598
|
-
description="Ingest any contract (.md/.txt/.docx/.pdf) and emit structured "
|
|
1804
|
+
description="Ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured "
|
|
1599
1805
|
"JSON for the contract-ops CLI suite. See docs/INTEROP.md.",
|
|
1600
1806
|
)
|
|
1601
1807
|
parser.add_argument("-V", "--version", action="version",
|
|
@@ -1629,7 +1835,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
1629
1835
|
|
|
1630
1836
|
|
|
1631
1837
|
def _build_extract_args(p: argparse.ArgumentParser) -> None:
|
|
1632
|
-
p.add_argument("path", help="Path to the document (.md/.txt/.docx/.pdf).")
|
|
1838
|
+
p.add_argument("path", help="Path to the document (.md/.txt/.html/.docx/.pdf).")
|
|
1633
1839
|
p.add_argument("--llm", action="store_true",
|
|
1634
1840
|
help="Opt-in LLM enrichment of fuzzy fields (renewal, obligations). "
|
|
1635
1841
|
"Off by default; the deterministic core is fully useful without it.")
|
|
@@ -4,8 +4,8 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.
|
|
8
|
-
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON."
|
|
7
|
+
version = "0.1.2"
|
|
8
|
+
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
11
11
|
license = { text = "MIT" }
|
|
@@ -20,7 +20,8 @@ from tests._fixtures_build import ensure_binary_fixtures # noqa: E402
|
|
|
20
20
|
FIXTURES = Path(__file__).resolve().parent / "fixtures"
|
|
21
21
|
|
|
22
22
|
DOCS = ["nda_h2.md", "services_bold.txt", "lease_allcaps.txt",
|
|
23
|
-
"employment_docx.docx", "license_pdf.pdf", "
|
|
23
|
+
"employment_docx.docx", "license_pdf.pdf", "services_html.html",
|
|
24
|
+
"scanned.pdf"]
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
def golden_for(name: str) -> dict:
|
|
@@ -121,6 +121,11 @@
|
|
|
121
121
|
"confidence": 0.6,
|
|
122
122
|
"source": "deterministic"
|
|
123
123
|
},
|
|
124
|
+
{
|
|
125
|
+
"term": "Disclosing Party",
|
|
126
|
+
"confidence": 0.6,
|
|
127
|
+
"source": "deterministic"
|
|
128
|
+
},
|
|
124
129
|
{
|
|
125
130
|
"term": "Receiving Party",
|
|
126
131
|
"confidence": 0.6,
|
|
@@ -138,7 +143,7 @@
|
|
|
138
143
|
"source": "none"
|
|
139
144
|
},
|
|
140
145
|
"_meta": {
|
|
141
|
-
"extractor_version": "0.1.
|
|
146
|
+
"extractor_version": "0.1.2",
|
|
142
147
|
"tiers_used": [
|
|
143
148
|
"deterministic"
|
|
144
149
|
],
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html>
|
|
3
|
+
<head>
|
|
4
|
+
<title>Exhibit 10.1</title>
|
|
5
|
+
<style>body { font-family: serif; } .hidden { display:none; }</style>
|
|
6
|
+
<script>var x = "(this should never appear in output)";</script>
|
|
7
|
+
</head>
|
|
8
|
+
<body>
|
|
9
|
+
<p align="center"><b>MASTER SERVICES AGREEMENT</b></p>
|
|
10
|
+
|
|
11
|
+
<p>This Master Services Agreement (the “Agreement”) is entered
|
|
12
|
+
into as of March 15, 2023 (the "Effective Date"), by and between
|
|
13
|
+
Initrode Systems, Inc., a Delaware corporation (“Provider”),
|
|
14
|
+
and Hooli LLC (“Customer”).</p>
|
|
15
|
+
|
|
16
|
+
<p>1. Services</p>
|
|
17
|
+
<p>Provider shall perform the services described in each Statement of Work.</p>
|
|
18
|
+
|
|
19
|
+
<p>2. Fees and Payment</p>
|
|
20
|
+
<p>Customer shall pay Provider the fees set forth in the applicable Statement
|
|
21
|
+
of Work, not to exceed $500,000 in the aggregate.</p>
|
|
22
|
+
|
|
23
|
+
<p>3. Term and Termination</p>
|
|
24
|
+
<p>The initial term of this Agreement is two (2) years. Either party may
|
|
25
|
+
terminate upon sixty (60) days’ written notice. This Agreement shall
|
|
26
|
+
automatically renew for successive one-year terms.</p>
|
|
27
|
+
|
|
28
|
+
<p>4. Confidentiality</p>
|
|
29
|
+
<p>Each party shall protect the other’s “Confidential
|
|
30
|
+
Information” using reasonable care.</p>
|
|
31
|
+
|
|
32
|
+
<p>5. Governing Law</p>
|
|
33
|
+
<p>This Agreement shall be governed by the laws of the State of California.</p>
|
|
34
|
+
</body>
|
|
35
|
+
</html>
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
{
|
|
2
|
+
"document": {
|
|
3
|
+
"title": "MASTER SERVICES AGREEMENT",
|
|
4
|
+
"format": "html",
|
|
5
|
+
"sha256": "088b40f13135e6b5d8f8548b162d657f10725d348388c7c3a416d11d7fc65300",
|
|
6
|
+
"source_path": "services_html.html"
|
|
7
|
+
},
|
|
8
|
+
"parties": [
|
|
9
|
+
{
|
|
10
|
+
"name": "Initrode Systems, Inc.",
|
|
11
|
+
"confidence": 0.9,
|
|
12
|
+
"source": "deterministic",
|
|
13
|
+
"role": "Provider"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"name": "Hooli LLC",
|
|
17
|
+
"confidence": 0.9,
|
|
18
|
+
"source": "deterministic",
|
|
19
|
+
"role": "Customer"
|
|
20
|
+
}
|
|
21
|
+
],
|
|
22
|
+
"dates": {
|
|
23
|
+
"effective": {
|
|
24
|
+
"value": "2023-03-15",
|
|
25
|
+
"confidence": 0.9,
|
|
26
|
+
"source": "deterministic"
|
|
27
|
+
},
|
|
28
|
+
"expiration": {
|
|
29
|
+
"value": null,
|
|
30
|
+
"confidence": 0.0,
|
|
31
|
+
"source": "none"
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"term": {
|
|
35
|
+
"length": {
|
|
36
|
+
"value": "2 years",
|
|
37
|
+
"confidence": 0.7,
|
|
38
|
+
"source": "deterministic"
|
|
39
|
+
},
|
|
40
|
+
"auto_renew": {
|
|
41
|
+
"value": true,
|
|
42
|
+
"confidence": 0.65,
|
|
43
|
+
"source": "deterministic"
|
|
44
|
+
},
|
|
45
|
+
"notice_period_days": {
|
|
46
|
+
"value": 60,
|
|
47
|
+
"confidence": 0.7,
|
|
48
|
+
"source": "deterministic"
|
|
49
|
+
}
|
|
50
|
+
},
|
|
51
|
+
"governing_law": {
|
|
52
|
+
"value": "State of California",
|
|
53
|
+
"confidence": 0.85,
|
|
54
|
+
"source": "deterministic"
|
|
55
|
+
},
|
|
56
|
+
"clauses": [
|
|
57
|
+
{
|
|
58
|
+
"canonical_title": "Services",
|
|
59
|
+
"detected_title": "1. Services",
|
|
60
|
+
"tier": "numbered",
|
|
61
|
+
"span": {
|
|
62
|
+
"start": 242,
|
|
63
|
+
"end": 329
|
|
64
|
+
},
|
|
65
|
+
"confidence": 0.6,
|
|
66
|
+
"source": "deterministic",
|
|
67
|
+
"mapped": false
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
"canonical_title": "Payment",
|
|
71
|
+
"detected_title": "2. Fees and Payment",
|
|
72
|
+
"tier": "numbered",
|
|
73
|
+
"span": {
|
|
74
|
+
"start": 329,
|
|
75
|
+
"end": 476
|
|
76
|
+
},
|
|
77
|
+
"confidence": 0.8,
|
|
78
|
+
"source": "deterministic",
|
|
79
|
+
"mapped": true
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"canonical_title": "Termination",
|
|
83
|
+
"detected_title": "3. Term and Termination",
|
|
84
|
+
"tier": "numbered",
|
|
85
|
+
"span": {
|
|
86
|
+
"start": 476,
|
|
87
|
+
"end": 692
|
|
88
|
+
},
|
|
89
|
+
"confidence": 0.8,
|
|
90
|
+
"source": "deterministic",
|
|
91
|
+
"mapped": true
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
"canonical_title": "Confidentiality",
|
|
95
|
+
"detected_title": "4. Confidentiality",
|
|
96
|
+
"tier": "numbered",
|
|
97
|
+
"span": {
|
|
98
|
+
"start": 692,
|
|
99
|
+
"end": 800
|
|
100
|
+
},
|
|
101
|
+
"confidence": 0.8,
|
|
102
|
+
"source": "deterministic",
|
|
103
|
+
"mapped": true
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"canonical_title": "Governing Law",
|
|
107
|
+
"detected_title": "5. Governing Law",
|
|
108
|
+
"tier": "numbered",
|
|
109
|
+
"span": {
|
|
110
|
+
"start": 800,
|
|
111
|
+
"end": 890
|
|
112
|
+
},
|
|
113
|
+
"confidence": 0.8,
|
|
114
|
+
"source": "deterministic",
|
|
115
|
+
"mapped": true
|
|
116
|
+
}
|
|
117
|
+
],
|
|
118
|
+
"defined_terms": [
|
|
119
|
+
{
|
|
120
|
+
"term": "Agreement",
|
|
121
|
+
"confidence": 0.6,
|
|
122
|
+
"source": "deterministic"
|
|
123
|
+
},
|
|
124
|
+
{
|
|
125
|
+
"term": "Effective Date",
|
|
126
|
+
"confidence": 0.6,
|
|
127
|
+
"source": "deterministic"
|
|
128
|
+
},
|
|
129
|
+
{
|
|
130
|
+
"term": "Provider",
|
|
131
|
+
"confidence": 0.6,
|
|
132
|
+
"source": "deterministic"
|
|
133
|
+
},
|
|
134
|
+
{
|
|
135
|
+
"term": "Customer",
|
|
136
|
+
"confidence": 0.6,
|
|
137
|
+
"source": "deterministic"
|
|
138
|
+
},
|
|
139
|
+
{
|
|
140
|
+
"term": "Confidential Information",
|
|
141
|
+
"confidence": 0.6,
|
|
142
|
+
"source": "deterministic"
|
|
143
|
+
}
|
|
144
|
+
],
|
|
145
|
+
"value": {
|
|
146
|
+
"value": "$500,000",
|
|
147
|
+
"confidence": 0.6,
|
|
148
|
+
"source": "deterministic"
|
|
149
|
+
},
|
|
150
|
+
"_meta": {
|
|
151
|
+
"extractor_version": "0.1.2",
|
|
152
|
+
"tiers_used": [
|
|
153
|
+
"deterministic"
|
|
154
|
+
],
|
|
155
|
+
"llm_used": false
|
|
156
|
+
}
|
|
157
|
+
}
|
|
@@ -25,6 +25,50 @@ def test_tier3_all_caps() -> None:
|
|
|
25
25
|
assert [c["tier"] for c in clauses] == ["all-caps", "all-caps"]
|
|
26
26
|
|
|
27
27
|
|
|
28
|
+
def test_tier_numbered_plain_headings() -> None:
|
|
29
|
+
# Real-world dominant format: plain numbered, mixed-case, unbolded headings.
|
|
30
|
+
text = ("1. Term And Nature Of Employment\n\nbody about term\n\n"
|
|
31
|
+
"2. Wage Compensation\n\nbody about wages\n\n"
|
|
32
|
+
"5. Termination\n\nbody about termination")
|
|
33
|
+
clauses = ex.detect_clauses(text)
|
|
34
|
+
assert [c["tier"] for c in clauses] == ["numbered", "numbered", "numbered"]
|
|
35
|
+
assert clauses[0]["title"] == "Term And Nature Of Employment"
|
|
36
|
+
assert clauses[2]["title"] == "Termination"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_numbered_heading_rejects_sentences() -> None:
|
|
40
|
+
# "1. The Company shall pay..." is a numbered sentence, not a heading.
|
|
41
|
+
assert ex._qualifies_as_numbered_heading("Wage Compensation")
|
|
42
|
+
assert ex._qualifies_as_numbered_heading("Term And Nature Of Employment")
|
|
43
|
+
assert ex._qualifies_as_numbered_heading("Termination")
|
|
44
|
+
assert not ex._qualifies_as_numbered_heading("The Company shall pay the Employee monthly")
|
|
45
|
+
assert not ex._qualifies_as_numbered_heading("Fee") # single word < 4 letters
|
|
46
|
+
assert not ex._qualifies_as_numbered_heading(
|
|
47
|
+
"EMPLOYEE shall be compensated on the basis of an annual salary")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_numbered_section_article_prefixes() -> None:
|
|
51
|
+
text = ("Section 1. Definitions\n\nx\n\nSection 2. Confidentiality\n\ny\n\n"
|
|
52
|
+
"Article IV. Governing Law\n\nz")
|
|
53
|
+
clauses = ex.detect_clauses(text)
|
|
54
|
+
assert all(c["tier"] == "numbered" for c in clauses)
|
|
55
|
+
assert clauses[0]["title"] == "Definitions"
|
|
56
|
+
assert clauses[2]["title"] == "Governing Law"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_numbered_does_not_shadow_bold() -> None:
|
|
60
|
+
# Bold-numbered must win over plain-numbered when both could match.
|
|
61
|
+
text = "**1. Purpose**\n\nx\n\n**2. Scope**\n\ny"
|
|
62
|
+
assert all(c["tier"] == "bold-numbered" for c in ex.detect_clauses(text))
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def test_trailing_period_stripped_from_titles() -> None:
|
|
66
|
+
canon, mapped = ex._canonicalize_clause("Other Benefits.")
|
|
67
|
+
assert canon == "Other Benefits"
|
|
68
|
+
# And a mapped clause with a trailing period still maps.
|
|
69
|
+
assert ex._canonicalize_clause("Survival.") == ("Survival", True)
|
|
70
|
+
|
|
71
|
+
|
|
28
72
|
def test_cascade_priority_h2_wins() -> None:
|
|
29
73
|
# An H2 present means the bold/all-caps fallbacks must not fire.
|
|
30
74
|
text = "## Real Heading\n\n**1. Not A Heading**\n\nALSO NOT A HEADING\n\nbody"
|
|
@@ -22,7 +22,7 @@ def test_version(capsys: pytest.CaptureFixture[str]) -> None:
|
|
|
22
22
|
with pytest.raises(SystemExit) as exc:
|
|
23
23
|
ex.main(["--version"])
|
|
24
24
|
assert exc.value.code == 0
|
|
25
|
-
assert "extract-cli
|
|
25
|
+
assert f"extract-cli {ex.__version__}" in capsys.readouterr().out
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def test_demo_runs(capsys: pytest.CaptureFixture[str]) -> None:
|
|
@@ -12,8 +12,8 @@ def test_parties_between_simple() -> None:
|
|
|
12
12
|
assert all(0.0 <= p["confidence"] <= 1.0 for p in parties)
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
def
|
|
16
|
-
text = ('by and between Acme Corp. (the "Disclosing
|
|
15
|
+
def test_parties_with_roles() -> None:
|
|
16
|
+
text = ('by and between Acme Corp. (the "Disclosing Party") and '
|
|
17
17
|
'Beta LLC (the "Receiving Party"), dated March 1, 2024.')
|
|
18
18
|
parties = ex.extract_parties(text)
|
|
19
19
|
assert parties[0]["name"] == "Acme Corp."
|
|
@@ -22,6 +22,30 @@ def test_parties_with_roles_and_linebreak() -> None:
|
|
|
22
22
|
assert parties[1]["role"] == "Receiving Party"
|
|
23
23
|
|
|
24
24
|
|
|
25
|
+
def test_parties_linebreak_handled_by_build() -> None:
|
|
26
|
+
# build_extraction flattens whitespace, so a party/role that wraps across a
|
|
27
|
+
# line is matched whole.
|
|
28
|
+
text = ('This Agreement is made by and between Acme Corp. (the "Disclosing\n'
|
|
29
|
+
'Party") and Beta LLC (the "Receiving Party").')
|
|
30
|
+
r = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
|
|
31
|
+
assert [p["name"] for p in r["parties"]] == ["Acme Corp.", "Beta LLC"]
|
|
32
|
+
assert r["parties"][0]["role"] == "Disclosing Party"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_parties_skip_and_inside_description() -> None:
|
|
36
|
+
# An "and" inside a party's own description must not split the parties.
|
|
37
|
+
text = ("between Blade Ventures Inc., a Nevada corporation having offices at "
|
|
38
|
+
"1 Main St and doing business as Foo (\"Client\"), and KPMG LP")
|
|
39
|
+
parties = ex.extract_parties(text)
|
|
40
|
+
assert [p["name"] for p in parties] == ["Blade Ventures Inc.", "KPMG LP"]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_party_name_descriptors_trimmed() -> None:
|
|
44
|
+
assert ex._clean_party_name("Visteon Corporation, a Delaware corporation") == "Visteon Corporation"
|
|
45
|
+
assert ex._clean_party_name("Foo Inc. doing business as Bar") == "Foo Inc."
|
|
46
|
+
assert ex._clean_party_name("Baz LLC having its principal office at X") == "Baz LLC"
|
|
47
|
+
|
|
48
|
+
|
|
25
49
|
def test_parties_none() -> None:
|
|
26
50
|
assert ex.extract_parties("There are no parties named here.") == []
|
|
27
51
|
|
|
@@ -39,6 +63,25 @@ def test_dates_iso_normalization() -> None:
|
|
|
39
63
|
assert out["source"] == "deterministic"
|
|
40
64
|
|
|
41
65
|
|
|
66
|
+
def test_dates_effective_date_label_and_as_of() -> None:
|
|
67
|
+
# The "(the "Effective Date")" anchor, with the date wrapping a newline.
|
|
68
|
+
text = 'between A and B as of August\n31, 2016 (the "Effective Date").'
|
|
69
|
+
assert ex.extract_dates(text)["effective"]["value"] == "2016-08-31"
|
|
70
|
+
# Bare "as of <date>" cue.
|
|
71
|
+
assert ex.extract_dates("dated as of June 1, 2023")["effective"]["value"] == "2023-06-01"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_term_length_rejects_non_number() -> None:
|
|
75
|
+
# "...for consecutive days" must NOT be reported as a term length.
|
|
76
|
+
text = "the Employment Period shall run for consecutive days as scheduled"
|
|
77
|
+
assert ex.extract_term(text)["length"]["source"] == "none"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_title_skips_sgml_wrapper() -> None:
|
|
81
|
+
text = "<DOCUMENT>\n<TYPE>EX-10\n<TEXT>\n\nEMPLOYMENT AGREEMENT\n\nbody"
|
|
82
|
+
assert ex.extract_title(text, None, "text") == "EMPLOYMENT AGREEMENT"
|
|
83
|
+
|
|
84
|
+
|
|
42
85
|
def test_dates_missing() -> None:
|
|
43
86
|
out = ex.extract_dates("no dates in here")
|
|
44
87
|
assert out["effective"] == ex._none_field()
|
|
@@ -61,6 +104,15 @@ def test_governing_law_stops_before_trailing_clause() -> None:
|
|
|
61
104
|
assert out["value"] == "State of Delaware"
|
|
62
105
|
|
|
63
106
|
|
|
107
|
+
def test_governing_law_linebreak_handled_by_build() -> None:
|
|
108
|
+
# A jurisdiction that wraps a line ("...the Province\nof Ontario") is
|
|
109
|
+
# matched whole because build_extraction flattens whitespace first.
|
|
110
|
+
text = ("This Agreement shall be governed by the laws of the Province\n"
|
|
111
|
+
"of Ontario and the federal laws of Canada.")
|
|
112
|
+
r = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
|
|
113
|
+
assert r["governing_law"]["value"] == "Province of Ontario"
|
|
114
|
+
|
|
115
|
+
|
|
64
116
|
def test_governing_law_missing() -> None:
|
|
65
117
|
assert ex.extract_governing_law("nothing about law")["source"] == "none"
|
|
66
118
|
|
|
@@ -142,6 +142,45 @@ def test_pdf_unescape() -> None:
|
|
|
142
142
|
assert ex._pdf_unescape(r"\101\102") == "AB" # octal escapes
|
|
143
143
|
|
|
144
144
|
|
|
145
|
+
def test_html_extraction() -> None:
|
|
146
|
+
raw, text, fmt, _w = ex.load_source(FIXTURES / "services_html.html")
|
|
147
|
+
assert fmt == "html"
|
|
148
|
+
# script/style content is dropped; entities are unescaped.
|
|
149
|
+
assert "this should never appear" not in text
|
|
150
|
+
result = ex.build_extraction(text, raw, fmt, "services_html.html")
|
|
151
|
+
assert result["document"]["format"] == "html"
|
|
152
|
+
assert [p["name"] for p in result["parties"]] == ["Initrode Systems, Inc.", "Hooli LLC"]
|
|
153
|
+
assert result["governing_law"]["value"] == "State of California"
|
|
154
|
+
assert result["dates"]["effective"]["value"] == "2023-03-15"
|
|
155
|
+
canon = {c["canonical_title"] for c in result["clauses"]}
|
|
156
|
+
assert {"Payment", "Termination", "Confidentiality", "Governing Law"} <= canon
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def test_html_detected_by_content_sniff(tmp_path: Any) -> None:
|
|
160
|
+
# HTML masquerading as .txt (e.g. a SEC EDGAR full submission) is sniffed.
|
|
161
|
+
p = tmp_path / "exhibit.txt"
|
|
162
|
+
p.write_text("<html><body><p>between A Co and B Co</p></body></html>")
|
|
163
|
+
_raw, _text, fmt, _w = ex.load_source(p)
|
|
164
|
+
assert fmt == "html"
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def test_html_malformed_does_not_crash() -> None:
|
|
168
|
+
assert ex._read_html("<p>unclosed <b>bold <div>text") is not None
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def test_pdf_text_only_inside_bt_et() -> None:
|
|
172
|
+
# Strings outside BT/ET (font/signature/metadata stream bytes that happen to
|
|
173
|
+
# contain parentheses) must be ignored; only text objects yield text.
|
|
174
|
+
content = b"(garbage outside) /Font << >> BT (real text) Tj ET (more garbage)"
|
|
175
|
+
assert ex._pdf_text_from_content(content) == "real text"
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def test_pdf_mostly_printable_backstop() -> None:
|
|
179
|
+
assert ex._mostly_printable("Hello, world")
|
|
180
|
+
assert not ex._mostly_printable("\x00\x01\x02\x03\x04\x05\x06\x07")
|
|
181
|
+
assert not ex._mostly_printable("")
|
|
182
|
+
|
|
183
|
+
|
|
145
184
|
def test_extract_json_object_from_noise() -> None:
|
|
146
185
|
assert ex._extract_json_object('prefix {"a": 1} suffix') == {"a": 1}
|
|
147
186
|
assert ex._extract_json_object("no json here") is None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|