lede 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lede/__init__.py +28 -0
- lede/_headings.py +111 -0
- lede/_parity.py +73 -0
- lede/_types.py +47 -0
- lede/brief.py +157 -0
- lede/clean.py +100 -0
- lede/cli.py +84 -0
- lede/coverage.py +108 -0
- lede/extract/__init__.py +37 -0
- lede/extract/_backends.py +69 -0
- lede/extract/_common.py +20 -0
- lede/extract/_yake.py +78 -0
- lede/extract/correlate.py +117 -0
- lede/extract/key_facts.py +135 -0
- lede/extract/metadata.py +78 -0
- lede/extract/outline.py +97 -0
- lede/extract/phrases.py +121 -0
- lede/extract/stats.py +412 -0
- lede/keyword.py +79 -0
- lede/sentences.py +78 -0
- lede/textrank.py +75 -0
- lede/tfidf.py +406 -0
- lede-0.3.0.dist-info/METADATA +505 -0
- lede-0.3.0.dist-info/RECORD +27 -0
- lede-0.3.0.dist-info/WHEEL +4 -0
- lede-0.3.0.dist-info/entry_points.txt +2 -0
- lede-0.3.0.dist-info/licenses/LICENSE +202 -0
lede/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""lede — deterministic extractive summarization.
|
|
2
|
+
|
|
3
|
+
Public API:
|
|
4
|
+
summarize(text, max_length, mode='default') -> SummaryResult
|
|
5
|
+
brief(text, *, overview_max, max_facts, include_phrases, format) -> str | dict
|
|
6
|
+
clean_text(text) -> str
|
|
7
|
+
strip_think(text) -> str
|
|
8
|
+
extract_keyword(text, keywords, num_sentences=10) -> str
|
|
9
|
+
set_default_backend(name) -> None # enrichment backend selector
|
|
10
|
+
"""
|
|
11
|
+
from lede.clean import clean_text, strip_think
|
|
12
|
+
from lede.tfidf import summarize
|
|
13
|
+
from lede.brief import brief
|
|
14
|
+
from lede.keyword import extract_keyword
|
|
15
|
+
from lede._types import SummaryResult
|
|
16
|
+
from lede.extract._backends import set_default_backend
|
|
17
|
+
|
|
18
|
+
__version__ = "0.3.0"
|
|
19
|
+
__all__ = [
|
|
20
|
+
"summarize",
|
|
21
|
+
"brief",
|
|
22
|
+
"clean_text",
|
|
23
|
+
"strip_think",
|
|
24
|
+
"extract_keyword",
|
|
25
|
+
"SummaryResult",
|
|
26
|
+
"set_default_backend",
|
|
27
|
+
"__version__",
|
|
28
|
+
]
|
lede/_headings.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Shared heading detection.
|
|
2
|
+
|
|
3
|
+
Any sentence matching any of these patterns is considered a heading and is
|
|
4
|
+
dropped from candidate selection in mode='default'. Also used by
|
|
5
|
+
extract.outline to discover section names.
|
|
6
|
+
"""
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
# Markdown ATX-style heading: one or more # followed by space and text
|
|
10
|
+
_MD_HEADING_RE = re.compile(r"^\s*#+\s+.+$")
|
|
11
|
+
|
|
12
|
+
# ALL-CAPS short line: 4-81 chars of A-Z/whitespace plus an optional trailing colon, no lowercase.
|
|
13
|
+
# Upper bound extended from 28 -> 80 in T13b so long document-title
|
|
14
|
+
# conventions like "SUPREME COURT OF THE UNITED STATES" (32 chars) match.
|
|
15
|
+
_ALLCAPS_RE = re.compile(r"^\s*[A-Z][A-Z\s]{3,80}:?\s*$")
|
|
16
|
+
|
|
17
|
+
# Short label ending in colon (1-30 chars before the colon)
|
|
18
|
+
_SHORT_LABEL_RE = re.compile(r"^\s*.{1,30}:\s*$")
|
|
19
|
+
|
|
20
|
+
# Bare title-case single-line heading (T13b Class A): starts uppercase,
|
|
21
|
+
# second char alphanumeric (not space/punct), up to ~60 chars total, no
|
|
22
|
+
# internal special chars, no terminal punctuation. Matches bare lines like
|
|
23
|
+
# "Abstract", "Introduction", "Modern methods", "Privacy Policy".
|
|
24
|
+
#
|
|
25
|
+
# The "no terminal punctuation" constraint is load-bearing: it excludes short
|
|
26
|
+
# body sentences like "Costs declined." that ended in a period, which was the
|
|
27
|
+
# T6 false-positive class we dropped the < 4 content-tokens fallback for.
|
|
28
|
+
_BARE_TITLE_RE = re.compile(r"^\s*[A-Z][A-Za-z0-9][A-Za-z0-9 ]{0,58}$")
|
|
29
|
+
|
|
30
|
+
# Numbered-section-prefix heading (T13b Class B): "\d+. Section Name" on its
|
|
31
|
+
# own line. Matches "1. Information We Collect" etc. The numeric prefix is
|
|
32
|
+
# stripped by `heading_name()` so the emitted name matches gold.
|
|
33
|
+
_NUMBERED_SECTION_RE = re.compile(r"^\s*\d+\.\s+[A-Z][A-Za-z0-9 ]{0,58}$")
|
|
34
|
+
|
|
35
|
+
# Title-with-dash heading (T13d): document title line of the form
|
|
36
|
+
# "Title — Metadata" (em-dash U+2014, en-dash U+2013, or hyphen-minus), e.g.
|
|
37
|
+
# "Privacy Policy — Effective Date: 2026-01-01". Authorized by the labeling
|
|
38
|
+
# protocol's Edge-case conventions (docs/extraction-gold-labeling.md).
|
|
39
|
+
# Requires a space on both sides of the dash so hyphenated single words
|
|
40
|
+
# ("state-of-the-art") don't match.
|
|
41
|
+
#
|
|
42
|
+
# The trailing character class `[^.!?]*$` (no terminal `.`/`!`/`?`) mirrors
|
|
43
|
+
# the `_BARE_TITLE_RE` safety constraint — excludes body sentences like
|
|
44
|
+
# "Main concern is pricing — our $50K quote is 40% above their Q2 budget."
|
|
45
|
+
# that would otherwise match. The dash-metadata suffix is stripped by
|
|
46
|
+
# `heading_name()`.
|
|
47
|
+
_TITLE_WITH_DASH_RE = re.compile(r"^\s*[A-Z][A-Za-z0-9 ]{0,58}\s+[—–\-]\s+\S[^.!?]*$")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Markdown heading depth helper — stripped from `## Section` etc.
|
|
51
|
+
_MD_DEPTH_RE = re.compile(r"^\s*(#+)\s+")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def is_structural_heading(sentence: str) -> bool:
    """Report whether `sentence` matches one of the explicit heading patterns.

    The patterns tried, in order, are: markdown ATX heading, all-caps line,
    short colon-terminated label, bare title line, numbered section line,
    and title-with-dash line. Unlike `is_heading`, there is no
    "fewer than 4 content tokens" fallback, so short body sentences are
    not reported as headings here.

    Blank or whitespace-only input is never a heading.
    """
    if sentence.strip() == "":
        return False
    structural_patterns = (
        _MD_HEADING_RE,
        _ALLCAPS_RE,
        _SHORT_LABEL_RE,
        _BARE_TITLE_RE,
        _NUMBERED_SECTION_RE,
        _TITLE_WITH_DASH_RE,
    )
    # `any` short-circuits on the first matching pattern, like the
    # sequential early returns it replaces.
    return any(
        pattern.match(sentence) is not None for pattern in structural_patterns
    )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def md_depth(heading_line: str) -> int:
    """Return the markdown heading depth of `heading_line`.

    The depth is the number of leading `#` characters (`#` = 1, `##` = 2,
    ...). Lines that do not start with a markdown heading marker get 1.
    """
    hashes = _MD_DEPTH_RE.match(heading_line)
    return len(hashes.group(1)) if hashes else 1
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def is_heading(sentence: str) -> bool:
    """Report whether `sentence` is a heading, including the short-line heuristic.

    A sentence counts as a heading when `is_structural_heading` accepts it,
    or when it is non-blank and contains fewer than 4 alphabetic runs of
    three or more letters (a rough "title-like" filter).
    """
    if is_structural_heading(sentence):
        return True
    # Blank input must be rejected explicitly: it has zero content words
    # and would otherwise pass the "< 4" heuristic below.
    if sentence.strip() == "":
        return False
    content_words = re.findall(r"[A-Za-z]{3,}", sentence)
    return len(content_words) < 4
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def heading_name(sentence: str) -> str | None:
|
|
99
|
+
"""Extract the name portion of a heading, or None if not a heading."""
|
|
100
|
+
s = sentence.strip()
|
|
101
|
+
if not s:
|
|
102
|
+
return None
|
|
103
|
+
# Strip markdown #'s, numeric-section prefix, em-dash metadata suffix,
|
|
104
|
+
# and trailing colon.
|
|
105
|
+
s = re.sub(r"^#+\s+", "", s)
|
|
106
|
+
s = re.sub(r"^\d+\.\s+", "", s)
|
|
107
|
+
s = re.sub(r"\s+[—–\-]\s+.*$", "", s) # T13d: strip em-dash metadata
|
|
108
|
+
s = s.rstrip(":").strip()
|
|
109
|
+
if not s:
|
|
110
|
+
return None
|
|
111
|
+
return s
|
lede/_parity.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Canonical text formatters for v0.2 extract primitives — used by the
|
|
2
|
+
cross-runtime parity walker (`rust/tests/fixtures.rs`) and the Python
|
|
3
|
+
generator (`benchmarks/gen_parity_fixtures.py`).
|
|
4
|
+
|
|
5
|
+
Each formatter takes the primitive's structured output and emits a
|
|
6
|
+
deterministic, line-oriented text representation. Python and Rust
|
|
7
|
+
have byte-identical implementations so a fixture generated from
|
|
8
|
+
Python output can be byte-compared against the Rust output for the
|
|
9
|
+
same input.
|
|
10
|
+
|
|
11
|
+
Format choices:
|
|
12
|
+
- One record per line.
|
|
13
|
+
- Fields within a record separated by ` | ` (space-pipe-space).
|
|
14
|
+
- Trailing newline at end of file (mirror of how Python's `\\n`.join
|
|
15
|
+
+ final write_text behaves).
|
|
16
|
+
- Empty-output → empty file (no trailing newline either).
|
|
17
|
+
|
|
18
|
+
Mirrors: rust/src/_parity.rs.
|
|
19
|
+
"""
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from ._types import Metadata, PhraseFact, Section, Stat
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _join_lines(lines: list[str]) -> str:
    """Join records with newlines and terminate the result with a newline.

    An empty list yields the empty string (no trailing newline), matching
    the "empty output -> empty file" rule in the module docstring.
    """
    return "\n".join(lines) + "\n" if lines else ""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def format_stats(stats: tuple[Stat, ...]) -> str:
    """Render each Stat as one ` | `-separated line.

    Field order per line: stat_type, value, unit, phrase, context_sentence.
    """
    rows: list[str] = []
    for stat in stats:
        rows.append(
            f"{stat.stat_type} | {stat.value} | {stat.unit} | {stat.phrase} | {stat.context_sentence}"
        )
    return _join_lines(rows)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def format_outline(outline: tuple[Section, ...]) -> str:
    """Render each Section as one ` | `-separated line.

    Field order per line: depth, name, representative_sentence.
    """
    rows: list[str] = []
    for section in outline:
        rows.append(
            f"{section.depth} | {section.name} | {section.representative_sentence}"
        )
    return _join_lines(rows)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def format_toc(toc: tuple[str, ...]) -> str:
    """Render the table of contents, one entry per line."""
    entries = [*toc]
    return _join_lines(entries)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def format_metadata(md: Metadata) -> str:
    """Render Metadata as four labelled lines.

    The lines are always emitted in the order DATES, AMOUNTS, URLS,
    ENTITIES; each holds the corresponding values joined with `, `
    (nothing after the label when the tuple is empty).
    """
    labelled = (
        ("DATES: ", md.dates),
        ("AMOUNTS: ", md.amounts),
        ("URLS: ", md.urls),
        ("ENTITIES: ", md.entities),
    )
    return _join_lines([label + ", ".join(values) for label, values in labelled])
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def format_phrases(phrases: tuple[str, ...]) -> str:
    """Render the phrases, one per line."""
    entries = [*phrases]
    return _join_lines(entries)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def format_key_facts(facts: tuple[str, ...]) -> str:
    """Render the key-fact sentences, one per line."""
    entries = [*facts]
    return _join_lines(entries)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def format_correlate_facts(facts: tuple[PhraseFact, ...]) -> str:
    """Render each PhraseFact as one ` | `-separated line.

    Field order per line: entity, number, polarity, sentence.
    """
    rows: list[str] = []
    for fact in facts:
        rows.append(
            f"{fact.entity} | {fact.number} | {fact.polarity} | {fact.sentence}"
        )
    return _join_lines(rows)
|
lede/_types.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""v0.2.0 return types (frozen dataclasses)."""
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass(frozen=True)
class Stat:
    """Immutable record for one extracted statistic; every field is a string.

    Field order is part of the generated positional constructor, so do not
    reorder the declarations below.
    """

    value: str
    unit: str
    phrase: str
    # presumably the sentence the statistic was found in — confirm against
    # the extractor that builds these records
    context_sentence: str
    stat_type: str  # "money" | "percent" | "count" | "date" | "duration"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class Section:
    """Immutable record for one outline entry.

    Field order is part of the generated positional constructor, so do not
    reorder the declarations below.
    """

    # presumably the heading nesting level (cf. markdown `#` depth) — confirm
    depth: int
    name: str
    representative_sentence: str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class Metadata:
    """Immutable bundle of string tuples; every field defaults to an empty tuple."""

    dates: tuple[str, ...] = ()
    amounts: tuple[str, ...] = ()
    urls: tuple[str, ...] = ()
    entities: tuple[str, ...] = ()  # populated only when lede[ner] installed (Task 9)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class PhraseFact:
    """Immutable record pairing an entity with a number; every field is a string.

    Field order is part of the generated positional constructor, so do not
    reorder the declarations below.
    """

    entity: str
    number: str
    polarity: str  # "absolute" | "growth" | "decline" | "unknown"
    # presumably the source sentence the pair was taken from — confirm
    sentence: str
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(frozen=True)
class SummaryResult:
    """Immutable summary result: the summary text plus optional enrichments.

    Only `summary` is required; every enrichment field defaults to None.
    `str(result)` returns the summary text.
    """

    summary: str
    stats: tuple[Stat, ...] | None = None
    outline: tuple[Section, ...] | None = None
    metadata: Metadata | None = None
    phrases: tuple[str, ...] | None = None
    correlated_facts: tuple[PhraseFact, ...] | None = None

    def __str__(self) -> str:
        """Return the summary text so the result can be used like a string."""
        return self.summary
|
lede/brief.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""At-a-glance document brief — composes summarize + key_facts + toc.
|
|
2
|
+
|
|
3
|
+
Top-level convenience primitive for "reader brief" use cases: email
|
|
4
|
+
digests, file-browser previews, ingest-pipeline pre-summarization.
|
|
5
|
+
Agnostic of document type — no per-doc heuristics. Callers who want
|
|
6
|
+
different composition should call the underlying primitives directly.
|
|
7
|
+
|
|
8
|
+
Mirrors rust/src/brief.rs. Keep the three output formats (string,
|
|
9
|
+
markdown, dict) byte-identical across Python and Rust for the regex
|
|
10
|
+
backend.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from .tfidf import summarize
|
|
15
|
+
from .extract.key_facts import key_facts
|
|
16
|
+
from .extract.outline import toc
|
|
17
|
+
from .extract.phrases import phrases
|
|
18
|
+
|
|
19
|
+
# Auto-detect the wordforms extra at import time. When available,
|
|
20
|
+
# brief() forwards convert_word_names=True to its internal key_facts()
|
|
21
|
+
# call so spelled-out numbers ("five thousand documents", "twelve lines")
|
|
22
|
+
# surface in the key facts section. Mirrors the same pattern used in
|
|
23
|
+
# benchmarks/extraction_eval.py::_STATS_WORDFORMS.
|
|
24
|
+
try:
|
|
25
|
+
import text_to_num # noqa: F401
|
|
26
|
+
|
|
27
|
+
_HAS_WORDFORMS = True
|
|
28
|
+
except ImportError:
|
|
29
|
+
_HAS_WORDFORMS = False
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Overview budget clamps. Floor keeps very short inputs readable; ceiling
|
|
33
|
+
# keeps very long inputs actually brief (a 30 KB doc at 0.35 would produce
|
|
34
|
+
# a 10 KB "overview" which defeats the purpose).
|
|
35
|
+
_OVERVIEW_MIN_FRAC = 0.05
|
|
36
|
+
_OVERVIEW_MAX_FRAC = 0.50
|
|
37
|
+
_OVERVIEW_MIN_CHARS = 250
|
|
38
|
+
_OVERVIEW_MAX_CHARS = 1500
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def brief(
    text: str,
    *,
    overview_max: float = 0.35,
    max_facts: int = 10,
    include_phrases: bool = False,
    convert_word_names: bool | None = None,
    format: str = "string",
) -> str | dict:
    """Produce an at-a-glance brief of a document.

    Composes ``summarize()`` (overview) + ``extract.key_facts()`` +
    ``extract.toc()`` into a single caller-friendly artifact. Optional
    ``extract.phrases()`` via ``include_phrases=True``.

    Args:
        text: input document text.
        overview_max: fraction of source length to budget for the overview.
            Clamped to ``[0.05, 0.50]``. Default 0.35. The resolved
            char budget is further clamped to ``[250, 1500]`` so short
            docs still get a readable overview and long docs stay brief.
        max_facts: cap on key-facts sentences. Default 10.
        include_phrases: when True, append a key-phrases section (regex
            backend). Default False.
        convert_word_names: forward to ``key_facts()`` so spelled-out
            numbers ("five thousand documents") surface in the
            key-facts section. ``None`` (default) uses the module's
            import-time detection of the ``text_to_num`` module (the
            ``text2num`` distribution). Pass ``True`` / ``False``
            explicitly to lock the behavior — useful when you need
            output to match a Rust binary built with or without the
            ``wordforms`` cargo feature regardless of which Python
            extras happen to be installed.
        format: output shape. One of:
            - ``"string"`` (default) — plain text with section labels.
            - ``"markdown"`` — ``##`` headers + bullet lists.
            - ``"dict"`` — structured dict with overview/key_facts/toc/phrases.

    Returns:
        ``str`` for ``"string"`` / ``"markdown"`` formats, ``dict`` for
        ``"dict"`` format. In the dict, ``"phrases"`` is ``None`` unless
        ``include_phrases`` is true.

    Raises:
        ValueError: when ``format`` is not one of the three supported
            values. Raised before any summarization or extraction runs.

    Notes on cross-runtime parity:
        Python's auto-detect (``convert_word_names=None``) is at import
        time; Rust's equivalent is the compile-time ``wordforms`` cargo
        feature. The two can disagree silently when (a) the Python
        process has ``text2num`` installed for an unrelated tool and
        (b) the Rust binary was built without ``--features wordforms``
        — same input, different brief bytes. Pass an explicit
        ``convert_word_names=True`` (or ``False``) on both sides to
        lock parity.
    """
    # Fail fast: reject an unsupported format before running the
    # summarize / key_facts / toc / phrases pipeline.
    if format not in ("string", "markdown", "dict"):
        raise ValueError(
            f"format must be 'string', 'markdown', or 'dict'; got {format!r}"
        )

    # Clamp overview_max to sane fraction bounds, then clamp the resolved
    # character budget as well.
    overview_max = max(_OVERVIEW_MIN_FRAC, min(_OVERVIEW_MAX_FRAC, overview_max))
    budget = int(len(text) * overview_max)
    budget = max(_OVERVIEW_MIN_CHARS, min(_OVERVIEW_MAX_CHARS, budget))

    overview_result = summarize(text, max_length=budget)
    overview_text = overview_result.summary.rstrip()

    # An explicit True/False from the caller overrides the import-time
    # auto-detection.
    use_wordforms = _HAS_WORDFORMS if convert_word_names is None else convert_word_names
    facts = key_facts(
        text,
        max_facts=max_facts,
        convert_word_names=use_wordforms,
    )

    sections = toc(text)

    phrases_list: tuple[str, ...] = phrases(text) if include_phrases else ()

    if format == "dict":
        return {
            "overview": overview_text,
            "key_facts": list(facts),
            "toc": list(sections),
            "phrases": list(phrases_list) if include_phrases else None,
        }

    if format == "markdown":
        parts: list[str] = ["## Overview\n", overview_text, ""]
        if facts:
            parts.append("\n## Key facts\n")
            for f in facts:
                parts.append(f"- {f}")
            parts.append("")
        if sections:
            parts.append("\n## Also in this doc\n")
            for s in sections:
                parts.append(f"- {s}")
            parts.append("")
        if include_phrases and phrases_list:
            parts.append("\n## Key phrases\n")
            parts.append(" · ".join(phrases_list))
            parts.append("")
        # NOTE: unlike the "string" format, no trailing-whitespace
        # normalization is applied here; kept as-is for output parity.
        return "\n".join(parts)

    # format == "string" (validated above).
    parts = ["Overview:", overview_text, ""]
    if facts:
        parts.append("Key facts:")
        for f in facts:
            parts.append(f"  - {f}")
        parts.append("")
    if sections:
        parts.append("Also in this doc:")
        for s in sections:
            parts.append(f"  - {s}")
        parts.append("")
    if include_phrases and phrases_list:
        parts.append("Key phrases:")
        parts.append("  " + ", ".join(phrases_list))
        parts.append("")
    # Collapse trailing blank lines to exactly one terminating newline.
    return "\n".join(parts).rstrip() + "\n"
|
lede/clean.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Text cleaners: clean_text (markdown + filler + boilerplate) and strip_think
|
|
2
|
+
(reasoning-model <think>...</think> blocks).
|
|
3
|
+
|
|
4
|
+
Both functions are deterministic and stdlib-only.
|
|
5
|
+
"""
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
_THINK_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def strip_think(text: str) -> str:
|
|
12
|
+
"""Remove <think>...</think> blocks and trim surrounding whitespace."""
|
|
13
|
+
if text is None:
|
|
14
|
+
return ""
|
|
15
|
+
return _THINK_RE.sub("", text).strip()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# --- clean_text — pipeline ---
|
|
19
|
+
#
|
|
20
|
+
# Steps in order:
|
|
21
|
+
# 1. Strip markdown (*, _, #, ---, bullets, numbered list prefixes)
|
|
22
|
+
# 2. Remove filler phrases
|
|
23
|
+
# 3. Remove filler words
|
|
24
|
+
# 4. Remove CRM boilerplate
|
|
25
|
+
# 5. Lowercase
|
|
26
|
+
# 6. Collapse whitespace and blank lines
|
|
27
|
+
# 7. Trim
|
|
28
|
+
|
|
29
|
+
_FILLER_PHRASES = re.compile(
|
|
30
|
+
r"\b(just wanted to|i just wanted to|wanted to follow up|as discussed"
|
|
31
|
+
r"|per our conversation|as mentioned|going forward|at the end of the day"
|
|
32
|
+
r"|in terms of|with respect to|in regards to|please find attached"
|
|
33
|
+
r"|hope this helps|let me know if you have any questions"
|
|
34
|
+
r"|looking forward to hearing from you)\b",
|
|
35
|
+
re.IGNORECASE,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
_FILLER_WORDS = re.compile(
|
|
39
|
+
r"\b(basically|essentially|actually|literally|honestly|frankly"
|
|
40
|
+
r"|obviously|clearly|simply|really|very|quite|rather|pretty much"
|
|
41
|
+
r"|kind of|sort of|in order to|due to the fact that"
|
|
42
|
+
r"|at this point in time|for all intents and purposes)\b",
|
|
43
|
+
re.IGNORECASE,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
_CRM_PATTERNS = [
|
|
47
|
+
re.compile(r"\bNo update[s]?\b\.?", re.IGNORECASE),
|
|
48
|
+
re.compile(r"\bCalendar invite sent[.]?\b", re.IGNORECASE),
|
|
49
|
+
re.compile(
|
|
50
|
+
r"\bSent (proposal|case study|documentation|overview|pricing)"
|
|
51
|
+
r" (documentation |via email|as requested)?\.?\b",
|
|
52
|
+
re.IGNORECASE,
|
|
53
|
+
),
|
|
54
|
+
re.compile(r"\bWaiting (on|for) callback\.?\b", re.IGNORECASE),
|
|
55
|
+
re.compile(r"\bUpdated CRM with latest info\.?\b", re.IGNORECASE),
|
|
56
|
+
re.compile(r"\bMeeting confirmed for next week\.?\b", re.IGNORECASE),
|
|
57
|
+
re.compile(r"\bFollowing standard sales process\.?\b", re.IGNORECASE),
|
|
58
|
+
re.compile(r"\bMeeting went as expected\.?\b", re.IGNORECASE),
|
|
59
|
+
]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def clean_text(text: str | None) -> str:
|
|
63
|
+
"""Strip markdown, filler phrases, filler words, and CRM boilerplate;
|
|
64
|
+
lowercase and normalize whitespace.
|
|
65
|
+
|
|
66
|
+
Returns the empty string for None or empty input.
|
|
67
|
+
"""
|
|
68
|
+
if not text:
|
|
69
|
+
return ""
|
|
70
|
+
|
|
71
|
+
result = text
|
|
72
|
+
|
|
73
|
+
# 1. Markdown formatting
|
|
74
|
+
result = re.sub(r"\*{1,3}", "", result)
|
|
75
|
+
result = re.sub(r"_{1,3}", "", result)
|
|
76
|
+
result = re.sub(r"^#{1,6}\s*", "", result, flags=re.MULTILINE)
|
|
77
|
+
result = re.sub(r"^-{3,}$", "", result, flags=re.MULTILINE)
|
|
78
|
+
result = re.sub(r"^\s*[-*+]\s+", "", result, flags=re.MULTILINE)
|
|
79
|
+
result = re.sub(r"^\s*\d+\.\s+", "", result, flags=re.MULTILINE)
|
|
80
|
+
|
|
81
|
+
# 2-3. Filler
|
|
82
|
+
result = _FILLER_PHRASES.sub("", result)
|
|
83
|
+
result = _FILLER_WORDS.sub("", result)
|
|
84
|
+
|
|
85
|
+
# 4. CRM boilerplate
|
|
86
|
+
for pattern in _CRM_PATTERNS:
|
|
87
|
+
result = pattern.sub("", result)
|
|
88
|
+
|
|
89
|
+
# 5. Lowercase
|
|
90
|
+
result = result.lower()
|
|
91
|
+
|
|
92
|
+
# 6. Whitespace normalization
|
|
93
|
+
result = re.sub(r"[ \t]+", " ", result)
|
|
94
|
+
result = re.sub(r"\n\s*\n+", "\n", result)
|
|
95
|
+
result = re.sub(r"^\s+", "", result, flags=re.MULTILINE)
|
|
96
|
+
result = re.sub(r"\s+$", "", result, flags=re.MULTILINE)
|
|
97
|
+
result = re.sub(r"^\s*$\n?", "", result, flags=re.MULTILINE)
|
|
98
|
+
|
|
99
|
+
# 7. Trim
|
|
100
|
+
return result.strip()
|
lede/cli.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""lede CLI.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
lede [FILE] --mode {tfidf,keyword,clean_text,strip_think} [OPTIONS]
|
|
5
|
+
|
|
6
|
+
Reads FILE or stdin, writes summary to stdout.
|
|
7
|
+
"""
|
|
8
|
+
import argparse
|
|
9
|
+
import sys
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from lede import summarize, clean_text, strip_think, extract_keyword
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _read_input(path: str | None) -> str:
|
|
16
|
+
# Force UTF-8 — relying on locale gives Windows users mojibake on
|
|
17
|
+
# non-ASCII content and breaks the byte-identical-runtime claim
|
|
18
|
+
# (the Rust CLI reads files as bytes and decodes UTF-8 explicitly).
|
|
19
|
+
if path and path != "-":
|
|
20
|
+
return Path(path).read_text(encoding="utf-8")
|
|
21
|
+
if hasattr(sys.stdin, "buffer"):
|
|
22
|
+
return sys.stdin.buffer.read().decode("utf-8")
|
|
23
|
+
return sys.stdin.read()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def main(argv: list[str] | None = None) -> int:
|
|
27
|
+
parser = argparse.ArgumentParser(
|
|
28
|
+
prog="lede",
|
|
29
|
+
description="Deterministic extractive summarization.",
|
|
30
|
+
)
|
|
31
|
+
parser.add_argument(
|
|
32
|
+
"path",
|
|
33
|
+
nargs="?",
|
|
34
|
+
default=None,
|
|
35
|
+
help="Input file path. Reads stdin if omitted.",
|
|
36
|
+
)
|
|
37
|
+
parser.add_argument(
|
|
38
|
+
"--mode",
|
|
39
|
+
choices=["tfidf", "keyword", "clean_text", "strip_think"],
|
|
40
|
+
default="tfidf",
|
|
41
|
+
help="Summarization mode (default: tfidf).",
|
|
42
|
+
)
|
|
43
|
+
parser.add_argument(
|
|
44
|
+
"--max-chars",
|
|
45
|
+
type=int,
|
|
46
|
+
default=500,
|
|
47
|
+
help="Character budget for tfidf mode (default: 500).",
|
|
48
|
+
)
|
|
49
|
+
parser.add_argument(
|
|
50
|
+
"--keywords",
|
|
51
|
+
default="",
|
|
52
|
+
help="Space-separated keywords for keyword mode.",
|
|
53
|
+
)
|
|
54
|
+
parser.add_argument(
|
|
55
|
+
"--top",
|
|
56
|
+
type=int,
|
|
57
|
+
default=10,
|
|
58
|
+
help="Number of sentences to return in keyword mode (default: 10).",
|
|
59
|
+
)
|
|
60
|
+
args = parser.parse_args(argv)
|
|
61
|
+
|
|
62
|
+
text = _read_input(args.path)
|
|
63
|
+
|
|
64
|
+
if args.mode == "tfidf":
|
|
65
|
+
output = summarize(text, max_length=args.max_chars).summary
|
|
66
|
+
elif args.mode == "keyword":
|
|
67
|
+
if not args.keywords:
|
|
68
|
+
parser.error("--mode keyword requires --keywords")
|
|
69
|
+
output = extract_keyword(text, args.keywords, num_sentences=args.top)
|
|
70
|
+
elif args.mode == "clean_text":
|
|
71
|
+
output = clean_text(text)
|
|
72
|
+
elif args.mode == "strip_think":
|
|
73
|
+
output = strip_think(text)
|
|
74
|
+
else: # pragma: no cover
|
|
75
|
+
parser.error(f"unknown mode: {args.mode}")
|
|
76
|
+
|
|
77
|
+
sys.stdout.write(output)
|
|
78
|
+
if not output.endswith("\n"):
|
|
79
|
+
sys.stdout.write("\n")
|
|
80
|
+
return 0
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
if __name__ == "__main__":
|
|
84
|
+
sys.exit(main())
|