lede 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lede/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ """lede — deterministic extractive summarization.
2
+
3
+ Public API:
4
+ summarize(text, max_length, mode='default') -> SummaryResult
5
+ brief(text, *, overview_max, max_facts, include_phrases, format) -> str | dict
6
+ clean_text(text) -> str
7
+ strip_think(text) -> str
8
+ extract_keyword(text, keywords, num_sentences=10) -> str
9
+ set_default_backend(name) -> None # enrichment backend selector
10
+ """
11
+ from lede.clean import clean_text, strip_think
12
+ from lede.tfidf import summarize
13
+ from lede.brief import brief
14
+ from lede.keyword import extract_keyword
15
+ from lede._types import SummaryResult
16
+ from lede.extract._backends import set_default_backend
17
+
18
+ __version__ = "0.3.0"
19
+ __all__ = [
20
+ "summarize",
21
+ "brief",
22
+ "clean_text",
23
+ "strip_think",
24
+ "extract_keyword",
25
+ "SummaryResult",
26
+ "set_default_backend",
27
+ "__version__",
28
+ ]
lede/_headings.py ADDED
@@ -0,0 +1,111 @@
1
+ """Shared heading detection.
2
+
3
+ Any sentence matching any of these patterns is considered a heading and is
4
+ dropped from candidate selection in mode='default'. Also used by
5
+ extract.outline to discover section names.
6
+ """
7
+ import re
8
+
9
+ # Markdown ATX-style heading: one or more # followed by space and text
10
+ _MD_HEADING_RE = re.compile(r"^\s*#+\s+.+$")
11
+
12
+ # ALL-CAPS short line: 4-81 chars of A-Z/whitespace plus an optional trailing colon, no lowercase.
13
+ # Upper bound extended from 28 -> 80 in T13b so long document-title
14
+ # conventions like "SUPREME COURT OF THE UNITED STATES" (34 chars) match.
15
+ _ALLCAPS_RE = re.compile(r"^\s*[A-Z][A-Z\s]{3,80}:?\s*$")
16
+
17
+ # Short label ending in colon (<=30 chars before the colon)
18
+ _SHORT_LABEL_RE = re.compile(r"^\s*.{1,30}:\s*$")
19
+
20
+ # Bare title-case single-line heading (T13b Class A): starts uppercase,
21
+ # second char alphanumeric (not space/punct), up to ~60 chars total, no
22
+ # internal special chars, no terminal punctuation. Matches bare lines like
23
+ # "Abstract", "Introduction", "Modern methods", "Privacy Policy".
24
+ #
25
+ # The "no terminal punctuation" constraint is load-bearing: it excludes short
26
+ # body sentences like "Costs declined." that ended in a period, which was the
27
+ # T6 false-positive class we dropped the < 4 content-tokens fallback for.
28
+ _BARE_TITLE_RE = re.compile(r"^\s*[A-Z][A-Za-z0-9][A-Za-z0-9 ]{0,58}$")
29
+
30
+ # Numbered-section-prefix heading (T13b Class B): "\d+. Section Name" on its
31
+ # own line. Matches "1. Information We Collect" etc. The numeric prefix is
32
+ # stripped by `heading_name()` so the emitted name matches gold.
33
+ _NUMBERED_SECTION_RE = re.compile(r"^\s*\d+\.\s+[A-Z][A-Za-z0-9 ]{0,58}$")
34
+
35
+ # Title-with-dash heading (T13d): document title line of the form
36
+ # "Title — Metadata" (em-dash U+2014, en-dash U+2013, or hyphen-minus), e.g.
37
+ # "Privacy Policy — Effective Date: 2026-01-01". Authorized by the labeling
38
+ # protocol's Edge-case conventions (docs/extraction-gold-labeling.md).
39
+ # Requires a space on both sides of the dash so hyphenated single words
40
+ # ("state-of-the-art") don't match.
41
+ #
42
+ # The trailing character class `[^.!?]*$` (no terminal `.`/`!`/`?`) mirrors
43
+ # the `_BARE_TITLE_RE` safety constraint — excludes body sentences like
44
+ # "Main concern is pricing — our $50K quote is 40% above their Q2 budget."
45
+ # that would otherwise match. The dash-metadata suffix is stripped by
46
+ # `heading_name()`.
47
+ _TITLE_WITH_DASH_RE = re.compile(r"^\s*[A-Z][A-Za-z0-9 ]{0,58}\s+[—–\-]\s+\S[^.!?]*$")
48
+
49
+
50
+ # Markdown heading depth helper — stripped from `## Section` etc.
51
+ _MD_DEPTH_RE = re.compile(r"^\s*(#+)\s+")
52
+
53
+
54
def is_structural_heading(sentence: str) -> bool:
    """Report whether `sentence` matches one of the explicit heading regexes.

    Unlike `is_heading`, this does not apply the "fewer than 4 content
    tokens" heuristic, so short body sentences are not reported as
    headings. Blank or whitespace-only input is never a heading.
    """
    if not sentence.strip():
        return False
    # Checked in the same order as the original if-chain; any hit is enough.
    structural_patterns = (
        _MD_HEADING_RE,
        _ALLCAPS_RE,
        _SHORT_LABEL_RE,
        _BARE_TITLE_RE,
        _NUMBERED_SECTION_RE,
        _TITLE_WITH_DASH_RE,
    )
    return any(pattern.match(sentence) for pattern in structural_patterns)
77
+
78
+
79
def md_depth(heading_line: str) -> int:
    """Return the number of leading `#` characters of a markdown heading.

    Lines that do not start with `#`s followed by whitespace count as depth 1.
    """
    hashes = _MD_DEPTH_RE.match(heading_line)
    return len(hashes.group(1)) if hashes else 1
85
+
86
+
87
def is_heading(sentence: str) -> bool:
    """Report whether `sentence` is a heading, structural or heuristic.

    A non-blank sentence is a heading when it matches a structural pattern
    or contains fewer than 4 runs of three-or-more ASCII letters (a rough
    "title-like" filter). Blank input is never a heading.
    """
    if not sentence.strip():
        return False
    content_words = re.findall(r"[A-Za-z]{3,}", sentence)
    return is_structural_heading(sentence) or len(content_words) < 4
96
+
97
+
98
+ def heading_name(sentence: str) -> str | None:
99
+ """Extract the name portion of a heading, or None if not a heading."""
100
+ s = sentence.strip()
101
+ if not s:
102
+ return None
103
+ # Strip markdown #'s, numeric-section prefix, em-dash metadata suffix,
104
+ # and trailing colon.
105
+ s = re.sub(r"^#+\s+", "", s)
106
+ s = re.sub(r"^\d+\.\s+", "", s)
107
+ s = re.sub(r"\s+[—–\-]\s+.*$", "", s) # T13d: strip em-dash metadata
108
+ s = s.rstrip(":").strip()
109
+ if not s:
110
+ return None
111
+ return s
lede/_parity.py ADDED
@@ -0,0 +1,73 @@
1
+ """Canonical text formatters for v0.2 extract primitives — used by the
2
+ cross-runtime parity walker (`rust/tests/fixtures.rs`) and the Python
3
+ generator (`benchmarks/gen_parity_fixtures.py`).
4
+
5
+ Each formatter takes the primitive's structured output and emits a
6
+ deterministic, line-oriented text representation. Python and Rust
7
+ have byte-identical implementations so a fixture generated from
8
+ Python output can be byte-compared against the Rust output for the
9
+ same input.
10
+
11
+ Format choices:
12
+ - One record per line.
13
+ - Fields within a record separated by ` | ` (space-pipe-space).
14
+ - Trailing newline at end of file (mirror of how Python's `\\n`.join
15
+ + final write_text behaves).
16
+ - Empty-output → empty file (no trailing newline either).
17
+
18
+ Mirrors: rust/src/_parity.rs.
19
+ """
20
+ from __future__ import annotations
21
+
22
+ from ._types import Metadata, PhraseFact, Section, Stat
23
+
24
+
25
+ def _join_lines(lines: list[str]) -> str:
26
+ if not lines:
27
+ return ""
28
+ return "\n".join(lines) + "\n"
29
+
30
+
31
def format_stats(stats: tuple[Stat, ...]) -> str:
    """Render each stat as one line of five ` | `-separated fields.

    Field order: stat_type, value, unit, phrase, context_sentence.
    """
    return _join_lines([
        f"{stat.stat_type} | {stat.value} | {stat.unit} | {stat.phrase} | {stat.context_sentence}"
        for stat in stats
    ])
37
+
38
+
39
def format_outline(outline: tuple[Section, ...]) -> str:
    """Render each section as one `depth | name | representative_sentence` line."""
    return _join_lines([
        f"{section.depth} | {section.name} | {section.representative_sentence}"
        for section in outline
    ])
45
+
46
+
47
def format_toc(toc: tuple[str, ...]) -> str:
    """Render the table of contents with one entry per line."""
    entries = [*toc]
    return _join_lines(entries)
49
+
50
+
51
def format_metadata(md: Metadata) -> str:
    """Render metadata as four labelled lines, values comma-separated.

    All four lines (DATES, AMOUNTS, URLS, ENTITIES) are always emitted, in
    that order, even when a field is empty.
    """
    labelled_fields = (
        ("DATES: ", md.dates),
        ("AMOUNTS: ", md.amounts),
        ("URLS: ", md.urls),
        ("ENTITIES: ", md.entities),
    )
    return _join_lines([label + ", ".join(values) for label, values in labelled_fields])
58
+
59
+
60
def format_phrases(phrases: tuple[str, ...]) -> str:
    """Render the phrases with one phrase per line."""
    entries = [*phrases]
    return _join_lines(entries)
62
+
63
+
64
def format_key_facts(facts: tuple[str, ...]) -> str:
    """Render the key facts with one fact per line."""
    entries = [*facts]
    return _join_lines(entries)
66
+
67
+
68
def format_correlate_facts(facts: tuple[PhraseFact, ...]) -> str:
    """Render each fact as one `entity | number | polarity | sentence` line."""
    return _join_lines([
        f"{fact.entity} | {fact.number} | {fact.polarity} | {fact.sentence}"
        for fact in facts
    ])
lede/_types.py ADDED
@@ -0,0 +1,47 @@
1
+ """v0.2.0 return types (frozen dataclasses)."""
2
+ from dataclasses import dataclass
3
+
4
+
5
@dataclass(frozen=True)
class Stat:
    """Immutable record describing one extracted statistic.

    Field order is part of the positional constructor signature; do not
    reorder.
    """

    value: str
    unit: str
    phrase: str
    # NOTE(review): presumably the source sentence the stat was found in — confirm.
    context_sentence: str
    stat_type: str  # "money" | "percent" | "count" | "date" | "duration"
12
+
13
+
14
@dataclass(frozen=True)
class Section:
    """Immutable record for one outline entry.

    Field order is part of the positional constructor signature; do not
    reorder.
    """

    depth: int
    name: str
    representative_sentence: str
19
+
20
+
21
@dataclass(frozen=True)
class Metadata:
    """Immutable bundle of extracted metadata; every field defaults to ``()``."""

    dates: tuple[str, ...] = ()
    amounts: tuple[str, ...] = ()
    urls: tuple[str, ...] = ()
    entities: tuple[str, ...] = ()  # populated only when lede[ner] installed (Task 9)
27
+
28
+
29
@dataclass(frozen=True)
class PhraseFact:
    """Immutable record linking an entity to a number within a sentence.

    Field order is part of the positional constructor signature; do not
    reorder.
    """

    entity: str
    number: str
    polarity: str  # "absolute" | "growth" | "decline" | "unknown"
    sentence: str
35
+
36
+
37
@dataclass(frozen=True)
class SummaryResult:
    """Immutable summarization result.

    Only ``summary`` is required; every other field defaults to ``None``.
    ``str(result)`` yields the summary text.
    """

    summary: str
    # NOTE(review): None presumably means "not computed" for the fields below — confirm.
    stats: tuple[Stat, ...] | None = None
    outline: tuple[Section, ...] | None = None
    metadata: Metadata | None = None
    phrases: tuple[str, ...] | None = None
    correlated_facts: tuple[PhraseFact, ...] | None = None

    def __str__(self) -> str:
        """Return the summary text so the result can be used like a string."""
        return self.summary
lede/brief.py ADDED
@@ -0,0 +1,157 @@
1
+ """At-a-glance document brief — composes summarize + key_facts + toc.
2
+
3
+ Top-level convenience primitive for "reader brief" use cases: email
4
+ digests, file-browser previews, ingest-pipeline pre-summarization.
5
+ Agnostic of document type — no per-doc heuristics. Callers who want
6
+ different composition should call the underlying primitives directly.
7
+
8
+ Mirrors rust/src/brief.rs. Keep the three output formats (string,
9
+ markdown, dict) byte-identical across Python and Rust for the regex
10
+ backend.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ from .tfidf import summarize
15
+ from .extract.key_facts import key_facts
16
+ from .extract.outline import toc
17
+ from .extract.phrases import phrases
18
+
19
+ # Auto-detect the wordforms extra at import time. When available,
20
+ # brief() forwards convert_word_names=True to its internal key_facts()
21
+ # call so spelled-out numbers ("five thousand documents", "twelve lines")
22
+ # surface in the key facts section. Mirrors the same pattern used in
23
+ # benchmarks/extraction_eval.py::_STATS_WORDFORMS.
24
+ try:
25
+ import text_to_num # noqa: F401
26
+
27
+ _HAS_WORDFORMS = True
28
+ except ImportError:
29
+ _HAS_WORDFORMS = False
30
+
31
+
32
+ # Overview budget clamps. Floor keeps very short inputs readable; ceiling
33
+ # keeps very long inputs actually brief (a 30 KB doc at 0.35 would produce
34
+ # a 10 KB "overview" which defeats the purpose).
35
+ _OVERVIEW_MIN_FRAC = 0.05
36
+ _OVERVIEW_MAX_FRAC = 0.50
37
+ _OVERVIEW_MIN_CHARS = 250
38
+ _OVERVIEW_MAX_CHARS = 1500
39
+
40
+
41
+ def brief(
42
+ text: str,
43
+ *,
44
+ overview_max: float = 0.35,
45
+ max_facts: int = 10,
46
+ include_phrases: bool = False,
47
+ convert_word_names: bool | None = None,
48
+ format: str = "string",
49
+ ) -> str | dict:
50
+ """Produce an at-a-glance brief of a document.
51
+
52
+ Composes ``summarize()`` (overview) + ``extract.key_facts()`` +
53
+ ``extract.toc()`` into a single caller-friendly artifact. Optional
54
+ ``extract.phrases()`` via ``include_phrases=True``.
55
+
56
+ Args:
57
+ text: input document text.
58
+ overview_max: fraction of source length to budget for the overview.
59
+ Clamped to ``[0.05, 0.50]``. Default 0.35. The resolved
60
+ char budget is further clamped to ``[250, 1500]`` so short
61
+ docs still get a readable overview and long docs stay brief.
62
+ max_facts: cap on key-facts sentences. Default 10.
63
+ include_phrases: when True, append a key-phrases section (regex
64
+ backend). Default False.
65
+ convert_word_names: forward to ``key_facts()`` so spelled-out
66
+ numbers ("five thousand documents") surface in the
67
+ key-facts section. ``None`` (default) auto-detects whether
68
+ ``text2num`` is importable. Pass ``True`` / ``False``
69
+ explicitly to lock the behavior — useful when you need
70
+ output to match a Rust binary built with or without the
71
+ ``wordforms`` cargo feature regardless of which Python
72
+ extras happen to be installed.
73
+ format: output shape. One of:
74
+ - ``"string"`` (default) — plain text with section labels.
75
+ - ``"markdown"`` — ``##`` headers + bullet lists.
76
+ - ``"dict"`` — structured dict with overview/key_facts/toc/phrases.
77
+
78
+ Returns:
79
+ ``str`` for ``"string"`` / ``"markdown"`` formats, ``dict`` for
80
+ ``"dict"`` format.
81
+
82
+ Raises:
83
+ ValueError: when ``format`` is not one of the three supported values.
84
+
85
+ Notes on cross-runtime parity:
86
+ Python's auto-detect (``convert_word_names=None``) is at import
87
+ time; Rust's equivalent is the compile-time ``wordforms`` cargo
88
+ feature. The two can disagree silently when (a) the Python
89
+ process has ``text2num`` installed for an unrelated tool and
90
+ (b) the Rust binary was built without ``--features wordforms``
91
+ — same input, different brief bytes. Pass an explicit
92
+ ``convert_word_names=True`` (or ``False``) on both sides to
93
+ lock parity.
94
+ """
95
+ # Clamp overview_max to sane fraction bounds.
96
+ overview_max = max(_OVERVIEW_MIN_FRAC, min(_OVERVIEW_MAX_FRAC, overview_max))
97
+ budget = int(len(text) * overview_max)
98
+ budget = max(_OVERVIEW_MIN_CHARS, min(_OVERVIEW_MAX_CHARS, budget))
99
+
100
+ overview_result = summarize(text, max_length=budget)
101
+ overview_text = overview_result.summary.rstrip()
102
+
103
+ use_wordforms = _HAS_WORDFORMS if convert_word_names is None else convert_word_names
104
+ facts = key_facts(
105
+ text,
106
+ max_facts=max_facts,
107
+ convert_word_names=use_wordforms,
108
+ )
109
+
110
+ sections = toc(text)
111
+
112
+ phrases_list: tuple[str, ...] = phrases(text) if include_phrases else ()
113
+
114
+ if format == "dict":
115
+ return {
116
+ "overview": overview_text,
117
+ "key_facts": list(facts),
118
+ "toc": list(sections),
119
+ "phrases": list(phrases_list) if include_phrases else None,
120
+ }
121
+ if format == "markdown":
122
+ parts: list[str] = ["## Overview\n", overview_text, ""]
123
+ if facts:
124
+ parts.append("\n## Key facts\n")
125
+ for f in facts:
126
+ parts.append(f"- {f}")
127
+ parts.append("")
128
+ if sections:
129
+ parts.append("\n## Also in this doc\n")
130
+ for s in sections:
131
+ parts.append(f"- {s}")
132
+ parts.append("")
133
+ if include_phrases and phrases_list:
134
+ parts.append("\n## Key phrases\n")
135
+ parts.append(" · ".join(phrases_list))
136
+ parts.append("")
137
+ return "\n".join(parts)
138
+ if format == "string":
139
+ parts = ["Overview:", overview_text, ""]
140
+ if facts:
141
+ parts.append("Key facts:")
142
+ for f in facts:
143
+ parts.append(f" - {f}")
144
+ parts.append("")
145
+ if sections:
146
+ parts.append("Also in this doc:")
147
+ for s in sections:
148
+ parts.append(f" - {s}")
149
+ parts.append("")
150
+ if include_phrases and phrases_list:
151
+ parts.append("Key phrases:")
152
+ parts.append(" " + ", ".join(phrases_list))
153
+ parts.append("")
154
+ return "\n".join(parts).rstrip() + "\n"
155
+ raise ValueError(
156
+ f"format must be 'string', 'markdown', or 'dict'; got {format!r}"
157
+ )
lede/clean.py ADDED
@@ -0,0 +1,100 @@
1
+ """Text cleaners: clean_text (markdown + filler + boilerplate) and strip_think
2
+ (reasoning-model <think>...</think> blocks).
3
+
4
+ Both functions are deterministic and stdlib-only.
5
+ """
6
+ import re
7
+
8
# Non-greedy so each <think>...</think> pair is removed on its own; DOTALL
# lets a block span several lines. Whitespace right after the block goes too.
_THINK_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)


def strip_think(text: str) -> str:
    """Drop every <think>...</think> block and trim the result.

    ``None`` is tolerated and yields the empty string.
    """
    return "" if text is None else _THINK_RE.sub("", text).strip()
16
+
17
+
18
+ # --- clean_text — pipeline ---
19
+ #
20
+ # Steps in order:
21
+ # 1. Strip markdown (*, _, #, ---, bullets, numbered list prefixes)
22
+ # 2. Remove filler phrases
23
+ # 3. Remove filler words
24
+ # 4. Remove CRM boilerplate
25
+ # 5. Lowercase
26
+ # 6. Collapse whitespace and blank lines
27
+ # 7. Trim
28
+
29
+ _FILLER_PHRASES = re.compile(
30
+ r"\b(just wanted to|i just wanted to|wanted to follow up|as discussed"
31
+ r"|per our conversation|as mentioned|going forward|at the end of the day"
32
+ r"|in terms of|with respect to|in regards to|please find attached"
33
+ r"|hope this helps|let me know if you have any questions"
34
+ r"|looking forward to hearing from you)\b",
35
+ re.IGNORECASE,
36
+ )
37
+
38
+ _FILLER_WORDS = re.compile(
39
+ r"\b(basically|essentially|actually|literally|honestly|frankly"
40
+ r"|obviously|clearly|simply|really|very|quite|rather|pretty much"
41
+ r"|kind of|sort of|in order to|due to the fact that"
42
+ r"|at this point in time|for all intents and purposes)\b",
43
+ re.IGNORECASE,
44
+ )
45
+
46
+ _CRM_PATTERNS = [
47
+ re.compile(r"\bNo update[s]?\b\.?", re.IGNORECASE),
48
+ re.compile(r"\bCalendar invite sent[.]?\b", re.IGNORECASE),
49
+ re.compile(
50
+ r"\bSent (proposal|case study|documentation|overview|pricing)"
51
+ r" (documentation |via email|as requested)?\.?\b",
52
+ re.IGNORECASE,
53
+ ),
54
+ re.compile(r"\bWaiting (on|for) callback\.?\b", re.IGNORECASE),
55
+ re.compile(r"\bUpdated CRM with latest info\.?\b", re.IGNORECASE),
56
+ re.compile(r"\bMeeting confirmed for next week\.?\b", re.IGNORECASE),
57
+ re.compile(r"\bFollowing standard sales process\.?\b", re.IGNORECASE),
58
+ re.compile(r"\bMeeting went as expected\.?\b", re.IGNORECASE),
59
+ ]
60
+
61
+
62
+ def clean_text(text: str | None) -> str:
63
+ """Strip markdown, filler phrases, filler words, and CRM boilerplate;
64
+ lowercase and normalize whitespace.
65
+
66
+ Returns the empty string for None or empty input.
67
+ """
68
+ if not text:
69
+ return ""
70
+
71
+ result = text
72
+
73
+ # 1. Markdown formatting
74
+ result = re.sub(r"\*{1,3}", "", result)
75
+ result = re.sub(r"_{1,3}", "", result)
76
+ result = re.sub(r"^#{1,6}\s*", "", result, flags=re.MULTILINE)
77
+ result = re.sub(r"^-{3,}$", "", result, flags=re.MULTILINE)
78
+ result = re.sub(r"^\s*[-*+]\s+", "", result, flags=re.MULTILINE)
79
+ result = re.sub(r"^\s*\d+\.\s+", "", result, flags=re.MULTILINE)
80
+
81
+ # 2-3. Filler
82
+ result = _FILLER_PHRASES.sub("", result)
83
+ result = _FILLER_WORDS.sub("", result)
84
+
85
+ # 4. CRM boilerplate
86
+ for pattern in _CRM_PATTERNS:
87
+ result = pattern.sub("", result)
88
+
89
+ # 5. Lowercase
90
+ result = result.lower()
91
+
92
+ # 6. Whitespace normalization
93
+ result = re.sub(r"[ \t]+", " ", result)
94
+ result = re.sub(r"\n\s*\n+", "\n", result)
95
+ result = re.sub(r"^\s+", "", result, flags=re.MULTILINE)
96
+ result = re.sub(r"\s+$", "", result, flags=re.MULTILINE)
97
+ result = re.sub(r"^\s*$\n?", "", result, flags=re.MULTILINE)
98
+
99
+ # 7. Trim
100
+ return result.strip()
lede/cli.py ADDED
@@ -0,0 +1,84 @@
1
+ """lede CLI.
2
+
3
+ Usage:
4
+ lede [FILE] --mode {tfidf,keyword,clean_text,strip_think} [OPTIONS]
5
+
6
+ Reads FILE or stdin, writes summary to stdout.
7
+ """
8
+ import argparse
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ from lede import summarize, clean_text, strip_think, extract_keyword
13
+
14
+
15
+ def _read_input(path: str | None) -> str:
16
+ # Force UTF-8 — relying on locale gives Windows users mojibake on
17
+ # non-ASCII content and breaks the byte-identical-runtime claim
18
+ # (the Rust CLI reads files as bytes and decodes UTF-8 explicitly).
19
+ if path and path != "-":
20
+ return Path(path).read_text(encoding="utf-8")
21
+ if hasattr(sys.stdin, "buffer"):
22
+ return sys.stdin.buffer.read().decode("utf-8")
23
+ return sys.stdin.read()
24
+
25
+
26
+ def main(argv: list[str] | None = None) -> int:
27
+ parser = argparse.ArgumentParser(
28
+ prog="lede",
29
+ description="Deterministic extractive summarization.",
30
+ )
31
+ parser.add_argument(
32
+ "path",
33
+ nargs="?",
34
+ default=None,
35
+ help="Input file path. Reads stdin if omitted.",
36
+ )
37
+ parser.add_argument(
38
+ "--mode",
39
+ choices=["tfidf", "keyword", "clean_text", "strip_think"],
40
+ default="tfidf",
41
+ help="Summarization mode (default: tfidf).",
42
+ )
43
+ parser.add_argument(
44
+ "--max-chars",
45
+ type=int,
46
+ default=500,
47
+ help="Character budget for tfidf mode (default: 500).",
48
+ )
49
+ parser.add_argument(
50
+ "--keywords",
51
+ default="",
52
+ help="Space-separated keywords for keyword mode.",
53
+ )
54
+ parser.add_argument(
55
+ "--top",
56
+ type=int,
57
+ default=10,
58
+ help="Number of sentences to return in keyword mode (default: 10).",
59
+ )
60
+ args = parser.parse_args(argv)
61
+
62
+ text = _read_input(args.path)
63
+
64
+ if args.mode == "tfidf":
65
+ output = summarize(text, max_length=args.max_chars).summary
66
+ elif args.mode == "keyword":
67
+ if not args.keywords:
68
+ parser.error("--mode keyword requires --keywords")
69
+ output = extract_keyword(text, args.keywords, num_sentences=args.top)
70
+ elif args.mode == "clean_text":
71
+ output = clean_text(text)
72
+ elif args.mode == "strip_think":
73
+ output = strip_think(text)
74
+ else: # pragma: no cover
75
+ parser.error(f"unknown mode: {args.mode}")
76
+
77
+ sys.stdout.write(output)
78
+ if not output.endswith("\n"):
79
+ sys.stdout.write("\n")
80
+ return 0
81
+
82
+
83
+ if __name__ == "__main__":
84
+ sys.exit(main())