rapid-textrank 0.0.1__cp314-cp314-macosx_10_12_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rapid_textrank/__init__.py +53 -0
- rapid_textrank/_rust.cpython-314-darwin.so +0 -0
- rapid_textrank/spacy_component.py +239 -0
- rapid_textrank-0.0.1.dist-info/METADATA +587 -0
- rapid_textrank-0.0.1.dist-info/RECORD +7 -0
- rapid_textrank-0.0.1.dist-info/WHEEL +4 -0
- rapid_textrank-0.0.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""
|
|
2
|
+
rapid_textrank - High-performance TextRank implementation
|
|
3
|
+
|
|
4
|
+
A fast TextRank implementation in Rust with Python bindings,
|
|
5
|
+
providing keyword extraction and text summarization.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from rapid_textrank._rust import (
|
|
9
|
+
__version__,
|
|
10
|
+
Phrase,
|
|
11
|
+
TextRankResult,
|
|
12
|
+
TextRankConfig,
|
|
13
|
+
BaseTextRank,
|
|
14
|
+
PositionRank,
|
|
15
|
+
BiasedTextRank,
|
|
16
|
+
extract_from_json,
|
|
17
|
+
extract_batch_from_json,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"__version__",
|
|
22
|
+
"Phrase",
|
|
23
|
+
"TextRankResult",
|
|
24
|
+
"TextRankConfig",
|
|
25
|
+
"BaseTextRank",
|
|
26
|
+
"PositionRank",
|
|
27
|
+
"BiasedTextRank",
|
|
28
|
+
"extract_from_json",
|
|
29
|
+
"extract_batch_from_json",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def extract_keywords(text: str, top_n: int = 10, language: str = "en") -> list:
    """Run TextRank over *text* and return the highest-ranked keyphrases.

    Convenience wrapper around :class:`BaseTextRank` for the common
    one-shot extraction case.

    Args:
        text: Raw input text to analyse.
        top_n: Maximum number of keyphrases to return.
        language: Language code used to select the stopword list.

    Returns:
        List of Phrase objects with text, score, and rank.

    Example:
        >>> from rapid_textrank import extract_keywords
        >>> phrases = extract_keywords("Machine learning is a subset of AI.")
        >>> for phrase in phrases:
        ...     print(f"{phrase.text}: {phrase.score:.4f}")
    """
    ranker = BaseTextRank(top_n=top_n, language=language)
    return list(ranker.extract_keywords(text).phrases)
|
|
Binary file
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""
|
|
2
|
+
spaCy pipeline component for rapid_textrank.
|
|
3
|
+
|
|
4
|
+
This module provides a spaCy pipeline component that uses rapid_textrank
|
|
5
|
+
for keyword extraction. It can be used as a drop-in replacement for
|
|
6
|
+
pytextrank with significantly better performance.
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
>>> import spacy
|
|
10
|
+
>>> from rapid_textrank.spacy_component import RustTextRank
|
|
11
|
+
>>>
|
|
12
|
+
>>> nlp = spacy.load("en_core_web_sm")
|
|
13
|
+
>>> nlp.add_pipe("rapid_textrank")
|
|
14
|
+
>>>
|
|
15
|
+
>>> doc = nlp("Machine learning is a subset of artificial intelligence.")
|
|
16
|
+
>>> for phrase in doc._.phrases[:5]:
|
|
17
|
+
... print(f"{phrase.text}: {phrase.score:.4f}")
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from typing import List, Optional, Dict, Any
|
|
21
|
+
import json
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
from spacy.tokens import Doc
|
|
25
|
+
from spacy.language import Language
|
|
26
|
+
|
|
27
|
+
SPACY_AVAILABLE = True
|
|
28
|
+
except ImportError:
|
|
29
|
+
SPACY_AVAILABLE = False
|
|
30
|
+
Doc = None
|
|
31
|
+
Language = None
|
|
32
|
+
|
|
33
|
+
from rapid_textrank._rust import extract_from_json
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Phrase:
    """A single keyphrase produced by RustTextRank.

    Mirrors pytextrank's ``Phrase`` interface so this class can serve as
    a drop-in replacement in code written against pytextrank.
    """

    def __init__(self, text: str, lemma: str, score: float, count: int, rank: int):
        # Core attributes reported by the Rust extractor.
        self.rank = rank
        self.count = count
        self.score = score
        self.lemma = lemma
        self.text = text
        # pytextrank exposes per-phrase token chunks; we keep an empty
        # list so callers expecting the attribute do not break.
        self.chunks = []

    def __repr__(self) -> str:
        """Debug representation showing text, score and rank."""
        return "Phrase(text='{}', score={:.4f}, rank={})".format(
            self.text, self.score, self.rank
        )

    def __str__(self) -> str:
        """The plain phrase text."""
        return self.text
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class RustTextRankResult:
    """Bundle of extracted phrases plus PageRank convergence diagnostics."""

    def __init__(self, phrases: List[Phrase], converged: bool, iterations: int):
        # Ranked phrases and how the PageRank iteration terminated.
        self.phrases, self.converged, self.iterations = phrases, converged, iterations

    def __len__(self) -> int:
        """Number of extracted phrases."""
        return len(self.phrases)

    def __iter__(self):
        """Iterate over the phrases in rank order."""
        return iter(self.phrases)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
if SPACY_AVAILABLE:

    @Language.factory(
        "rapid_textrank",
        default_config={
            "damping": 0.85,
            "max_iterations": 100,
            "convergence_threshold": 1e-6,
            "window_size": 4,
            "top_n": 10,
            "min_phrase_length": 1,
            "max_phrase_length": 4,
            "score_aggregation": "sum",
        },
    )
    def create_rapid_textrank(
        nlp: Language,
        name: str,
        damping: float,
        max_iterations: int,
        convergence_threshold: float,
        window_size: int,
        top_n: int,
        min_phrase_length: int,
        max_phrase_length: int,
        score_aggregation: str,
    ):
        """Factory registered with spaCy; builds a RustTextRank component.

        spaCy fills the keyword parameters from ``default_config`` merged
        with any user-supplied overrides passed to ``nlp.add_pipe``.
        """
        settings = {
            "damping": damping,
            "max_iterations": max_iterations,
            "convergence_threshold": convergence_threshold,
            "window_size": window_size,
            "top_n": top_n,
            "min_phrase_length": min_phrase_length,
            "max_phrase_length": max_phrase_length,
            "score_aggregation": score_aggregation,
        }
        return RustTextRank(nlp=nlp, name=name, **settings)
|
|
114
|
+
|
|
115
|
+
class RustTextRank:
|
|
116
|
+
"""
|
|
117
|
+
spaCy pipeline component for TextRank keyword extraction.
|
|
118
|
+
|
|
119
|
+
This component uses the Rust implementation for fast extraction
|
|
120
|
+
while integrating seamlessly with spaCy's NLP pipeline.
|
|
121
|
+
|
|
122
|
+
Example:
|
|
123
|
+
>>> import spacy
|
|
124
|
+
>>> nlp = spacy.load("en_core_web_sm")
|
|
125
|
+
>>> nlp.add_pipe("rapid_textrank")
|
|
126
|
+
>>> doc = nlp("Machine learning is transforming industries.")
|
|
127
|
+
>>> for phrase in doc._.phrases:
|
|
128
|
+
... print(phrase.text, phrase.score)
|
|
129
|
+
"""
|
|
130
|
+
|
|
131
|
+
def __init__(
|
|
132
|
+
self,
|
|
133
|
+
nlp: Language,
|
|
134
|
+
name: str = "rapid_textrank",
|
|
135
|
+
damping: float = 0.85,
|
|
136
|
+
max_iterations: int = 100,
|
|
137
|
+
convergence_threshold: float = 1e-6,
|
|
138
|
+
window_size: int = 4,
|
|
139
|
+
top_n: int = 10,
|
|
140
|
+
min_phrase_length: int = 1,
|
|
141
|
+
max_phrase_length: int = 4,
|
|
142
|
+
score_aggregation: str = "sum",
|
|
143
|
+
):
|
|
144
|
+
self.nlp = nlp
|
|
145
|
+
self.name = name
|
|
146
|
+
self.config = {
|
|
147
|
+
"damping": damping,
|
|
148
|
+
"max_iterations": max_iterations,
|
|
149
|
+
"convergence_threshold": convergence_threshold,
|
|
150
|
+
"window_size": window_size,
|
|
151
|
+
"top_n": top_n,
|
|
152
|
+
"min_phrase_length": min_phrase_length,
|
|
153
|
+
"max_phrase_length": max_phrase_length,
|
|
154
|
+
"score_aggregation": score_aggregation,
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
# Register custom extensions
|
|
158
|
+
if not Doc.has_extension("phrases"):
|
|
159
|
+
Doc.set_extension("phrases", default=[])
|
|
160
|
+
if not Doc.has_extension("textrank_result"):
|
|
161
|
+
Doc.set_extension("textrank_result", default=None)
|
|
162
|
+
|
|
163
|
+
def __call__(self, doc: Doc) -> Doc:
|
|
164
|
+
"""Process a spaCy Doc and extract keyphrases."""
|
|
165
|
+
# Convert spaCy tokens to JSON format
|
|
166
|
+
tokens = []
|
|
167
|
+
for sent_idx, sent in enumerate(doc.sents):
|
|
168
|
+
for token in sent:
|
|
169
|
+
tokens.append(
|
|
170
|
+
{
|
|
171
|
+
"text": token.text,
|
|
172
|
+
"lemma": token.lemma_,
|
|
173
|
+
"pos": token.pos_,
|
|
174
|
+
"start": token.idx,
|
|
175
|
+
"end": token.idx + len(token.text),
|
|
176
|
+
"sentence_idx": sent_idx,
|
|
177
|
+
"token_idx": token.i,
|
|
178
|
+
"is_stopword": token.is_stop,
|
|
179
|
+
}
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
# Create JSON input
|
|
183
|
+
json_input = json.dumps({"tokens": tokens, "config": self.config})
|
|
184
|
+
|
|
185
|
+
# Extract keyphrases using Rust
|
|
186
|
+
json_output = extract_from_json(json_input)
|
|
187
|
+
result = json.loads(json_output)
|
|
188
|
+
|
|
189
|
+
# Convert to Phrase objects
|
|
190
|
+
phrases = [
|
|
191
|
+
Phrase(
|
|
192
|
+
text=p["text"],
|
|
193
|
+
lemma=p["lemma"],
|
|
194
|
+
score=p["score"],
|
|
195
|
+
count=p["count"],
|
|
196
|
+
rank=p["rank"],
|
|
197
|
+
)
|
|
198
|
+
for p in result["phrases"]
|
|
199
|
+
]
|
|
200
|
+
|
|
201
|
+
# Store results
|
|
202
|
+
doc._.phrases = phrases
|
|
203
|
+
doc._.textrank_result = RustTextRankResult(
|
|
204
|
+
phrases=phrases,
|
|
205
|
+
converged=result["converged"],
|
|
206
|
+
iterations=result["iterations"],
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
return doc
|
|
210
|
+
|
|
211
|
+
def to_disk(self, path, **kwargs):
|
|
212
|
+
"""Save component configuration to disk."""
|
|
213
|
+
import json
|
|
214
|
+
from pathlib import Path
|
|
215
|
+
|
|
216
|
+
config_path = Path(path) / "config.json"
|
|
217
|
+
with open(config_path, "w") as f:
|
|
218
|
+
json.dump(self.config, f)
|
|
219
|
+
|
|
220
|
+
def from_disk(self, path, **kwargs):
|
|
221
|
+
"""Load component configuration from disk."""
|
|
222
|
+
import json
|
|
223
|
+
from pathlib import Path
|
|
224
|
+
|
|
225
|
+
config_path = Path(path) / "config.json"
|
|
226
|
+
with open(config_path, "r") as f:
|
|
227
|
+
self.config = json.load(f)
|
|
228
|
+
return self
|
|
229
|
+
|
|
230
|
+
else:
|
|
231
|
+
# Fallback when spaCy is not available
|
|
232
|
+
class RustTextRank:
|
|
233
|
+
"""Placeholder when spaCy is not installed."""
|
|
234
|
+
|
|
235
|
+
def __init__(self, *args, **kwargs):
|
|
236
|
+
raise ImportError(
|
|
237
|
+
"spaCy is required for the RustTextRank pipeline component. "
|
|
238
|
+
"Install it with: pip install spacy"
|
|
239
|
+
)
|
|
@@ -0,0 +1,587 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rapid_textrank
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Classifier: Development Status :: 4 - Beta
|
|
5
|
+
Classifier: Intended Audience :: Developers
|
|
6
|
+
Classifier: Intended Audience :: Science/Research
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Rust
|
|
14
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
15
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
16
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
19
|
+
Requires-Dist: pytest>=7.0 ; extra == 'dev'
|
|
20
|
+
Requires-Dist: pytest-benchmark ; extra == 'dev'
|
|
21
|
+
Requires-Dist: spacy>=3.0 ; extra == 'dev'
|
|
22
|
+
Requires-Dist: spacy>=3.0 ; extra == 'spacy'
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Provides-Extra: spacy
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Summary: High-performance TextRank implementation with Python bindings
|
|
27
|
+
Keywords: nlp,textrank,keyword-extraction,summarization,pagerank
|
|
28
|
+
Author: TextRanker Contributors
|
|
29
|
+
License-Expression: MIT
|
|
30
|
+
Requires-Python: >=3.9
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
Project-URL: Homepage, https://github.com/xang1234/rapid-textrank
|
|
33
|
+
Project-URL: Issues, https://github.com/xang1234/rapid-textrank/issues
|
|
34
|
+
Project-URL: Repository, https://github.com/xang1234/rapid-textrank
|
|
35
|
+
|
|
36
|
+
# rapid_textrank
|
|
37
|
+
|
|
38
|
+
[](https://opensource.org/licenses/MIT)
|
|
39
|
+
[](https://www.python.org/downloads/)
|
|
40
|
+
[](https://www.rust-lang.org/)
|
|
41
|
+
|
|
42
|
+
**High-performance TextRank implementation in Rust with Python bindings.**
|
|
43
|
+
|
|
44
|
+
Extract keywords and key phrases from text 10-100x faster than pure Python implementations, with support for multiple algorithm variants and 18 languages.
|
|
45
|
+
|
|
46
|
+
## Features
|
|
47
|
+
|
|
48
|
+
- **Fast**: 10-100x faster than pure Python implementations
|
|
49
|
+
- **Multiple algorithms**: TextRank, PositionRank, and BiasedTextRank variants
|
|
50
|
+
- **Unicode-aware**: Proper handling of CJK, emoji, and other scripts
|
|
51
|
+
- **Multi-language**: Stopword support for 18 languages
|
|
52
|
+
- **Dual API**: Native Python classes + JSON interface for batch processing
|
|
53
|
+
- **Zero Python overhead**: Computation happens entirely in Rust (no GIL)
|
|
54
|
+
|
|
55
|
+
## Quick Start
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install rapid_textrank
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from rapid_textrank import extract_keywords
|
|
63
|
+
|
|
64
|
+
text = """
|
|
65
|
+
Machine learning is a subset of artificial intelligence that enables
|
|
66
|
+
systems to learn and improve from experience. Deep learning, a type of
|
|
67
|
+
machine learning, uses neural networks with many layers.
|
|
68
|
+
"""
|
|
69
|
+
|
|
70
|
+
keywords = extract_keywords(text, top_n=5, language="en")
|
|
71
|
+
for phrase in keywords:
|
|
72
|
+
print(f"{phrase.text}: {phrase.score:.4f}")
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Output:
|
|
76
|
+
```
|
|
77
|
+
machine learning: 0.2341
|
|
78
|
+
deep learning: 0.1872
|
|
79
|
+
artificial intelligence: 0.1654
|
|
80
|
+
neural networks: 0.1432
|
|
81
|
+
systems: 0.0891
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## How TextRank Works
|
|
85
|
+
|
|
86
|
+
TextRank is a graph-based ranking algorithm for keyword extraction, inspired by Google's PageRank.
|
|
87
|
+
|
|
88
|
+
### The Algorithm
|
|
89
|
+
|
|
90
|
+
1. **Build a co-occurrence graph**: Words become nodes. An edge connects two words if they appear within a sliding window (default: 4 words).
|
|
91
|
+
|
|
92
|
+
2. **Run PageRank**: The algorithm iteratively distributes "importance" through the graph. Words connected to many important words become important themselves.
|
|
93
|
+
|
|
94
|
+
3. **Extract phrases**: Adjacent high-scoring words are combined into key phrases. Scores are aggregated (sum, mean, or max).
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
Text: "Machine learning enables systems to learn from data"
|
|
98
|
+
|
|
99
|
+
Co-occurrence graph (window=2):
|
|
100
|
+
machine ←→ learning ←→ enables ←→ systems ←→ learn ←→ data
|
|
101
|
+
↓
|
|
102
|
+
PageRank
|
|
103
|
+
↓
|
|
104
|
+
Scores: machine(0.23) learning(0.31) enables(0.12) ...
|
|
105
|
+
↓
|
|
106
|
+
Phrase extraction
|
|
107
|
+
↓
|
|
108
|
+
"machine learning" (0.54), "systems" (0.18), ...
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Further Reading
|
|
112
|
+
|
|
113
|
+
- [TextRank: Bringing Order into Texts](https://aclanthology.org/W04-3252/) (Mihalcea & Tarau, 2004)
|
|
114
|
+
- [PositionRank: An Unsupervised Approach to Keyphrase Extraction](https://aclanthology.org/P17-1102/) (Florescu & Caragea, 2017)
|
|
115
|
+
- [BiasedTextRank: Unsupervised Graph-Based Content Extraction](https://aclanthology.org/2020.coling-main.144/) (Kazemi et al., 2020)
|
|
116
|
+
|
|
117
|
+
## Algorithm Variants
|
|
118
|
+
|
|
119
|
+
| Variant | Best For | Description |
|
|
120
|
+
|---------|----------|-------------|
|
|
121
|
+
| `BaseTextRank` | General text | Standard TextRank implementation |
|
|
122
|
+
| `PositionRank` | Academic papers, news | Favors words appearing early in the document |
|
|
123
|
+
| `BiasedTextRank` | Topic-focused extraction | Biases results toward specified focus terms |
|
|
124
|
+
|
|
125
|
+
### PositionRank
|
|
126
|
+
|
|
127
|
+
Weights words by their position—earlier appearances score higher. Useful for documents where key information appears in titles, abstracts, or opening paragraphs.
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from rapid_textrank import PositionRank
|
|
131
|
+
|
|
132
|
+
extractor = PositionRank(top_n=10)
|
|
133
|
+
result = extractor.extract_keywords("""
|
|
134
|
+
Quantum Computing Advances in 2024
|
|
135
|
+
|
|
136
|
+
Researchers have made significant breakthroughs in quantum error correction.
|
|
137
|
+
The quantum computing field continues to evolve rapidly...
|
|
138
|
+
""")
|
|
139
|
+
|
|
140
|
+
# "quantum computing" and "quantum" will rank higher due to early position
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### BiasedTextRank
|
|
144
|
+
|
|
145
|
+
Steers extraction toward specific topics using focus terms. The `bias_weight` parameter controls how strongly results favor the focus terms.
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from rapid_textrank import BiasedTextRank
|
|
149
|
+
|
|
150
|
+
extractor = BiasedTextRank(
|
|
151
|
+
focus_terms=["security", "privacy"],
|
|
152
|
+
bias_weight=5.0, # Higher = stronger bias
|
|
153
|
+
top_n=10
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
result = extractor.extract_keywords("""
|
|
157
|
+
Modern web applications must balance user experience with security.
|
|
158
|
+
Privacy regulations require careful data handling. Performance
|
|
159
|
+
optimizations should not compromise security measures.
|
|
160
|
+
""")
|
|
161
|
+
|
|
162
|
+
# Results will favor security/privacy-related phrases
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## API Reference
|
|
166
|
+
|
|
167
|
+
### Convenience Function
|
|
168
|
+
|
|
169
|
+
The simplest way to extract keywords:
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from rapid_textrank import extract_keywords
|
|
173
|
+
|
|
174
|
+
phrases = extract_keywords(
|
|
175
|
+
text, # Input text
|
|
176
|
+
top_n=10, # Number of keywords to return
|
|
177
|
+
language="en" # Language for stopword filtering
|
|
178
|
+
)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Class-Based API
|
|
182
|
+
|
|
183
|
+
For more control, use the extractor classes:
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from rapid_textrank import BaseTextRank, PositionRank, BiasedTextRank
|
|
187
|
+
|
|
188
|
+
# Standard TextRank
|
|
189
|
+
extractor = BaseTextRank(top_n=10, language="en")
|
|
190
|
+
result = extractor.extract_keywords(text)
|
|
191
|
+
|
|
192
|
+
# Position-weighted
|
|
193
|
+
extractor = PositionRank(top_n=10, language="en")
|
|
194
|
+
result = extractor.extract_keywords(text)
|
|
195
|
+
|
|
196
|
+
# Topic-biased
|
|
197
|
+
extractor = BiasedTextRank(
|
|
198
|
+
focus_terms=["machine", "learning"],
|
|
199
|
+
bias_weight=5.0,
|
|
200
|
+
top_n=10,
|
|
201
|
+
language="en"
|
|
202
|
+
)
|
|
203
|
+
result = extractor.extract_keywords(text)
|
|
204
|
+
|
|
205
|
+
# You can also pass focus_terms per-call
|
|
206
|
+
result = extractor.extract_keywords(text, focus_terms=["neural", "network"])
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Configuration
|
|
210
|
+
|
|
211
|
+
Fine-tune the algorithm with `TextRankConfig`:
|
|
212
|
+
|
|
213
|
+
```python
|
|
214
|
+
from rapid_textrank import TextRankConfig, BaseTextRank
|
|
215
|
+
|
|
216
|
+
config = TextRankConfig(
|
|
217
|
+
damping=0.85, # PageRank damping factor (0-1)
|
|
218
|
+
max_iterations=100, # Maximum PageRank iterations
|
|
219
|
+
convergence_threshold=1e-6,# Convergence threshold
|
|
220
|
+
window_size=4, # Co-occurrence window size
|
|
221
|
+
top_n=10, # Number of results
|
|
222
|
+
min_phrase_length=1, # Minimum words in a phrase
|
|
223
|
+
max_phrase_length=4, # Maximum words in a phrase
|
|
224
|
+
score_aggregation="sum", # How to combine word scores: "sum", "mean", "max", "rms"
|
|
225
|
+
language="en" # Language for stopwords
|
|
226
|
+
)
|
|
227
|
+
|
|
228
|
+
extractor = BaseTextRank(config=config)
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### Result Objects
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
result = extractor.extract_keywords(text)
|
|
235
|
+
|
|
236
|
+
# TextRankResult attributes
|
|
237
|
+
result.phrases # List of Phrase objects
|
|
238
|
+
result.converged # Whether PageRank converged
|
|
239
|
+
result.iterations # Number of iterations run
|
|
240
|
+
|
|
241
|
+
# Phrase attributes
|
|
242
|
+
for phrase in result.phrases:
|
|
243
|
+
phrase.text # The phrase text (e.g., "machine learning")
|
|
244
|
+
phrase.lemma # Lemmatized form
|
|
245
|
+
phrase.score # TextRank score
|
|
246
|
+
phrase.count # Occurrences in text
|
|
247
|
+
phrase.rank # 1-indexed rank
|
|
248
|
+
|
|
249
|
+
# Convenience method
|
|
250
|
+
tuples = result.as_tuples() # [(text, score), ...]
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
### JSON Interface
|
|
254
|
+
|
|
255
|
+
For processing large documents or integrating with spaCy, use the JSON interface. This accepts pre-tokenized data to avoid re-tokenizing in Rust.
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
from rapid_textrank import extract_from_json, extract_batch_from_json
|
|
259
|
+
import json
|
|
260
|
+
|
|
261
|
+
# Single document
|
|
262
|
+
doc = {
|
|
263
|
+
"tokens": [
|
|
264
|
+
{
|
|
265
|
+
"text": "Machine",
|
|
266
|
+
"lemma": "machine",
|
|
267
|
+
"pos": "NOUN",
|
|
268
|
+
"start": 0,
|
|
269
|
+
"end": 7,
|
|
270
|
+
"sentence_idx": 0,
|
|
271
|
+
"token_idx": 0,
|
|
272
|
+
"is_stopword": False
|
|
273
|
+
},
|
|
274
|
+
# ... more tokens
|
|
275
|
+
],
|
|
276
|
+
"config": {"top_n": 10}
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
result_json = extract_from_json(json.dumps(doc))
|
|
280
|
+
result = json.loads(result_json)
|
|
281
|
+
|
|
282
|
+
# Batch processing (parallel in Rust)
|
|
283
|
+
docs = [doc1, doc2, doc3]
|
|
284
|
+
results_json = extract_batch_from_json(json.dumps(docs))
|
|
285
|
+
results = json.loads(results_json)
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## Supported Languages
|
|
289
|
+
|
|
290
|
+
Stopword filtering is available for 18 languages:
|
|
291
|
+
|
|
292
|
+
| Code | Language | Code | Language | Code | Language |
|
|
293
|
+
|------|----------|------|----------|------|----------|
|
|
294
|
+
| `en` | English | `de` | German | `fr` | French |
|
|
295
|
+
| `es` | Spanish | `it` | Italian | `pt` | Portuguese |
|
|
296
|
+
| `nl` | Dutch | `ru` | Russian | `sv` | Swedish |
|
|
297
|
+
| `no` | Norwegian | `da` | Danish | `fi` | Finnish |
|
|
298
|
+
| `hu` | Hungarian | `tr` | Turkish | `pl` | Polish |
|
|
299
|
+
| `ar` | Arabic | `zh` | Chinese | `ja` | Japanese |
|
|
300
|
+
|
|
301
|
+
## Performance
|
|
302
|
+
|
|
303
|
+
rapid_textrank achieves significant speedups through Rust's performance characteristics and careful algorithm implementation.
|
|
304
|
+
|
|
305
|
+
### Benchmark Script
|
|
306
|
+
|
|
307
|
+
Run this script to compare performance on your hardware:
|
|
308
|
+
|
|
309
|
+
```python
|
|
310
|
+
"""
|
|
311
|
+
Benchmark: rapid_textrank vs pytextrank
|
|
312
|
+
|
|
313
|
+
Prerequisites:
|
|
314
|
+
pip install rapid_textrank pytextrank spacy
|
|
315
|
+
python -m spacy download en_core_web_sm
|
|
316
|
+
"""
|
|
317
|
+
|
|
318
|
+
import time
|
|
319
|
+
import statistics
|
|
320
|
+
|
|
321
|
+
# Sample texts of varying sizes
|
|
322
|
+
TEXTS = {
|
|
323
|
+
"small": """
|
|
324
|
+
Machine learning is a subset of artificial intelligence.
|
|
325
|
+
Deep learning uses neural networks with many layers.
|
|
326
|
+
""",
|
|
327
|
+
|
|
328
|
+
"medium": """
|
|
329
|
+
Natural language processing (NLP) is a field of artificial intelligence
|
|
330
|
+
that focuses on the interaction between computers and humans through
|
|
331
|
+
natural language. The ultimate goal of NLP is to enable computers to
|
|
332
|
+
understand, interpret, and generate human language in a valuable way.
|
|
333
|
+
|
|
334
|
+
Machine learning approaches have transformed NLP in recent years.
|
|
335
|
+
Deep learning models, particularly transformers, have achieved
|
|
336
|
+
state-of-the-art results on many NLP tasks including translation,
|
|
337
|
+
summarization, and question answering.
|
|
338
|
+
|
|
339
|
+
Key applications include sentiment analysis, named entity recognition,
|
|
340
|
+
machine translation, and text classification. These technologies
|
|
341
|
+
power virtual assistants, search engines, and content recommendation
|
|
342
|
+
systems used by millions of people daily.
|
|
343
|
+
""",
|
|
344
|
+
|
|
345
|
+
"large": """
|
|
346
|
+
Artificial intelligence has evolved dramatically since its inception in
|
|
347
|
+
the mid-20th century. Early AI systems relied on symbolic reasoning and
|
|
348
|
+
expert systems, where human knowledge was manually encoded into rules.
|
|
349
|
+
|
|
350
|
+
The machine learning revolution changed everything. Instead of explicit
|
|
351
|
+
programming, systems learn patterns from data. Supervised learning uses
|
|
352
|
+
labeled examples, unsupervised learning finds hidden structures, and
|
|
353
|
+
reinforcement learning optimizes through trial and error.
|
|
354
|
+
|
|
355
|
+
Deep learning, powered by neural networks with multiple layers, has
|
|
356
|
+
achieved remarkable success. Convolutional neural networks excel at
|
|
357
|
+
image recognition. Recurrent neural networks and transformers handle
|
|
358
|
+
sequential data like text and speech. Generative adversarial networks
|
|
359
|
+
create realistic synthetic content.
|
|
360
|
+
|
|
361
|
+
Natural language processing has been transformed by these advances.
|
|
362
|
+
Word embeddings capture semantic relationships. Attention mechanisms
|
|
363
|
+
allow models to focus on relevant context. Large language models
|
|
364
|
+
demonstrate emergent capabilities in reasoning and generation.
|
|
365
|
+
|
|
366
|
+
Computer vision applications include object detection, facial recognition,
|
|
367
|
+
medical image analysis, and autonomous vehicle perception. These systems
|
|
368
|
+
process visual information with superhuman accuracy in many domains.
|
|
369
|
+
|
|
370
|
+
The ethical implications of AI are significant. Bias in training data
|
|
371
|
+
can lead to unfair outcomes. Privacy concerns arise from data collection.
|
|
372
|
+
Job displacement affects workers across industries. Regulation and
|
|
373
|
+
governance frameworks are being developed worldwide.
|
|
374
|
+
|
|
375
|
+
Future directions include neuromorphic computing, quantum machine learning,
|
|
376
|
+
and artificial general intelligence. Researchers continue to push
|
|
377
|
+
boundaries while addressing safety and alignment challenges.
|
|
378
|
+
""" * 3 # ~1000 words
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def benchmark_rapid_textrank(text: str, runs: int = 10) -> dict:
|
|
383
|
+
"""Benchmark rapid_textrank."""
|
|
384
|
+
from rapid_textrank import BaseTextRank
|
|
385
|
+
|
|
386
|
+
extractor = BaseTextRank(top_n=10, language="en")
|
|
387
|
+
|
|
388
|
+
# Warmup
|
|
389
|
+
extractor.extract_keywords(text)
|
|
390
|
+
|
|
391
|
+
times = []
|
|
392
|
+
for _ in range(runs):
|
|
393
|
+
start = time.perf_counter()
|
|
394
|
+
result = extractor.extract_keywords(text)
|
|
395
|
+
elapsed = time.perf_counter() - start
|
|
396
|
+
times.append(elapsed * 1000) # Convert to ms
|
|
397
|
+
|
|
398
|
+
return {
|
|
399
|
+
"min": min(times),
|
|
400
|
+
"mean": statistics.mean(times),
|
|
401
|
+
"median": statistics.median(times),
|
|
402
|
+
"std": statistics.stdev(times) if len(times) > 1 else 0,
|
|
403
|
+
"phrases": len(result.phrases)
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def benchmark_pytextrank(text: str, runs: int = 10) -> dict:
|
|
408
|
+
"""Benchmark pytextrank with spaCy."""
|
|
409
|
+
import spacy
|
|
410
|
+
import pytextrank
|
|
411
|
+
|
|
412
|
+
nlp = spacy.load("en_core_web_sm")
|
|
413
|
+
nlp.add_pipe("textrank")
|
|
414
|
+
|
|
415
|
+
# Warmup
|
|
416
|
+
doc = nlp(text)
|
|
417
|
+
|
|
418
|
+
times = []
|
|
419
|
+
for _ in range(runs):
|
|
420
|
+
start = time.perf_counter()
|
|
421
|
+
doc = nlp(text)
|
|
422
|
+
phrases = list(doc._.phrases[:10])
|
|
423
|
+
elapsed = time.perf_counter() - start
|
|
424
|
+
times.append(elapsed * 1000)
|
|
425
|
+
|
|
426
|
+
return {
|
|
427
|
+
"min": min(times),
|
|
428
|
+
"mean": statistics.mean(times),
|
|
429
|
+
"median": statistics.median(times),
|
|
430
|
+
"std": statistics.stdev(times) if len(times) > 1 else 0,
|
|
431
|
+
"phrases": len(phrases)
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def main():
|
|
436
|
+
print("=" * 70)
|
|
437
|
+
print("TextRank Performance Benchmark")
|
|
438
|
+
print("=" * 70)
|
|
439
|
+
|
|
440
|
+
for size, text in TEXTS.items():
|
|
441
|
+
word_count = len(text.split())
|
|
442
|
+
print(f"\n{size.upper()} TEXT (~{word_count} words)")
|
|
443
|
+
print("-" * 50)
|
|
444
|
+
|
|
445
|
+
# Benchmark rapid_textrank
|
|
446
|
+
rust_results = benchmark_rapid_textrank(text)
|
|
447
|
+
print(f"rapid_textrank: {rust_results['mean']:>8.2f} ms (±{rust_results['std']:.2f})")
|
|
448
|
+
|
|
449
|
+
# Benchmark pytextrank
|
|
450
|
+
try:
|
|
451
|
+
py_results = benchmark_pytextrank(text)
|
|
452
|
+
print(f"pytextrank: {py_results['mean']:>8.2f} ms (±{py_results['std']:.2f})")
|
|
453
|
+
|
|
454
|
+
speedup = py_results['mean'] / rust_results['mean']
|
|
455
|
+
print(f"Speedup: {speedup:>8.1f}x faster")
|
|
456
|
+
except Exception as e:
|
|
457
|
+
print(f"pytextrank: (not available: {e})")
|
|
458
|
+
|
|
459
|
+
print("\n" + "=" * 70)
|
|
460
|
+
print("Note: pytextrank times include spaCy tokenization.")
|
|
461
|
+
print("For fair comparison with pre-tokenized input, use rapid_textrank's JSON API.")
|
|
462
|
+
print("=" * 70)
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
if __name__ == "__main__":
|
|
466
|
+
main()
|
|
467
|
+
```
|
|
468
|
+
|
|
469
|
+
### Why Rust is Fast
|
|
470
|
+
|
|
471
|
+
The performance advantage comes from several factors:
|
|
472
|
+
|
|
473
|
+
1. **CSR Graph Format**: The co-occurrence graph uses Compressed Sparse Row format, enabling cache-friendly memory access during PageRank iteration.
|
|
474
|
+
|
|
475
|
+
2. **String Interning**: Repeated words share a single allocation via `StringPool`, reducing memory usage 10-100x for typical documents.
|
|
476
|
+
|
|
477
|
+
3. **Parallel Processing**: Rayon provides data parallelism for batch processing without explicit thread management.
|
|
478
|
+
|
|
479
|
+
4. **Link-Time Optimization (LTO)**: Release builds use full LTO with single codegen unit for maximum inlining.
|
|
480
|
+
|
|
481
|
+
5. **No GIL**: All computation happens in Rust. Python's Global Interpreter Lock is released during extraction.
|
|
482
|
+
|
|
483
|
+
6. **FxHash**: Fast non-cryptographic hashing for internal hash maps.
|
|
484
|
+
|
|
485
|
+
## Installation
|
|
486
|
+
|
|
487
|
+
### From PyPI
|
|
488
|
+
|
|
489
|
+
```bash
|
|
490
|
+
pip install rapid_textrank
|
|
491
|
+
```
|
|
492
|
+
|
|
493
|
+
Import name is `rapid_textrank`.
|
|
494
|
+
|
|
495
|
+
### With spaCy Support
|
|
496
|
+
|
|
497
|
+
```bash
|
|
498
|
+
pip install rapid_textrank[spacy]
|
|
499
|
+
```
|
|
500
|
+
|
|
501
|
+
### From Source
|
|
502
|
+
|
|
503
|
+
Requirements: Rust 1.70+, Python 3.9+
|
|
504
|
+
|
|
505
|
+
```bash
|
|
506
|
+
git clone https://github.com/xang1234/rapid-textrank
|
|
507
|
+
cd rapid_textrank
|
|
508
|
+
pip install maturin
|
|
509
|
+
maturin develop --release
|
|
510
|
+
```
|
|
511
|
+
|
|
512
|
+
### Development Setup
|
|
513
|
+
|
|
514
|
+
```bash
|
|
515
|
+
# Install with dev dependencies
|
|
516
|
+
pip install -e ".[dev]"
|
|
517
|
+
|
|
518
|
+
# Run tests
|
|
519
|
+
pytest
|
|
520
|
+
|
|
521
|
+
# Run Rust tests
|
|
522
|
+
cargo test
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
## Publishing
|
|
526
|
+
|
|
527
|
+
Publishing is automated with GitHub Actions using Trusted Publishing (OIDC), so no API tokens are stored.
|
|
528
|
+
|
|
529
|
+
TestPyPI release (push a tag):
|
|
530
|
+
|
|
531
|
+
```bash
|
|
532
|
+
git tag -a test-0.1.0 -m "TestPyPI 0.1.0"
|
|
533
|
+
git push origin test-0.1.0
|
|
534
|
+
```
|
|
535
|
+
|
|
536
|
+
Tag pattern: `test-*`
|
|
537
|
+
|
|
538
|
+
PyPI release (push a tag):
|
|
539
|
+
|
|
540
|
+
```bash
|
|
541
|
+
git tag -a v0.1.0 -m "Release 0.1.0"
|
|
542
|
+
git push origin v0.1.0
|
|
543
|
+
```
|
|
544
|
+
|
|
545
|
+
Tag pattern: `v*`
|
|
546
|
+
|
|
547
|
+
Wheel builds
|
|
548
|
+
|
|
549
|
+
GitHub Actions builds wheels for Python 3.9–3.12 on Linux, macOS, and Windows.
|
|
550
|
+
|
|
551
|
+
Before the first publish, add Trusted Publishers on TestPyPI and PyPI:
|
|
552
|
+
|
|
553
|
+
- Repo: `xang1234/rapid-textrank`
|
|
554
|
+
- Workflows: `.github/workflows/publish-testpypi.yml` and `.github/workflows/publish-pypi.yml`
|
|
555
|
+
- Environments: `testpypi` and `pypi`
|
|
556
|
+
|
|
557
|
+
You can also trigger either workflow manually via GitHub Actions if needed.
|
|
558
|
+
|
|
559
|
+
## License
|
|
560
|
+
|
|
561
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
|
562
|
+
|
|
563
|
+
## Citation
|
|
564
|
+
|
|
565
|
+
If you use rapid_textrank in research, please cite the original TextRank paper:
|
|
566
|
+
|
|
567
|
+
```bibtex
|
|
568
|
+
@inproceedings{mihalcea-tarau-2004-textrank,
|
|
569
|
+
title = "{T}ext{R}ank: Bringing Order into Text",
|
|
570
|
+
author = "Mihalcea, Rada and Tarau, Paul",
|
|
571
|
+
booktitle = "Proceedings of EMNLP 2004",
|
|
572
|
+
year = "2004",
|
|
573
|
+
publisher = "Association for Computational Linguistics",
|
|
574
|
+
}
|
|
575
|
+
```
|
|
576
|
+
|
|
577
|
+
For PositionRank:
|
|
578
|
+
|
|
579
|
+
```bibtex
|
|
580
|
+
@inproceedings{florescu-caragea-2017-positionrank,
|
|
581
|
+
title = "{P}osition{R}ank: An Unsupervised Approach to Keyphrase Extraction from Scholarly Documents",
|
|
582
|
+
author = "Florescu, Corina and Caragea, Cornelia",
|
|
583
|
+
booktitle = "Proceedings of ACL 2017",
|
|
584
|
+
year = "2017",
|
|
585
|
+
}
|
|
586
|
+
```
|
|
587
|
+
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
rapid_textrank/__init__.py,sha256=o_pRt43a7RqoeenKu0Gg1EOg3zR88U29u7C5uz228ZM,1373
|
|
2
|
+
rapid_textrank/_rust.cpython-314-darwin.so,sha256=UFhljfGAKMkXLD7xjwU8UP3NLFao7zfS5-mDtu7f32o,1108680
|
|
3
|
+
rapid_textrank/spacy_component.py,sha256=yYOR-2sP9uxg_YhNHPYPjamt2meCGBOM1_mYbjh6pnY,7509
|
|
4
|
+
rapid_textrank-0.0.1.dist-info/METADATA,sha256=N0HQLk2FkzTYVxUGM2_vmpAWnP04HcXFYGIyVvN3jwE,18686
|
|
5
|
+
rapid_textrank-0.0.1.dist-info/WHEEL,sha256=jyP0hJCe-fSX_gEscesIqqW7KerDJw7iyldGx-__w10,107
|
|
6
|
+
rapid_textrank-0.0.1.dist-info/licenses/LICENSE,sha256=yWeokG20y7cdx3UXBPoQgILIFjkz-ODLh3NElIXgGUA,1080
|
|
7
|
+
rapid_textrank-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 TextRanker Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|