rapid-textrank 0.1.0__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,55 @@
1
+ """
2
+ rapid_textrank - High-performance TextRank implementation
3
+
4
+ A fast TextRank implementation in Rust with Python bindings,
5
+ providing keyword extraction and text summarization.
6
+ """
7
+
8
+ from rapid_textrank._rust import (
9
+ __version__,
10
+ Phrase,
11
+ TextRankResult,
12
+ TextRankConfig,
13
+ BaseTextRank,
14
+ PositionRank,
15
+ BiasedTextRank,
16
+ extract_from_json,
17
+ extract_batch_from_json,
18
+ get_stopwords,
19
+ )
20
+
21
# Public API: symbols re-exported from the Rust extension plus the
# pure-Python convenience helper defined below. `extract_keywords` was
# missing here even though the README documents it as public API — added
# so `from rapid_textrank import *` exports it.
__all__ = [
    "__version__",
    "Phrase",
    "TextRankResult",
    "TextRankConfig",
    "BaseTextRank",
    "PositionRank",
    "BiasedTextRank",
    "extract_from_json",
    "extract_batch_from_json",
    "get_stopwords",
    "extract_keywords",
]
33
+
34
+
35
def extract_keywords(text: str, top_n: int = 10, language: str = "en") -> list:
    """
    Extract keywords from text using TextRank.

    Thin convenience wrapper: builds a `BaseTextRank` extractor with the
    given options, runs it once, and returns only the ranked phrases.

    Args:
        text: The input text to extract keywords from
        top_n: Number of top keywords to return
        language: Language code for stopword filtering

    Returns:
        List of Phrase objects with text, score, and rank

    Example:
        >>> from rapid_textrank import extract_keywords
        >>> phrases = extract_keywords("Machine learning is a subset of AI.")
        >>> for phrase in phrases:
        ...     print(f"{phrase.text}: {phrase.score:.4f}")
    """
    ranker = BaseTextRank(top_n=top_n, language=language)
    # Materialize the result's phrase sequence as a plain list for callers.
    return [phrase for phrase in ranker.extract_keywords(text).phrases]
Binary file
@@ -0,0 +1,239 @@
1
+ """
2
+ spaCy pipeline component for rapid_textrank.
3
+
4
+ This module provides a spaCy pipeline component that uses rapid_textrank
5
+ for keyword extraction. It can be used as a drop-in replacement for
6
+ pytextrank with significantly better performance.
7
+
8
+ Example:
9
+ >>> import spacy
10
+ >>> from rapid_textrank.spacy_component import RustTextRank
11
+ >>>
12
+ >>> nlp = spacy.load("en_core_web_sm")
13
+ >>> nlp.add_pipe("rapid_textrank")
14
+ >>>
15
+ >>> doc = nlp("Machine learning is a subset of artificial intelligence.")
16
+ >>> for phrase in doc._.phrases[:5]:
17
+ ... print(f"{phrase.text}: {phrase.score:.4f}")
18
+ """
19
+
20
+ from typing import List, Optional, Dict, Any
21
+ import json
22
+
23
+ try:
24
+ from spacy.tokens import Doc
25
+ from spacy.language import Language
26
+
27
+ SPACY_AVAILABLE = True
28
+ except ImportError:
29
+ SPACY_AVAILABLE = False
30
+ Doc = None
31
+ Language = None
32
+
33
+ from rapid_textrank._rust import extract_from_json
34
+
35
+
36
class Phrase:
    """
    A keyphrase extracted by RustTextRank.

    Compatible with pytextrank's Phrase interface.
    """

    def __init__(self, text: str, lemma: str, score: float, count: int, rank: int):
        # Surface form, lemmatized form, TextRank score, occurrence count,
        # and 1-indexed rank — mirroring the Rust-side phrase record.
        self.text = text
        self.lemma = lemma
        self.score = score
        self.count = count
        self.rank = rank
        # pytextrank exposes matched spans via `chunks`; kept empty here
        # for interface compatibility.
        self.chunks = []

    def __repr__(self) -> str:
        # Same format as an f-string rendering with score at 4 decimals.
        return "Phrase(text='{}', score={:.4f}, rank={})".format(
            self.text, self.score, self.rank
        )

    def __str__(self) -> str:
        # A phrase stringifies to its surface text.
        return self.text
57
+
58
+
59
class RustTextRankResult:
    """Result container for TextRank extraction."""

    def __init__(self, phrases: List[Phrase], converged: bool, iterations: int):
        # Ranked phrases plus PageRank convergence diagnostics.
        self.phrases = phrases
        self.converged = converged
        self.iterations = iterations

    def __len__(self) -> int:
        # Length of the result is the number of extracted phrases.
        return len(self.phrases)

    def __iter__(self):
        # Iterating the result walks its phrases directly.
        yield from self.phrases
72
+
73
+
74
+ if SPACY_AVAILABLE:
75
+
76
@Language.factory(
    "rapid_textrank",
    default_config={
        "damping": 0.85,
        "max_iterations": 100,
        "convergence_threshold": 1e-6,
        "window_size": 4,
        "top_n": 10,
        "min_phrase_length": 1,
        "max_phrase_length": 4,
        "score_aggregation": "sum",
    },
)
def create_rapid_textrank(
    nlp: Language,
    name: str,
    damping: float,
    max_iterations: int,
    convergence_threshold: float,
    window_size: int,
    top_n: int,
    min_phrase_length: int,
    max_phrase_length: int,
    score_aggregation: str,
):
    """Create a RustTextRank pipeline component."""
    # Bundle the algorithm knobs and forward everything to the component.
    settings = dict(
        damping=damping,
        max_iterations=max_iterations,
        convergence_threshold=convergence_threshold,
        window_size=window_size,
        top_n=top_n,
        min_phrase_length=min_phrase_length,
        max_phrase_length=max_phrase_length,
        score_aggregation=score_aggregation,
    )
    return RustTextRank(nlp=nlp, name=name, **settings)
114
+
115
class RustTextRank:
    """
    spaCy pipeline component for TextRank keyword extraction.

    This component uses the Rust implementation for fast extraction
    while integrating seamlessly with spaCy's NLP pipeline.

    Example:
        >>> import spacy
        >>> nlp = spacy.load("en_core_web_sm")
        >>> nlp.add_pipe("rapid_textrank")
        >>> doc = nlp("Machine learning is transforming industries.")
        >>> for phrase in doc._.phrases:
        ...     print(phrase.text, phrase.score)
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "rapid_textrank",
        damping: float = 0.85,
        max_iterations: int = 100,
        convergence_threshold: float = 1e-6,
        window_size: int = 4,
        top_n: int = 10,
        min_phrase_length: int = 1,
        max_phrase_length: int = 4,
        score_aggregation: str = "sum",
    ):
        """Store the pipeline handle and algorithm configuration.

        Args:
            nlp: The spaCy pipeline this component belongs to.
            name: Component name within the pipeline.
            damping: PageRank damping factor (0-1).
            max_iterations: Maximum PageRank iterations.
            convergence_threshold: Early-stop threshold for PageRank.
            window_size: Co-occurrence window size in tokens.
            top_n: Number of phrases to return.
            min_phrase_length: Minimum words per phrase.
            max_phrase_length: Maximum words per phrase.
            score_aggregation: How word scores combine into a phrase score.
        """
        self.nlp = nlp
        self.name = name
        # Config dict is serialized as-is into the JSON payload sent to Rust.
        self.config = {
            "damping": damping,
            "max_iterations": max_iterations,
            "convergence_threshold": convergence_threshold,
            "window_size": window_size,
            "top_n": top_n,
            "min_phrase_length": min_phrase_length,
            "max_phrase_length": max_phrase_length,
            "score_aggregation": score_aggregation,
        }

        # Register custom extensions; guarded so re-instantiation (e.g.
        # loading several pipelines) does not raise.
        # NOTE(review): the shared mutable default [] is only safe because
        # __call__ always assigns a fresh list per processed Doc.
        if not Doc.has_extension("phrases"):
            Doc.set_extension("phrases", default=[])
        if not Doc.has_extension("textrank_result"):
            Doc.set_extension("textrank_result", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """Process a spaCy Doc and extract keyphrases.

        Populates ``doc._.phrases`` and ``doc._.textrank_result`` and
        returns the same Doc.
        """
        # Convert spaCy tokens to the JSON token format the Rust core expects.
        tokens = [
            {
                "text": token.text,
                "lemma": token.lemma_,
                "pos": token.pos_,
                "start": token.idx,
                "end": token.idx + len(token.text),
                "sentence_idx": sent_idx,
                "token_idx": token.i,
                "is_stopword": token.is_stop,
            }
            for sent_idx, sent in enumerate(doc.sents)
            for token in sent
        ]

        # Create JSON input and run extraction in Rust.
        json_input = json.dumps({"tokens": tokens, "config": self.config})
        result = json.loads(extract_from_json(json_input))

        # Convert raw dicts into Phrase objects.
        phrases = [
            Phrase(
                text=p["text"],
                lemma=p["lemma"],
                score=p["score"],
                count=p["count"],
                rank=p["rank"],
            )
            for p in result["phrases"]
        ]

        # Store results on the Doc's custom extensions.
        doc._.phrases = phrases
        doc._.textrank_result = RustTextRankResult(
            phrases=phrases,
            converged=result["converged"],
            iterations=result["iterations"],
        )

        return doc

    def to_disk(self, path, **kwargs):
        """Save component configuration to disk as ``config.json``."""
        # Module-level `json` is already imported; no local re-import needed.
        from pathlib import Path

        directory = Path(path)
        # The target directory may not exist yet when serializing a pipeline.
        directory.mkdir(parents=True, exist_ok=True)
        # Explicit encoding so output does not depend on the locale default.
        with open(directory / "config.json", "w", encoding="utf-8") as f:
            json.dump(self.config, f)

    def from_disk(self, path, **kwargs):
        """Load component configuration from disk; returns self."""
        from pathlib import Path

        config_path = Path(path) / "config.json"
        with open(config_path, "r", encoding="utf-8") as f:
            self.config = json.load(f)
        return self
229
+
230
+ else:
231
+ # Fallback when spaCy is not available
232
+ class RustTextRank:
233
+ """Placeholder when spaCy is not installed."""
234
+
235
+ def __init__(self, *args, **kwargs):
236
+ raise ImportError(
237
+ "spaCy is required for the RustTextRank pipeline component. "
238
+ "Install it with: pip install spacy"
239
+ )
@@ -0,0 +1,606 @@
1
+ Metadata-Version: 2.4
2
+ Name: rapid_textrank
3
+ Version: 0.1.0
4
+ Classifier: Development Status :: 4 - Beta
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: Intended Audience :: Science/Research
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.9
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Rust
14
+ Classifier: Operating System :: MacOS :: MacOS X
15
+ Classifier: Operating System :: Microsoft :: Windows
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Text Processing :: Linguistic
19
+ Requires-Dist: pytest>=7.0 ; extra == 'dev'
20
+ Requires-Dist: pytest-benchmark ; extra == 'dev'
21
+ Requires-Dist: spacy>=3.0 ; extra == 'dev'
22
+ Requires-Dist: spacy>=3.0 ; extra == 'spacy'
23
+ Provides-Extra: dev
24
+ Provides-Extra: spacy
25
+ License-File: LICENSE
26
+ Summary: High-performance TextRank implementation with Python bindings
27
+ Keywords: nlp,textrank,keyword-extraction,summarization,pagerank
28
+ Author: TextRanker Contributors
29
+ License-Expression: MIT
30
+ Requires-Python: >=3.9
31
+ Description-Content-Type: text/markdown
32
+ Project-URL: Homepage, https://github.com/xang1234/rapid-textrank
33
+ Project-URL: Issues, https://github.com/xang1234/rapid-textrank/issues
34
+ Project-URL: Repository, https://github.com/xang1234/rapid-textrank
35
+
36
+ # rapid_textrank
37
+
38
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
39
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
40
+ [![Rust](https://img.shields.io/badge/rust-2021-orange.svg)](https://www.rust-lang.org/)
41
+
42
+ **High-performance TextRank implementation in Rust with Python bindings.**
43
+
44
+ Extract keywords and key phrases from text up to 10-100x faster than pure Python implementations (depending on document size and tokenization), with support for multiple algorithm variants and 18 languages.
45
+
46
+ ## Features
47
+
48
+ - **Fast**: Up to 10-100x faster than pure Python implementations (see benchmarks)
49
+ - **Multiple algorithms**: TextRank, PositionRank, and BiasedTextRank variants
50
+ - **Unicode-aware**: Proper handling of CJK and other scripts (emoji are ignored by the built-in tokenizer)
51
+ - **Multi-language**: Stopword support for 18 languages
52
+ - **Dual API**: Native Python classes + JSON interface for batch processing
53
+ - **Rust core**: Computation happens in Rust (the Python GIL is currently held during extraction)
54
+
55
+ ## Quick Start
56
+
57
+ ```bash
58
+ pip install rapid_textrank
59
+ ```
60
+
61
+ ```python
62
+ from rapid_textrank import extract_keywords
63
+
64
+ text = """
65
+ Machine learning is a subset of artificial intelligence that enables
66
+ systems to learn and improve from experience. Deep learning, a type of
67
+ machine learning, uses neural networks with many layers.
68
+ """
69
+
70
+ keywords = extract_keywords(text, top_n=5, language="en")
71
+ for phrase in keywords:
72
+ print(f"{phrase.text}: {phrase.score:.4f}")
73
+ ```
74
+
75
+ Output:
76
+ ```
77
+ machine learning: 0.2341
78
+ deep learning: 0.1872
79
+ artificial intelligence: 0.1654
80
+ neural networks: 0.1432
81
+ systems: 0.0891
82
+ ```
83
+
84
+ ## How TextRank Works
85
+
86
+ TextRank is a graph-based ranking algorithm for keyword extraction, inspired by Google's PageRank.
87
+
88
+ ### The Algorithm
89
+
90
+ 1. **Build a co-occurrence graph**: Words become nodes. An edge connects two words if they appear within a sliding window (default: 4 words).
91
+
92
+ 2. **Run PageRank**: The algorithm iteratively distributes "importance" through the graph. Words connected to many important words become important themselves.
93
+
94
+ 3. **Extract phrases**: High-scoring words are grouped into noun chunks (POS-filtered) to form key phrases. Scores are aggregated (sum, mean, or max).
95
+
96
+ ```
97
+ Text: "Machine learning enables systems to learn from data"
98
+
99
+ Co-occurrence graph (window=2):
100
+ machine ←→ learning ←→ enables ←→ systems ←→ learn ←→ data
101
+
102
+ PageRank
103
+
104
+ Scores: machine(0.23) learning(0.31) enables(0.12) ...
105
+
106
+ Phrase extraction
107
+
108
+ "machine learning" (0.54), "systems" (0.18), ...
109
+ ```
110
+
111
+ ### Further Reading
112
+
113
+ - [TextRank: Bringing Order into Texts](https://aclanthology.org/W04-3252/) (Mihalcea & Tarau, 2004)
114
+ - [PositionRank: An Unsupervised Approach to Keyphrase Extraction](https://aclanthology.org/P17-1102/) (Florescu & Caragea, 2017)
115
+ - [BiasedTextRank: Unsupervised Graph-Based Content Extraction](https://aclanthology.org/2020.coling-main.144/) (Kazemi et al., 2020)
116
+
117
+ ## Algorithm Variants
118
+
119
+ | Variant | Best For | Description |
120
+ |---------|----------|-------------|
121
+ | `BaseTextRank` | General text | Standard TextRank implementation |
122
+ | `PositionRank` | Academic papers, news | Favors words appearing early in the document |
123
+ | `BiasedTextRank` | Topic-focused extraction | Biases results toward specified focus terms |
124
+
125
+ ### PositionRank
126
+
127
+ Weights words by their position—earlier appearances score higher. Useful for documents where key information appears in titles, abstracts, or opening paragraphs.
128
+
129
+ ```python
130
+ from rapid_textrank import PositionRank
131
+
132
+ extractor = PositionRank(top_n=10)
133
+ result = extractor.extract_keywords("""
134
+ Quantum Computing Advances in 2024
135
+
136
+ Researchers have made significant breakthroughs in quantum error correction.
137
+ The quantum computing field continues to evolve rapidly...
138
+ """)
139
+
140
+ # "quantum computing" and "quantum" will rank higher due to early position
141
+ ```
142
+
143
+ ### BiasedTextRank
144
+
145
+ Steers extraction toward specific topics using focus terms. The `bias_weight` parameter controls how strongly results favor the focus terms.
146
+
147
+ ```python
148
+ from rapid_textrank import BiasedTextRank
149
+
150
+ extractor = BiasedTextRank(
151
+ focus_terms=["security", "privacy"],
152
+ bias_weight=5.0, # Higher = stronger bias
153
+ top_n=10
154
+ )
155
+
156
+ result = extractor.extract_keywords("""
157
+ Modern web applications must balance user experience with security.
158
+ Privacy regulations require careful data handling. Performance
159
+ optimizations should not compromise security measures.
160
+ """)
161
+
162
+ # Results will favor security/privacy-related phrases
163
+ ```
164
+
165
+ ## API Reference
166
+
167
+ ### Convenience Function
168
+
169
+ The simplest way to extract keywords:
170
+
171
+ ```python
172
+ from rapid_textrank import extract_keywords
173
+
174
+ phrases = extract_keywords(
175
+ text, # Input text
176
+ top_n=10, # Number of keywords to return
177
+ language="en" # Language for stopword filtering
178
+ )
179
+ ```
180
+
181
+ ### Class-Based API
182
+
183
+ For more control, use the extractor classes:
184
+
185
+ ```python
186
+ from rapid_textrank import BaseTextRank, PositionRank, BiasedTextRank
187
+
188
+ # Standard TextRank
189
+ extractor = BaseTextRank(top_n=10, language="en")
190
+ result = extractor.extract_keywords(text)
191
+
192
+ # Position-weighted
193
+ extractor = PositionRank(top_n=10, language="en")
194
+ result = extractor.extract_keywords(text)
195
+
196
+ # Topic-biased
197
+ extractor = BiasedTextRank(
198
+ focus_terms=["machine", "learning"],
199
+ bias_weight=5.0,
200
+ top_n=10,
201
+ language="en"
202
+ )
203
+ result = extractor.extract_keywords(text)
204
+
205
+ # You can also pass focus_terms per-call
206
+ result = extractor.extract_keywords(text, focus_terms=["neural", "network"])
207
+ ```
208
+
209
+ ### Configuration
210
+
211
+ Fine-tune the algorithm with `TextRankConfig`:
212
+
213
+ ```python
214
+ from rapid_textrank import TextRankConfig, BaseTextRank
215
+
216
+ config = TextRankConfig(
217
+ damping=0.85, # PageRank damping factor (0-1)
218
+ max_iterations=100, # Maximum PageRank iterations
219
+ convergence_threshold=1e-6,# Convergence threshold
220
+ window_size=4, # Co-occurrence window size
221
+ top_n=10, # Number of results
222
+ min_phrase_length=1, # Minimum words in a phrase
223
+ max_phrase_length=4, # Maximum words in a phrase
224
+ score_aggregation="sum", # How to combine word scores: "sum", "mean", "max", "rms"
225
+ language="en" # Language for stopwords
226
+ )
227
+
228
+ extractor = BaseTextRank(config=config)
229
+ ```
230
+
231
+ ### Result Objects
232
+
233
+ ```python
234
+ result = extractor.extract_keywords(text)
235
+
236
+ # TextRankResult attributes
237
+ result.phrases # List of Phrase objects
238
+ result.converged # Whether PageRank converged
239
+ result.iterations # Number of iterations run
240
+
241
+ # Phrase attributes
242
+ for phrase in result.phrases:
243
+ phrase.text # The phrase text (e.g., "machine learning")
244
+ phrase.lemma # Lemmatized form
245
+ phrase.score # TextRank score
246
+ phrase.count # Occurrences in text
247
+ phrase.rank # 1-indexed rank
248
+
249
+ # Convenience method
250
+ tuples = result.as_tuples() # [(text, score), ...]
251
+ ```
252
+
253
+ ### JSON Interface
254
+
255
+ For processing large documents or integrating with spaCy, use the JSON interface. This accepts pre-tokenized data to avoid re-tokenizing in Rust. Stopword handling can use each token's `is_stopword` field and/or a `config.language` plus `config.stopwords` (additional words that extend the built-in list). Language codes follow the Supported Languages table below.
256
+
257
+ ```python
258
+ from rapid_textrank import extract_from_json, extract_batch_from_json
259
+ import json
260
+
261
+ # Single document
262
+ doc = {
263
+ "tokens": [
264
+ {
265
+ "text": "Machine",
266
+ "lemma": "machine",
267
+ "pos": "NOUN",
268
+ "start": 0,
269
+ "end": 7,
270
+ "sentence_idx": 0,
271
+ "token_idx": 0,
272
+ "is_stopword": False
273
+ },
274
+ # ... more tokens
275
+ ],
276
+ "config": {"top_n": 10, "language": "en", "stopwords": ["nlp", "transformers"]}
277
+ }
278
+
279
+ result_json = extract_from_json(json.dumps(doc))
280
+ result = json.loads(result_json)
281
+
282
+ # Batch processing (Rust core; per-document processing is sequential)
283
+ docs = [doc1, doc2, doc3]
284
+ results_json = extract_batch_from_json(json.dumps(docs))
285
+ results = json.loads(results_json)
286
+ ```
287
+
288
+ ## Supported Languages
289
+
290
+ Stopword filtering is available for 18 languages. Use these codes for the `language` parameter in all APIs (including JSON config):
291
+
292
+ | Code | Language | Code | Language | Code | Language |
293
+ |------|----------|------|----------|------|----------|
294
+ | `en` | English | `de` | German | `fr` | French |
295
+ | `es` | Spanish | `it` | Italian | `pt` | Portuguese |
296
+ | `nl` | Dutch | `ru` | Russian | `sv` | Swedish |
297
+ | `no` | Norwegian | `da` | Danish | `fi` | Finnish |
298
+ | `hu` | Hungarian | `tr` | Turkish | `pl` | Polish |
299
+ | `ar` | Arabic | `zh` | Chinese | `ja` | Japanese |
300
+
301
+ You can inspect the built-in stopword list with:
302
+
303
+ ```python
304
+ import rapid_textrank as rt
305
+ rt.get_stopwords("en")
306
+ ```
307
+
308
+ ## Performance
309
+
310
+ rapid_textrank achieves significant speedups through Rust's performance characteristics and careful algorithm implementation.
311
+
312
+ ### Benchmark Script
313
+
314
+ Run this script to compare performance on your hardware:
315
+
316
+ ```python
317
+ """
318
+ Benchmark: rapid_textrank vs pytextrank
319
+
320
+ Prerequisites:
321
+ pip install rapid_textrank pytextrank spacy
322
+ python -m spacy download en_core_web_sm
323
+ """
324
+
325
+ import time
326
+ import statistics
327
+
328
+ # Sample texts of varying sizes
329
+ TEXTS = {
330
+ "small": """
331
+ Machine learning is a subset of artificial intelligence.
332
+ Deep learning uses neural networks with many layers.
333
+ """,
334
+
335
+ "medium": """
336
+ Natural language processing (NLP) is a field of artificial intelligence
337
+ that focuses on the interaction between computers and humans through
338
+ natural language. The ultimate goal of NLP is to enable computers to
339
+ understand, interpret, and generate human language in a valuable way.
340
+
341
+ Machine learning approaches have transformed NLP in recent years.
342
+ Deep learning models, particularly transformers, have achieved
343
+ state-of-the-art results on many NLP tasks including translation,
344
+ summarization, and question answering.
345
+
346
+ Key applications include sentiment analysis, named entity recognition,
347
+ machine translation, and text classification. These technologies
348
+ power virtual assistants, search engines, and content recommendation
349
+ systems used by millions of people daily.
350
+ """,
351
+
352
+ "large": """
353
+ Artificial intelligence has evolved dramatically since its inception in
354
+ the mid-20th century. Early AI systems relied on symbolic reasoning and
355
+ expert systems, where human knowledge was manually encoded into rules.
356
+
357
+ The machine learning revolution changed everything. Instead of explicit
358
+ programming, systems learn patterns from data. Supervised learning uses
359
+ labeled examples, unsupervised learning finds hidden structures, and
360
+ reinforcement learning optimizes through trial and error.
361
+
362
+ Deep learning, powered by neural networks with multiple layers, has
363
+ achieved remarkable success. Convolutional neural networks excel at
364
+ image recognition. Recurrent neural networks and transformers handle
365
+ sequential data like text and speech. Generative adversarial networks
366
+ create realistic synthetic content.
367
+
368
+ Natural language processing has been transformed by these advances.
369
+ Word embeddings capture semantic relationships. Attention mechanisms
370
+ allow models to focus on relevant context. Large language models
371
+ demonstrate emergent capabilities in reasoning and generation.
372
+
373
+ Computer vision applications include object detection, facial recognition,
374
+ medical image analysis, and autonomous vehicle perception. These systems
375
+ process visual information with superhuman accuracy in many domains.
376
+
377
+ The ethical implications of AI are significant. Bias in training data
378
+ can lead to unfair outcomes. Privacy concerns arise from data collection.
379
+ Job displacement affects workers across industries. Regulation and
380
+ governance frameworks are being developed worldwide.
381
+
382
+ Future directions include neuromorphic computing, quantum machine learning,
383
+ and artificial general intelligence. Researchers continue to push
384
+ boundaries while addressing safety and alignment challenges.
385
+ """ * 3 # ~1000 words
386
+ }
387
+
388
+
389
+ def benchmark_rapid_textrank(text: str, runs: int = 10) -> dict:
390
+ """Benchmark rapid_textrank."""
391
+ from rapid_textrank import BaseTextRank
392
+
393
+ extractor = BaseTextRank(top_n=10, language="en")
394
+
395
+ # Warmup
396
+ extractor.extract_keywords(text)
397
+
398
+ times = []
399
+ for _ in range(runs):
400
+ start = time.perf_counter()
401
+ result = extractor.extract_keywords(text)
402
+ elapsed = time.perf_counter() - start
403
+ times.append(elapsed * 1000) # Convert to ms
404
+
405
+ return {
406
+ "min": min(times),
407
+ "mean": statistics.mean(times),
408
+ "median": statistics.median(times),
409
+ "std": statistics.stdev(times) if len(times) > 1 else 0,
410
+ "phrases": len(result.phrases)
411
+ }
412
+
413
+
414
+ def benchmark_pytextrank(text: str, runs: int = 10) -> dict:
415
+ """Benchmark pytextrank with spaCy."""
416
+ import spacy
417
+ import pytextrank
418
+
419
+ nlp = spacy.load("en_core_web_sm")
420
+ nlp.add_pipe("textrank")
421
+
422
+ # Warmup
423
+ doc = nlp(text)
424
+
425
+ times = []
426
+ for _ in range(runs):
427
+ start = time.perf_counter()
428
+ doc = nlp(text)
429
+ phrases = list(doc._.phrases[:10])
430
+ elapsed = time.perf_counter() - start
431
+ times.append(elapsed * 1000)
432
+
433
+ return {
434
+ "min": min(times),
435
+ "mean": statistics.mean(times),
436
+ "median": statistics.median(times),
437
+ "std": statistics.stdev(times) if len(times) > 1 else 0,
438
+ "phrases": len(phrases)
439
+ }
440
+
441
+
442
+ def main():
443
+ print("=" * 70)
444
+ print("TextRank Performance Benchmark")
445
+ print("=" * 70)
446
+
447
+ for size, text in TEXTS.items():
448
+ word_count = len(text.split())
449
+ print(f"\n{size.upper()} TEXT (~{word_count} words)")
450
+ print("-" * 50)
451
+
452
+ # Benchmark rapid_textrank
453
+ rust_results = benchmark_rapid_textrank(text)
454
+ print(f"rapid_textrank: {rust_results['mean']:>8.2f} ms (±{rust_results['std']:.2f})")
455
+
456
+ # Benchmark pytextrank
457
+ try:
458
+ py_results = benchmark_pytextrank(text)
459
+ print(f"pytextrank: {py_results['mean']:>8.2f} ms (±{py_results['std']:.2f})")
460
+
461
+ speedup = py_results['mean'] / rust_results['mean']
462
+ print(f"Speedup: {speedup:>8.1f}x faster")
463
+ except Exception as e:
464
+ print(f"pytextrank: (not available: {e})")
465
+
466
+ print("\n" + "=" * 70)
467
+ print("Note: pytextrank times include spaCy tokenization.")
468
+ print("For fair comparison with pre-tokenized input, use rapid_textrank's JSON API.")
469
+ print("=" * 70)
470
+
471
+
472
+ if __name__ == "__main__":
473
+ main()
474
+ ```
475
+
476
+ ### Why Rust is Fast
477
+
478
+ The performance advantage comes from several factors:
479
+
480
+ 1. **CSR Graph Format**: The co-occurrence graph uses Compressed Sparse Row format, enabling cache-friendly memory access during PageRank iteration.
481
+
482
+ 2. **String Interning**: Repeated words share a single allocation via `StringPool`, reducing memory usage 10-100x for typical documents.
483
+
484
+ 3. **Parallel Processing**: Rayon provides data parallelism in internal graph construction without explicit thread management.
485
+
486
+ 4. **Link-Time Optimization (LTO)**: Release builds use full LTO with single codegen unit for maximum inlining.
487
+
488
+ 5. **Rust core**: Most computation happens in Rust, minimizing Python-level overhead.
489
+
490
+ 6. **FxHash**: Fast non-cryptographic hashing for internal hash maps.
491
+
492
+ ## Installation
493
+
494
+ ### From PyPI
495
+
496
+ ```bash
497
+ pip install rapid_textrank
498
+ ```
499
+
500
+ Import name is `rapid_textrank`.
501
+
502
+ ### With spaCy Support
503
+
504
+ ```bash
505
+ pip install rapid_textrank[spacy]
506
+ ```
507
+
508
+ ```python
509
+ import spacy
510
+ import rapid_textrank.spacy_component # registers the pipeline factory
511
+
512
+ nlp = spacy.load("en_core_web_sm")
513
+ nlp.add_pipe("rapid_textrank")
514
+
515
+ doc = nlp("Machine learning is a subset of artificial intelligence.")
516
+ for phrase in doc._.phrases[:5]:
517
+ print(f"{phrase.text}: {phrase.score:.4f}")
518
+ ```
519
+
520
+ ### From Source
521
+
522
+ Requirements: Rust 1.70+, Python 3.9+
523
+
524
+ ```bash
525
+ git clone https://github.com/xang1234/rapid-textrank
526
+ cd rapid-textrank
527
+ pip install maturin
528
+ maturin develop --release
529
+ ```
530
+
531
+ ### Development Setup
532
+
533
+ ```bash
534
+ # Install with dev dependencies
535
+ pip install -e ".[dev]"
536
+
537
+ # Run tests
538
+ pytest
539
+
540
+ # Run Rust tests
541
+ cargo test
542
+ ```
543
+
544
+ ## Publishing
545
+
546
+ Publishing is automated with GitHub Actions using Trusted Publishing (OIDC), so no API tokens are stored.
547
+
548
+ TestPyPI release (push a tag):
549
+
550
+ ```bash
551
+ git tag -a test-0.1.0 -m "TestPyPI 0.1.0"
552
+ git push origin test-0.1.0
553
+ ```
554
+
555
+ Tag pattern: `test-*`
556
+
557
+ PyPI release (push a tag):
558
+
559
+ ```bash
560
+ git tag -a v0.1.0 -m "Release 0.1.0"
561
+ git push origin v0.1.0
562
+ ```
563
+
564
+ Tag pattern: `v*`
565
+
566
+ Wheel builds
567
+
568
+ GitHub Actions builds wheels for Python 3.9–3.12 on Linux, macOS, and Windows.
569
+
570
+ Before the first publish, add Trusted Publishers on TestPyPI and PyPI:
571
+
572
+ - Repo: `xang1234/rapid-textrank`
573
+ - Workflows: `.github/workflows/publish-testpypi.yml` and `.github/workflows/publish-pypi.yml`
574
+ - Environments: `testpypi` and `pypi`
575
+
576
+ You can also trigger either workflow manually via GitHub Actions if needed.
577
+
578
+ ## License
579
+
580
+ MIT License - see [LICENSE](LICENSE) for details.
581
+
582
+ ## Citation
583
+
584
+ If you use rapid_textrank in research, please cite the original TextRank paper:
585
+
586
+ ```bibtex
587
+ @inproceedings{mihalcea-tarau-2004-textrank,
588
+ title = "{T}ext{R}ank: Bringing Order into Texts",
589
+ author = "Mihalcea, Rada and Tarau, Paul",
590
+ booktitle = "Proceedings of EMNLP 2004",
591
+ year = "2004",
592
+ publisher = "Association for Computational Linguistics",
593
+ }
594
+ ```
595
+
596
+ For PositionRank:
597
+
598
+ ```bibtex
599
+ @inproceedings{florescu-caragea-2017-positionrank,
600
+ title = "{P}osition{R}ank: An Unsupervised Approach to Keyphrase Extraction from Scholarly Documents",
601
+ author = "Florescu, Corina and Caragea, Cornelia",
602
+ booktitle = "Proceedings of ACL 2017",
603
+ year = "2017",
604
+ }
605
+ ```
606
+
@@ -0,0 +1,7 @@
1
+ rapid_textrank\__init__.py,sha256=Wy5toY0yQbI43q7MgonlRLotePsvlqCEheU9cgdXD4Y,1468
2
+ rapid_textrank\_rust.cp39-win_amd64.pyd,sha256=_yTYq-V7V4v5408xF3ioUFvG46UbkuHtJ2h0eDm2PtM,913408
3
+ rapid_textrank\spacy_component.py,sha256=ys9InEe3mjjt4a7kKKYZwyDHhfPpgwxbKR4IDM_aJp0,7748
4
+ rapid_textrank-0.1.0.dist-info\METADATA,sha256=etaKPMvwUvEgrCrf33OujpRdMqKIVj-Rns9onauBp1g,20224
5
+ rapid_textrank-0.1.0.dist-info\WHEEL,sha256=H5klTgXu3iVXpFbMzUkXja9m3gL244ExCR0k1sRMImo,95
6
+ rapid_textrank-0.1.0.dist-info\licenses\LICENSE,sha256=c4x10XhH9H6ZyODSE4qpBstV7JDc3fb0adKMKCNEa2Q,1101
7
+ rapid_textrank-0.1.0.dist-info\RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.11.5)
3
+ Root-Is-Purelib: false
4
+ Tag: cp39-cp39-win_amd64
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 TextRanker Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.