rapid-textrank 0.0.1__cp314-cp314-macosx_10_12_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,53 @@
1
+ """
2
+ rapid_textrank - High-performance TextRank implementation
3
+
4
+ A fast TextRank implementation in Rust with Python bindings,
5
+ providing keyword extraction and text summarization.
6
+ """
7
+
8
+ from rapid_textrank._rust import (
9
+ __version__,
10
+ Phrase,
11
+ TextRankResult,
12
+ TextRankConfig,
13
+ BaseTextRank,
14
+ PositionRank,
15
+ BiasedTextRank,
16
+ extract_from_json,
17
+ extract_batch_from_json,
18
+ )
19
+
20
+ __all__ = [
21
+ "__version__",
22
+ "Phrase",
23
+ "TextRankResult",
24
+ "TextRankConfig",
25
+ "BaseTextRank",
26
+ "PositionRank",
27
+ "BiasedTextRank",
28
+ "extract_from_json",
29
+ "extract_batch_from_json",
30
+ ]
31
+
32
+
33
def extract_keywords(text: str, top_n: int = 10, language: str = "en") -> list:
    """Convenience wrapper around :class:`BaseTextRank` for one-off extraction.

    A fresh extractor is constructed on every call, so prefer the
    class-based API when processing many documents with identical settings.

    Args:
        text: Raw input text to mine for keyphrases.
        top_n: Maximum number of phrases to return.
        language: ISO language code used for stopword filtering.

    Returns:
        List of Phrase objects carrying text, score, and rank.

    Example:
        >>> from rapid_textrank import extract_keywords
        >>> phrases = extract_keywords("Machine learning is a subset of AI.")
        >>> for phrase in phrases:
        ...     print(f"{phrase.text}: {phrase.score:.4f}")
    """
    ranker = BaseTextRank(top_n=top_n, language=language)
    extraction = ranker.extract_keywords(text)
    return list(extraction.phrases)
@@ -0,0 +1,239 @@
1
+ """
2
+ spaCy pipeline component for rapid_textrank.
3
+
4
+ This module provides a spaCy pipeline component that uses rapid_textrank
5
+ for keyword extraction. It can be used as a drop-in replacement for
6
+ pytextrank with significantly better performance.
7
+
8
+ Example:
9
+ >>> import spacy
10
+ >>> from rapid_textrank.spacy_component import RustTextRank
11
+ >>>
12
+ >>> nlp = spacy.load("en_core_web_sm")
13
+ >>> nlp.add_pipe("rapid_textrank")
14
+ >>>
15
+ >>> doc = nlp("Machine learning is a subset of artificial intelligence.")
16
+ >>> for phrase in doc._.phrases[:5]:
17
+ ... print(f"{phrase.text}: {phrase.score:.4f}")
18
+ """
19
+
20
+ from typing import List, Optional, Dict, Any
21
+ import json
22
+
23
# spaCy is an optional dependency: probe for it once at import time and record
# the outcome so the rest of the module can branch on SPACY_AVAILABLE.
try:
    from spacy.tokens import Doc
    from spacy.language import Language

    SPACY_AVAILABLE = True
except ImportError:
    SPACY_AVAILABLE = False
    # Placeholders keep the names defined so annotations elsewhere in this
    # module still resolve when spaCy is absent.
    Doc = None
    Language = None

from rapid_textrank._rust import extract_from_json
34
+
35
+
36
class Phrase:
    """A single keyphrase produced by RustTextRank.

    Mirrors pytextrank's ``Phrase`` interface so downstream code written
    against pytextrank keeps working unchanged.
    """

    def __init__(self, text: str, lemma: str, score: float, count: int, rank: int):
        # Core attributes reported by the Rust extractor.
        self.text = text
        self.lemma = lemma
        self.score = score
        self.count = count
        self.rank = rank
        # pytextrank exposes matched span chunks; we have none available
        # here, so keep an always-empty list for interface compatibility.
        self.chunks = []

    def __repr__(self) -> str:
        return "Phrase(text='{}', score={:.4f}, rank={})".format(
            self.text, self.score, self.rank
        )

    def __str__(self) -> str:
        return self.text
57
+
58
+
59
class RustTextRankResult:
    """Container bundling extracted phrases with PageRank convergence info."""

    def __init__(self, phrases: List[Phrase], converged: bool, iterations: int):
        # Ranked Phrase objects, best first; convergence metadata alongside.
        self.phrases = phrases
        self.converged = converged
        self.iterations = iterations

    def __len__(self) -> int:
        # Sizing delegates straight to the phrase list.
        return len(self.phrases)

    def __iter__(self):
        # Iterating the result walks its phrases directly.
        return iter(self.phrases)
72
+
73
+
74
# The spaCy integration is only defined when spaCy imported successfully;
# otherwise a stub class of the same name raises a helpful ImportError
# (see the `else` branch at the bottom).
if SPACY_AVAILABLE:

    @Language.factory(
        "rapid_textrank",
        # Defaults mirror RustTextRank.__init__ so a bare
        # `nlp.add_pipe("rapid_textrank")` works with no explicit config.
        default_config={
            "damping": 0.85,
            "max_iterations": 100,
            "convergence_threshold": 1e-6,
            "window_size": 4,
            "top_n": 10,
            "min_phrase_length": 1,
            "max_phrase_length": 4,
            "score_aggregation": "sum",
        },
    )
    def create_rapid_textrank(
        nlp: Language,
        name: str,
        damping: float,
        max_iterations: int,
        convergence_threshold: float,
        window_size: int,
        top_n: int,
        min_phrase_length: int,
        max_phrase_length: int,
        score_aggregation: str,
    ):
        """Factory called by spaCy to construct a RustTextRank component."""
        return RustTextRank(
            nlp=nlp,
            name=name,
            damping=damping,
            max_iterations=max_iterations,
            convergence_threshold=convergence_threshold,
            window_size=window_size,
            top_n=top_n,
            min_phrase_length=min_phrase_length,
            max_phrase_length=max_phrase_length,
            score_aggregation=score_aggregation,
        )

    class RustTextRank:
        """
        spaCy pipeline component for TextRank keyword extraction.

        This component uses the Rust implementation for fast extraction
        while integrating seamlessly with spaCy's NLP pipeline. Results
        are exposed via the ``doc._.phrases`` and ``doc._.textrank_result``
        custom extensions registered in ``__init__``.

        Example:
            >>> import spacy
            >>> nlp = spacy.load("en_core_web_sm")
            >>> nlp.add_pipe("rapid_textrank")
            >>> doc = nlp("Machine learning is transforming industries.")
            >>> for phrase in doc._.phrases:
            ...     print(phrase.text, phrase.score)
        """

        def __init__(
            self,
            nlp: Language,
            name: str = "rapid_textrank",
            damping: float = 0.85,
            max_iterations: int = 100,
            convergence_threshold: float = 1e-6,
            window_size: int = 4,
            top_n: int = 10,
            min_phrase_length: int = 1,
            max_phrase_length: int = 4,
            score_aggregation: str = "sum",
        ):
            self.nlp = nlp
            self.name = name
            # Forwarded verbatim to the Rust extractor on every __call__;
            # it is also what to_disk/from_disk serialize.
            self.config = {
                "damping": damping,
                "max_iterations": max_iterations,
                "convergence_threshold": convergence_threshold,
                "window_size": window_size,
                "top_n": top_n,
                "min_phrase_length": min_phrase_length,
                "max_phrase_length": max_phrase_length,
                "score_aggregation": score_aggregation,
            }

            # Register custom extensions. Guarded so that re-adding the
            # component (or having multiple instances) does not raise on a
            # duplicate registration.
            if not Doc.has_extension("phrases"):
                Doc.set_extension("phrases", default=[])
            if not Doc.has_extension("textrank_result"):
                Doc.set_extension("textrank_result", default=None)

        def __call__(self, doc: Doc) -> Doc:
            """Process a spaCy Doc, storing keyphrases on ``doc._``."""
            # Convert spaCy tokens to the JSON shape the Rust core expects;
            # shipping pre-tokenized text avoids re-tokenizing in Rust.
            tokens = []
            for sent_idx, sent in enumerate(doc.sents):
                for token in sent:
                    tokens.append(
                        {
                            "text": token.text,
                            "lemma": token.lemma_,
                            "pos": token.pos_,
                            # Character offsets into the original text.
                            "start": token.idx,
                            "end": token.idx + len(token.text),
                            "sentence_idx": sent_idx,
                            "token_idx": token.i,
                            # spaCy decides stopword-hood, not the Rust side.
                            "is_stopword": token.is_stop,
                        }
                    )

            # Create JSON input
            json_input = json.dumps({"tokens": tokens, "config": self.config})

            # Extract keyphrases using Rust
            json_output = extract_from_json(json_input)
            result = json.loads(json_output)

            # Convert to Phrase objects
            phrases = [
                Phrase(
                    text=p["text"],
                    lemma=p["lemma"],
                    score=p["score"],
                    count=p["count"],
                    rank=p["rank"],
                )
                for p in result["phrases"]
            ]

            # Store results on the custom extensions registered in __init__.
            doc._.phrases = phrases
            doc._.textrank_result = RustTextRankResult(
                phrases=phrases,
                converged=result["converged"],
                iterations=result["iterations"],
            )

            return doc

        def to_disk(self, path, **kwargs):
            """Save component configuration to disk (spaCy serialization hook)."""
            # NOTE(review): these local imports shadow the module-level `json`
            # import; harmless but redundant.
            import json
            from pathlib import Path

            config_path = Path(path) / "config.json"
            with open(config_path, "w") as f:
                json.dump(self.config, f)

        def from_disk(self, path, **kwargs):
            """Load component configuration from disk (spaCy serialization hook)."""
            import json
            from pathlib import Path

            config_path = Path(path) / "config.json"
            with open(config_path, "r") as f:
                self.config = json.load(f)
            return self

else:
    # Fallback when spaCy is not available
    class RustTextRank:
        """Placeholder when spaCy is not installed."""

        def __init__(self, *args, **kwargs):
            # Fail loudly at construction time with an actionable message.
            raise ImportError(
                "spaCy is required for the RustTextRank pipeline component. "
                "Install it with: pip install spacy"
            )
@@ -0,0 +1,587 @@
1
+ Metadata-Version: 2.4
2
+ Name: rapid_textrank
3
+ Version: 0.0.1
4
+ Classifier: Development Status :: 4 - Beta
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: Intended Audience :: Science/Research
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.9
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Rust
14
+ Classifier: Operating System :: MacOS :: MacOS X
15
+ Classifier: Operating System :: Microsoft :: Windows
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Text Processing :: Linguistic
19
+ Requires-Dist: pytest>=7.0 ; extra == 'dev'
20
+ Requires-Dist: pytest-benchmark ; extra == 'dev'
21
+ Requires-Dist: spacy>=3.0 ; extra == 'dev'
22
+ Requires-Dist: spacy>=3.0 ; extra == 'spacy'
23
+ Provides-Extra: dev
24
+ Provides-Extra: spacy
25
+ License-File: LICENSE
26
+ Summary: High-performance TextRank implementation with Python bindings
27
+ Keywords: nlp,textrank,keyword-extraction,summarization,pagerank
28
+ Author: TextRanker Contributors
29
+ License-Expression: MIT
30
+ Requires-Python: >=3.9
31
+ Description-Content-Type: text/markdown
32
+ Project-URL: Homepage, https://github.com/xang1234/rapid-textrank
33
+ Project-URL: Issues, https://github.com/xang1234/rapid-textrank/issues
34
+ Project-URL: Repository, https://github.com/xang1234/rapid-textrank
35
+
36
+ # rapid_textrank
37
+
38
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
39
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
40
+ [![Rust](https://img.shields.io/badge/rust-2021-orange.svg)](https://www.rust-lang.org/)
41
+
42
+ **High-performance TextRank implementation in Rust with Python bindings.**
43
+
44
+ Extract keywords and key phrases from text 10-100x faster than pure Python implementations, with support for multiple algorithm variants and 18 languages.
45
+
46
+ ## Features
47
+
48
+ - **Fast**: 10-100x faster than pure Python implementations
49
+ - **Multiple algorithms**: TextRank, PositionRank, and BiasedTextRank variants
50
+ - **Unicode-aware**: Proper handling of CJK, emoji, and other scripts
51
+ - **Multi-language**: Stopword support for 18 languages
52
+ - **Dual API**: Native Python classes + JSON interface for batch processing
53
+ - **Zero Python overhead**: Computation happens entirely in Rust (no GIL)
54
+
55
+ ## Quick Start
56
+
57
+ ```bash
58
+ pip install rapid_textrank
59
+ ```
60
+
61
+ ```python
62
+ from rapid_textrank import extract_keywords
63
+
64
+ text = """
65
+ Machine learning is a subset of artificial intelligence that enables
66
+ systems to learn and improve from experience. Deep learning, a type of
67
+ machine learning, uses neural networks with many layers.
68
+ """
69
+
70
+ keywords = extract_keywords(text, top_n=5, language="en")
71
+ for phrase in keywords:
72
+ print(f"{phrase.text}: {phrase.score:.4f}")
73
+ ```
74
+
75
+ Output:
76
+ ```
77
+ machine learning: 0.2341
78
+ deep learning: 0.1872
79
+ artificial intelligence: 0.1654
80
+ neural networks: 0.1432
81
+ systems: 0.0891
82
+ ```
83
+
84
+ ## How TextRank Works
85
+
86
+ TextRank is a graph-based ranking algorithm for keyword extraction, inspired by Google's PageRank.
87
+
88
+ ### The Algorithm
89
+
90
+ 1. **Build a co-occurrence graph**: Words become nodes. An edge connects two words if they appear within a sliding window (default: 4 words).
91
+
92
+ 2. **Run PageRank**: The algorithm iteratively distributes "importance" through the graph. Words connected to many important words become important themselves.
93
+
94
+ 3. **Extract phrases**: Adjacent high-scoring words are combined into key phrases. Scores are aggregated (sum, mean, or max).
95
+
96
+ ```
97
+ Text: "Machine learning enables systems to learn from data"
98
+
99
+ Co-occurrence graph (window=2):
100
+ machine ←→ learning ←→ enables ←→ systems ←→ learn ←→ data
101
+
102
+ PageRank
103
+
104
+ Scores: machine(0.23) learning(0.31) enables(0.12) ...
105
+
106
+ Phrase extraction
107
+
108
+ "machine learning" (0.54), "systems" (0.18), ...
109
+ ```
110
+
111
+ ### Further Reading
112
+
113
+ - [TextRank: Bringing Order into Texts](https://aclanthology.org/W04-3252/) (Mihalcea & Tarau, 2004)
114
+ - [PositionRank: An Unsupervised Approach to Keyphrase Extraction](https://aclanthology.org/P17-1102/) (Florescu & Caragea, 2017)
115
+ - [BiasedTextRank: Unsupervised Graph-Based Content Extraction](https://aclanthology.org/2020.coling-main.144/) (Kazemi et al., 2020)
116
+
117
+ ## Algorithm Variants
118
+
119
+ | Variant | Best For | Description |
120
+ |---------|----------|-------------|
121
+ | `BaseTextRank` | General text | Standard TextRank implementation |
122
+ | `PositionRank` | Academic papers, news | Favors words appearing early in the document |
123
+ | `BiasedTextRank` | Topic-focused extraction | Biases results toward specified focus terms |
124
+
125
+ ### PositionRank
126
+
127
+ Weights words by their position—earlier appearances score higher. Useful for documents where key information appears in titles, abstracts, or opening paragraphs.
128
+
129
+ ```python
130
+ from rapid_textrank import PositionRank
131
+
132
+ extractor = PositionRank(top_n=10)
133
+ result = extractor.extract_keywords("""
134
+ Quantum Computing Advances in 2024
135
+
136
+ Researchers have made significant breakthroughs in quantum error correction.
137
+ The quantum computing field continues to evolve rapidly...
138
+ """)
139
+
140
+ # "quantum computing" and "quantum" will rank higher due to early position
141
+ ```
142
+
143
+ ### BiasedTextRank
144
+
145
+ Steers extraction toward specific topics using focus terms. The `bias_weight` parameter controls how strongly results favor the focus terms.
146
+
147
+ ```python
148
+ from rapid_textrank import BiasedTextRank
149
+
150
+ extractor = BiasedTextRank(
151
+ focus_terms=["security", "privacy"],
152
+ bias_weight=5.0, # Higher = stronger bias
153
+ top_n=10
154
+ )
155
+
156
+ result = extractor.extract_keywords("""
157
+ Modern web applications must balance user experience with security.
158
+ Privacy regulations require careful data handling. Performance
159
+ optimizations should not compromise security measures.
160
+ """)
161
+
162
+ # Results will favor security/privacy-related phrases
163
+ ```
164
+
165
+ ## API Reference
166
+
167
+ ### Convenience Function
168
+
169
+ The simplest way to extract keywords:
170
+
171
+ ```python
172
+ from rapid_textrank import extract_keywords
173
+
174
+ phrases = extract_keywords(
175
+ text, # Input text
176
+ top_n=10, # Number of keywords to return
177
+ language="en" # Language for stopword filtering
178
+ )
179
+ ```
180
+
181
+ ### Class-Based API
182
+
183
+ For more control, use the extractor classes:
184
+
185
+ ```python
186
+ from rapid_textrank import BaseTextRank, PositionRank, BiasedTextRank
187
+
188
+ # Standard TextRank
189
+ extractor = BaseTextRank(top_n=10, language="en")
190
+ result = extractor.extract_keywords(text)
191
+
192
+ # Position-weighted
193
+ extractor = PositionRank(top_n=10, language="en")
194
+ result = extractor.extract_keywords(text)
195
+
196
+ # Topic-biased
197
+ extractor = BiasedTextRank(
198
+ focus_terms=["machine", "learning"],
199
+ bias_weight=5.0,
200
+ top_n=10,
201
+ language="en"
202
+ )
203
+ result = extractor.extract_keywords(text)
204
+
205
+ # You can also pass focus_terms per-call
206
+ result = extractor.extract_keywords(text, focus_terms=["neural", "network"])
207
+ ```
208
+
209
+ ### Configuration
210
+
211
+ Fine-tune the algorithm with `TextRankConfig`:
212
+
213
+ ```python
214
+ from rapid_textrank import TextRankConfig, BaseTextRank
215
+
216
+ config = TextRankConfig(
217
+ damping=0.85, # PageRank damping factor (0-1)
218
+ max_iterations=100, # Maximum PageRank iterations
219
+ convergence_threshold=1e-6,# Convergence threshold
220
+ window_size=4, # Co-occurrence window size
221
+ top_n=10, # Number of results
222
+ min_phrase_length=1, # Minimum words in a phrase
223
+ max_phrase_length=4, # Maximum words in a phrase
224
+ score_aggregation="sum", # How to combine word scores: "sum", "mean", "max", "rms"
225
+ language="en" # Language for stopwords
226
+ )
227
+
228
+ extractor = BaseTextRank(config=config)
229
+ ```
230
+
231
+ ### Result Objects
232
+
233
+ ```python
234
+ result = extractor.extract_keywords(text)
235
+
236
+ # TextRankResult attributes
237
+ result.phrases # List of Phrase objects
238
+ result.converged # Whether PageRank converged
239
+ result.iterations # Number of iterations run
240
+
241
+ # Phrase attributes
242
+ for phrase in result.phrases:
243
+ phrase.text # The phrase text (e.g., "machine learning")
244
+ phrase.lemma # Lemmatized form
245
+ phrase.score # TextRank score
246
+ phrase.count # Occurrences in text
247
+ phrase.rank # 1-indexed rank
248
+
249
+ # Convenience method
250
+ tuples = result.as_tuples() # [(text, score), ...]
251
+ ```
252
+
253
+ ### JSON Interface
254
+
255
+ For processing large documents or integrating with spaCy, use the JSON interface. This accepts pre-tokenized data to avoid re-tokenizing in Rust.
256
+
257
+ ```python
258
+ from rapid_textrank import extract_from_json, extract_batch_from_json
259
+ import json
260
+
261
+ # Single document
262
+ doc = {
263
+ "tokens": [
264
+ {
265
+ "text": "Machine",
266
+ "lemma": "machine",
267
+ "pos": "NOUN",
268
+ "start": 0,
269
+ "end": 7,
270
+ "sentence_idx": 0,
271
+ "token_idx": 0,
272
+ "is_stopword": False
273
+ },
274
+ # ... more tokens
275
+ ],
276
+ "config": {"top_n": 10}
277
+ }
278
+
279
+ result_json = extract_from_json(json.dumps(doc))
280
+ result = json.loads(result_json)
281
+
282
+ # Batch processing (parallel in Rust)
283
+ docs = [doc1, doc2, doc3]
284
+ results_json = extract_batch_from_json(json.dumps(docs))
285
+ results = json.loads(results_json)
286
+ ```
287
+
288
+ ## Supported Languages
289
+
290
+ Stopword filtering is available for 18 languages:
291
+
292
+ | Code | Language | Code | Language | Code | Language |
293
+ |------|----------|------|----------|------|----------|
294
+ | `en` | English | `de` | German | `fr` | French |
295
+ | `es` | Spanish | `it` | Italian | `pt` | Portuguese |
296
+ | `nl` | Dutch | `ru` | Russian | `sv` | Swedish |
297
+ | `no` | Norwegian | `da` | Danish | `fi` | Finnish |
298
+ | `hu` | Hungarian | `tr` | Turkish | `pl` | Polish |
299
+ | `ar` | Arabic | `zh` | Chinese | `ja` | Japanese |
300
+
301
+ ## Performance
302
+
303
+ rapid_textrank achieves significant speedups through Rust's performance characteristics and careful algorithm implementation.
304
+
305
+ ### Benchmark Script
306
+
307
+ Run this script to compare performance on your hardware:
308
+
309
+ ```python
310
+ """
311
+ Benchmark: rapid_textrank vs pytextrank
312
+
313
+ Prerequisites:
314
+ pip install rapid_textrank pytextrank spacy
315
+ python -m spacy download en_core_web_sm
316
+ """
317
+
318
+ import time
319
+ import statistics
320
+
321
+ # Sample texts of varying sizes
322
+ TEXTS = {
323
+ "small": """
324
+ Machine learning is a subset of artificial intelligence.
325
+ Deep learning uses neural networks with many layers.
326
+ """,
327
+
328
+ "medium": """
329
+ Natural language processing (NLP) is a field of artificial intelligence
330
+ that focuses on the interaction between computers and humans through
331
+ natural language. The ultimate goal of NLP is to enable computers to
332
+ understand, interpret, and generate human language in a valuable way.
333
+
334
+ Machine learning approaches have transformed NLP in recent years.
335
+ Deep learning models, particularly transformers, have achieved
336
+ state-of-the-art results on many NLP tasks including translation,
337
+ summarization, and question answering.
338
+
339
+ Key applications include sentiment analysis, named entity recognition,
340
+ machine translation, and text classification. These technologies
341
+ power virtual assistants, search engines, and content recommendation
342
+ systems used by millions of people daily.
343
+ """,
344
+
345
+ "large": """
346
+ Artificial intelligence has evolved dramatically since its inception in
347
+ the mid-20th century. Early AI systems relied on symbolic reasoning and
348
+ expert systems, where human knowledge was manually encoded into rules.
349
+
350
+ The machine learning revolution changed everything. Instead of explicit
351
+ programming, systems learn patterns from data. Supervised learning uses
352
+ labeled examples, unsupervised learning finds hidden structures, and
353
+ reinforcement learning optimizes through trial and error.
354
+
355
+ Deep learning, powered by neural networks with multiple layers, has
356
+ achieved remarkable success. Convolutional neural networks excel at
357
+ image recognition. Recurrent neural networks and transformers handle
358
+ sequential data like text and speech. Generative adversarial networks
359
+ create realistic synthetic content.
360
+
361
+ Natural language processing has been transformed by these advances.
362
+ Word embeddings capture semantic relationships. Attention mechanisms
363
+ allow models to focus on relevant context. Large language models
364
+ demonstrate emergent capabilities in reasoning and generation.
365
+
366
+ Computer vision applications include object detection, facial recognition,
367
+ medical image analysis, and autonomous vehicle perception. These systems
368
+ process visual information with superhuman accuracy in many domains.
369
+
370
+ The ethical implications of AI are significant. Bias in training data
371
+ can lead to unfair outcomes. Privacy concerns arise from data collection.
372
+ Job displacement affects workers across industries. Regulation and
373
+ governance frameworks are being developed worldwide.
374
+
375
+ Future directions include neuromorphic computing, quantum machine learning,
376
+ and artificial general intelligence. Researchers continue to push
377
+ boundaries while addressing safety and alignment challenges.
378
+ """ * 3 # ~1000 words
379
+ }
380
+
381
+
382
+ def benchmark_rapid_textrank(text: str, runs: int = 10) -> dict:
383
+ """Benchmark rapid_textrank."""
384
+ from rapid_textrank import BaseTextRank
385
+
386
+ extractor = BaseTextRank(top_n=10, language="en")
387
+
388
+ # Warmup
389
+ extractor.extract_keywords(text)
390
+
391
+ times = []
392
+ for _ in range(runs):
393
+ start = time.perf_counter()
394
+ result = extractor.extract_keywords(text)
395
+ elapsed = time.perf_counter() - start
396
+ times.append(elapsed * 1000) # Convert to ms
397
+
398
+ return {
399
+ "min": min(times),
400
+ "mean": statistics.mean(times),
401
+ "median": statistics.median(times),
402
+ "std": statistics.stdev(times) if len(times) > 1 else 0,
403
+ "phrases": len(result.phrases)
404
+ }
405
+
406
+
407
+ def benchmark_pytextrank(text: str, runs: int = 10) -> dict:
408
+ """Benchmark pytextrank with spaCy."""
409
+ import spacy
410
+ import pytextrank
411
+
412
+ nlp = spacy.load("en_core_web_sm")
413
+ nlp.add_pipe("textrank")
414
+
415
+ # Warmup
416
+ doc = nlp(text)
417
+
418
+ times = []
419
+ for _ in range(runs):
420
+ start = time.perf_counter()
421
+ doc = nlp(text)
422
+ phrases = list(doc._.phrases[:10])
423
+ elapsed = time.perf_counter() - start
424
+ times.append(elapsed * 1000)
425
+
426
+ return {
427
+ "min": min(times),
428
+ "mean": statistics.mean(times),
429
+ "median": statistics.median(times),
430
+ "std": statistics.stdev(times) if len(times) > 1 else 0,
431
+ "phrases": len(phrases)
432
+ }
433
+
434
+
435
+ def main():
436
+ print("=" * 70)
437
+ print("TextRank Performance Benchmark")
438
+ print("=" * 70)
439
+
440
+ for size, text in TEXTS.items():
441
+ word_count = len(text.split())
442
+ print(f"\n{size.upper()} TEXT (~{word_count} words)")
443
+ print("-" * 50)
444
+
445
+ # Benchmark rapid_textrank
446
+ rust_results = benchmark_rapid_textrank(text)
447
+ print(f"rapid_textrank: {rust_results['mean']:>8.2f} ms (±{rust_results['std']:.2f})")
448
+
449
+ # Benchmark pytextrank
450
+ try:
451
+ py_results = benchmark_pytextrank(text)
452
+ print(f"pytextrank: {py_results['mean']:>8.2f} ms (±{py_results['std']:.2f})")
453
+
454
+ speedup = py_results['mean'] / rust_results['mean']
455
+ print(f"Speedup: {speedup:>8.1f}x faster")
456
+ except Exception as e:
457
+ print(f"pytextrank: (not available: {e})")
458
+
459
+ print("\n" + "=" * 70)
460
+ print("Note: pytextrank times include spaCy tokenization.")
461
+ print("For fair comparison with pre-tokenized input, use rapid_textrank's JSON API.")
462
+ print("=" * 70)
463
+
464
+
465
+ if __name__ == "__main__":
466
+ main()
467
+ ```
468
+
469
+ ### Why Rust is Fast
470
+
471
+ The performance advantage comes from several factors:
472
+
473
+ 1. **CSR Graph Format**: The co-occurrence graph uses Compressed Sparse Row format, enabling cache-friendly memory access during PageRank iteration.
474
+
475
+ 2. **String Interning**: Repeated words share a single allocation via `StringPool`, reducing memory usage 10-100x for typical documents.
476
+
477
+ 3. **Parallel Processing**: Rayon provides data parallelism for batch processing without explicit thread management.
478
+
479
+ 4. **Link-Time Optimization (LTO)**: Release builds use full LTO with single codegen unit for maximum inlining.
480
+
481
+ 5. **No GIL**: All computation happens in Rust. Python's Global Interpreter Lock is released during extraction.
482
+
483
+ 6. **FxHash**: Fast non-cryptographic hashing for internal hash maps.
484
+
485
+ ## Installation
486
+
487
+ ### From PyPI
488
+
489
+ ```bash
490
+ pip install rapid_textrank
491
+ ```
492
+
493
+ Import name is `rapid_textrank`.
494
+
495
+ ### With spaCy Support
496
+
497
+ ```bash
498
+ pip install rapid_textrank[spacy]
499
+ ```
500
+
501
+ ### From Source
502
+
503
+ Requirements: Rust 1.70+, Python 3.9+
504
+
505
+ ```bash
506
+ git clone https://github.com/xang1234/rapid-textrank
507
+ cd rapid-textrank
508
+ pip install maturin
509
+ maturin develop --release
510
+ ```
511
+
512
+ ### Development Setup
513
+
514
+ ```bash
515
+ # Install with dev dependencies
516
+ pip install -e ".[dev]"
517
+
518
+ # Run tests
519
+ pytest
520
+
521
+ # Run Rust tests
522
+ cargo test
523
+ ```
524
+
525
+ ## Publishing
526
+
527
+ Publishing is automated with GitHub Actions using Trusted Publishing (OIDC), so no API tokens are stored.
528
+
529
+ TestPyPI release (push a tag):
530
+
531
+ ```bash
532
+ git tag -a test-0.1.0 -m "TestPyPI 0.1.0"
533
+ git push origin test-0.1.0
534
+ ```
535
+
536
+ Tag pattern: `test-*`
537
+
538
+ PyPI release (push a tag):
539
+
540
+ ```bash
541
+ git tag -a v0.1.0 -m "Release 0.1.0"
542
+ git push origin v0.1.0
543
+ ```
544
+
545
+ Tag pattern: `v*`
546
+
547
+ Wheel builds
548
+
549
+ GitHub Actions builds wheels for Python 3.9–3.12 on Linux, macOS, and Windows.
550
+
551
+ Before the first publish, add Trusted Publishers on TestPyPI and PyPI:
552
+
553
+ - Repo: `xang1234/textranker`
554
+ - Workflows: `.github/workflows/publish-testpypi.yml` and `.github/workflows/publish-pypi.yml`
555
+ - Environments: `testpypi` and `pypi`
556
+
557
+ You can also trigger either workflow manually via GitHub Actions if needed.
558
+
559
+ ## License
560
+
561
+ MIT License - see [LICENSE](LICENSE) for details.
562
+
563
+ ## Citation
564
+
565
+ If you use rapid_textrank in research, please cite the original TextRank paper:
566
+
567
+ ```bibtex
568
+ @inproceedings{mihalcea-tarau-2004-textrank,
569
+ title = "{T}ext{R}ank: Bringing Order into Text",
570
+ author = "Mihalcea, Rada and Tarau, Paul",
571
+ booktitle = "Proceedings of EMNLP 2004",
572
+ year = "2004",
573
+ publisher = "Association for Computational Linguistics",
574
+ }
575
+ ```
576
+
577
+ For PositionRank:
578
+
579
+ ```bibtex
580
+ @inproceedings{florescu-caragea-2017-positionrank,
581
+ title = "{P}osition{R}ank: An Unsupervised Approach to Keyphrase Extraction from Scholarly Documents",
582
+ author = "Florescu, Corina and Caragea, Cornelia",
583
+ booktitle = "Proceedings of ACL 2017",
584
+ year = "2017",
585
+ }
586
+ ```
587
+
@@ -0,0 +1,7 @@
1
+ rapid_textrank/__init__.py,sha256=o_pRt43a7RqoeenKu0Gg1EOg3zR88U29u7C5uz228ZM,1373
2
+ rapid_textrank/_rust.cpython-314-darwin.so,sha256=UFhljfGAKMkXLD7xjwU8UP3NLFao7zfS5-mDtu7f32o,1108680
3
+ rapid_textrank/spacy_component.py,sha256=yYOR-2sP9uxg_YhNHPYPjamt2meCGBOM1_mYbjh6pnY,7509
4
+ rapid_textrank-0.0.1.dist-info/METADATA,sha256=N0HQLk2FkzTYVxUGM2_vmpAWnP04HcXFYGIyVvN3jwE,18686
5
+ rapid_textrank-0.0.1.dist-info/WHEEL,sha256=jyP0hJCe-fSX_gEscesIqqW7KerDJw7iyldGx-__w10,107
6
+ rapid_textrank-0.0.1.dist-info/licenses/LICENSE,sha256=yWeokG20y7cdx3UXBPoQgILIFjkz-ODLh3NElIXgGUA,1080
7
+ rapid_textrank-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.11.5)
3
+ Root-Is-Purelib: false
4
+ Tag: cp314-cp314-macosx_10_12_x86_64
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 TextRanker Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.