rapid-textrank 0.0.1__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. rapid_textrank-0.1.1/.beads/issues.jsonl +0 -0
  2. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/.gitignore +3 -0
  3. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/Cargo.lock +1 -1
  4. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/Cargo.toml +1 -1
  5. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/PKG-INFO +38 -15
  6. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/README.md +37 -14
  7. rapid_textrank-0.1.1/notebooks/01_quickstart.ipynb +482 -0
  8. rapid_textrank-0.1.1/notebooks/02_algorithm_variants.ipynb +710 -0
  9. rapid_textrank-0.1.1/notebooks/03_explain_algorithm.ipynb +725 -0
  10. rapid_textrank-0.1.1/notebooks/04_benchmarks.ipynb +981 -0
  11. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/pyproject.toml +1 -1
  12. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/python/rapid_textrank/__init__.py +2 -0
  13. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/python/rapid_textrank/spacy_component.py +29 -2
  14. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/graph/builder.rs +32 -15
  15. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/nlp/stopwords.rs +35 -0
  16. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/nlp/tokenizer.rs +28 -0
  17. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/phrase/chunker.rs +11 -3
  18. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/phrase/extraction.rs +75 -34
  19. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/python/json.rs +71 -15
  20. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/python/mod.rs +1 -0
  21. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/python/native.rs +35 -6
  22. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/types.rs +70 -2
  23. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/variants/biased_textrank.rs +32 -11
  24. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/variants/position_rank.rs +2 -1
  25. rapid_textrank-0.0.1/.beads/issues.jsonl +0 -15
  26. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/.beads/.gitignore +0 -0
  27. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/.beads/README.md +0 -0
  28. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/.beads/config.yaml +0 -0
  29. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/.beads/interactions.jsonl +0 -0
  30. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/.beads/metadata.json +0 -0
  31. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/.gitattributes +0 -0
  32. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/.github/workflows/CI.yml +0 -0
  33. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/.github/workflows/publish-pypi.yml +0 -0
  34. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/.github/workflows/publish-testpypi.yml +0 -0
  35. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/AGENTS.md +0 -0
  36. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/CLAUDE.md +0 -0
  37. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/LICENSE +0 -0
  38. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/benches/benchmark.rs +0 -0
  39. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/python/tests/test_api.py +0 -0
  40. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/errors.rs +0 -0
  41. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/graph/csr.rs +0 -0
  42. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/graph/mod.rs +0 -0
  43. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/lib.rs +0 -0
  44. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/nlp/mod.rs +0 -0
  45. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/pagerank/mod.rs +0 -0
  46. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/pagerank/personalized.rs +0 -0
  47. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/pagerank/standard.rs +0 -0
  48. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/phrase/dedup.rs +0 -0
  49. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/phrase/mod.rs +0 -0
  50. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/summarizer/mod.rs +0 -0
  51. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/summarizer/selector.rs +0 -0
  52. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/summarizer/unit_vector.rs +0 -0
  53. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/variants/mod.rs +0 -0
  54. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/src/variants/topic_rank.rs +0 -0
  55. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/tests/integration_tests.rs +0 -0
  56. {rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/tests/property_tests.rs +0 -0
{rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/.gitignore
@@ -15,7 +15,10 @@ dist/
 build/
 .eggs/
 *.egg
+
+# Jupyter notebooks (but keep notebooks/ directory for examples)
 *.ipynb
+!notebooks/*.ipynb
 
 # Virtual environments
 .venv/
{rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/Cargo.lock
@@ -579,7 +579,7 @@ dependencies = [
 
 [[package]]
 name = "rapid_textrank"
-version = "0.0.1"
+version = "0.1.1"
 dependencies = [
  "approx",
  "criterion",
{rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rapid_textrank"
-version = "0.0.1"
+version = "0.1.1"
 edition = "2021"
 authors = ["TextRanker Contributors"]
 description = "High-performance TextRank implementation with Python bindings"
{rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rapid_textrank
-Version: 0.0.1
+Version: 0.1.1
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
@@ -41,16 +41,16 @@ Project-URL: Repository, https://github.com/xang1234/rapid-textrank
 
 **High-performance TextRank implementation in Rust with Python bindings.**
 
-Extract keywords and key phrases from text 10-100x faster than pure Python implementations, with support for multiple algorithm variants and 18 languages.
+Extract keywords and key phrases from text up to 10-100x faster than pure Python implementations (depending on document size and tokenization), with support for multiple algorithm variants and 18 languages.
 
 ## Features
 
-- **Fast**: 10-100x faster than pure Python implementations
+- **Fast**: Up to 10-100x faster than pure Python implementations (see benchmarks)
 - **Multiple algorithms**: TextRank, PositionRank, and BiasedTextRank variants
-- **Unicode-aware**: Proper handling of CJK, emoji, and other scripts
+- **Unicode-aware**: Proper handling of CJK and other scripts (emoji are ignored by the built-in tokenizer)
 - **Multi-language**: Stopword support for 18 languages
 - **Dual API**: Native Python classes + JSON interface for batch processing
-- **Zero Python overhead**: Computation happens entirely in Rust (no GIL)
+- **Rust core**: Computation happens in Rust (the Python GIL is currently held during extraction)
 
 ## Quick Start
 
@@ -91,7 +91,7 @@ TextRank is a graph-based ranking algorithm for keyword extraction, inspired by
 
 2. **Run PageRank**: The algorithm iteratively distributes "importance" through the graph. Words connected to many important words become important themselves.
 
-3. **Extract phrases**: Adjacent high-scoring words are combined into key phrases. Scores are aggregated (sum, mean, or max).
+3. **Extract phrases**: High-scoring words are grouped into noun chunks (POS-filtered) to form key phrases. Scores are aggregated (sum, mean, or max).
 
 ```
 Text: "Machine learning enables systems to learn from data"
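
To make step 3 concrete, here is a minimal sketch in plain Python of how the aggregation modes turn per-word PageRank scores into one phrase score. The word scores are hypothetical; real values come from the converged PageRank run in step 2:

```python
# Hypothetical per-word PageRank scores for the phrase "machine learning".
word_scores = {"machine": 0.21, "learning": 0.24}
phrase = ["machine", "learning"]

scores = [word_scores[w] for w in phrase]
aggregated = {
    "sum": sum(scores),                 # favors longer phrases
    "mean": sum(scores) / len(scores),  # length-neutral
    "max": max(scores),                 # tracks the strongest word
}
print(aggregated)  # {'sum': 0.45, 'mean': 0.225, 'max': 0.24}
```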
@@ -217,12 +217,16 @@ config = TextRankConfig(
     damping=0.85,              # PageRank damping factor (0-1)
     max_iterations=100,        # Maximum PageRank iterations
     convergence_threshold=1e-6,# Convergence threshold
-    window_size=4,             # Co-occurrence window size
+    window_size=3,             # Co-occurrence window size
     top_n=10,                  # Number of results
     min_phrase_length=1,       # Minimum words in a phrase
     max_phrase_length=4,       # Maximum words in a phrase
     score_aggregation="sum",   # How to combine word scores: "sum", "mean", "max", "rms"
-    language="en"              # Language for stopwords
+    language="en",             # Language for stopwords
+    include_pos=["NOUN","ADJ","PROPN","VERB"],  # POS tags to include in the graph
+    use_pos_in_nodes=True,     # If True, graph nodes are lemma+POS
+    phrase_grouping="scrubbed_text",  # "lemma" or "scrubbed_text"
+    stopwords=["custom", "terms"]     # Additional stopwords (extends built-in list)
 )
 
 extractor = BaseTextRank(config=config)
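
A short sketch of how the new 0.1.1 config fields might be exercised. The hunk above only shows construction, so `extract` as the entry-point name is an assumption; check the package's Quick Start for the exact call:

```python
from rapid_textrank import BaseTextRank, TextRankConfig

# "lemma" vs. "scrubbed_text" controls how candidate words are grouped into
# phrases; the stopwords list extends (not replaces) the built-in list.
config = TextRankConfig(
    language="en",
    phrase_grouping="lemma",
    include_pos=["NOUN", "PROPN"],  # restrict the graph to nouns
    stopwords=["dataset"],          # domain-specific additions
    top_n=5,
)
extractor = BaseTextRank(config=config)
result = extractor.extract("Large datasets improve model accuracy.")  # assumed method name
print(result.as_tuples())  # [(text, score), ...] per the API section above
```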
@@ -252,7 +256,7 @@ tuples = result.as_tuples() # [(text, score), ...]
 
 ### JSON Interface
 
-For processing large documents or integrating with spaCy, use the JSON interface. This accepts pre-tokenized data to avoid re-tokenizing in Rust.
+For processing large documents or integrating with spaCy, use the JSON interface. This accepts pre-tokenized data to avoid re-tokenizing in Rust. Stopword handling can use each token's `is_stopword` field and/or a `config.language` plus `config.stopwords` (additional words that extend the built-in list). Language codes follow the Supported Languages table below.
 
 ```python
 from rapid_textrank import extract_from_json, extract_batch_from_json
@@ -273,13 +277,13 @@ doc = {
         },
         # ... more tokens
     ],
-    "config": {"top_n": 10}
+    "config": {"top_n": 10, "language": "en", "stopwords": ["nlp", "transformers"]}
 }
 
 result_json = extract_from_json(json.dumps(doc))
 result = json.loads(result_json)
 
-# Batch processing (parallel in Rust)
+# Batch processing (Rust core; per-document processing is sequential)
 docs = [doc1, doc2, doc3]
 results_json = extract_batch_from_json(json.dumps(docs))
 results = json.loads(results_json)
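
A fuller sketch of the pre-tokenized payload. The `is_stopword` field comes from the paragraph above; the other token field names (`text`, `lemma`, `pos`) are assumptions inferred from the POS options and are shown only to illustrate the shape:

```python
import json
from rapid_textrank import extract_from_json

doc = {
    "tokens": [
        # Field names besides is_stopword are illustrative assumptions.
        {"text": "Machine", "lemma": "machine", "pos": "NOUN", "is_stopword": False},
        {"text": "learning", "lemma": "learning", "pos": "NOUN", "is_stopword": False},
        {"text": "is", "lemma": "be", "pos": "AUX", "is_stopword": True},
        # ... one entry per token
    ],
    "config": {"top_n": 5, "language": "en", "stopwords": ["machine"]},
}
result = json.loads(extract_from_json(json.dumps(doc)))
```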
@@ -287,7 +291,7 @@ results = json.loads(results_json)
 
 ## Supported Languages
 
-Stopword filtering is available for 18 languages:
+Stopword filtering is available for 18 languages. Use these codes for the `language` parameter in all APIs (including JSON config):
 
 | Code | Language | Code | Language | Code | Language |
 |------|----------|------|----------|------|----------|
@@ -298,6 +302,13 @@ Stopword filtering is available for 18 languages:
 | `hu` | Hungarian | `tr` | Turkish | `pl` | Polish |
 | `ar` | Arabic | `zh` | Chinese | `ja` | Japanese |
 
+You can inspect the built-in stopword list with:
+
+```python
+import rapid_textrank as rt
+rt.get_stopwords("en")
+```
+
 ## Performance
 
 rapid_textrank achieves significant speedups through Rust's performance characteristics and careful algorithm implementation.
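
A quick sanity-check sketch using the new `get_stopwords` helper, assuming it returns a list of strings for any code in the table:

```python
import rapid_textrank as rt

builtin = rt.get_stopwords("de")          # German built-ins
print(len(builtin), sorted(builtin)[:5])  # list size and a small sample

# Per the TextRankConfig notes above, config-level stopwords extend
# this built-in list rather than replace it.
```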
@@ -474,11 +485,11 @@ The performance advantage comes from several factors:
 
 2. **String Interning**: Repeated words share a single allocation via `StringPool`, reducing memory usage 10-100x for typical documents.
 
-3. **Parallel Processing**: Rayon provides data parallelism for batch processing without explicit thread management.
+3. **Parallel Processing**: Rayon provides data parallelism in internal graph construction without explicit thread management.
 
 4. **Link-Time Optimization (LTO)**: Release builds use full LTO with single codegen unit for maximum inlining.
 
-5. **No GIL**: All computation happens in Rust. Python's Global Interpreter Lock is released during extraction.
+5. **Rust core**: Most computation happens in Rust, minimizing Python-level overhead.
 
 6. **FxHash**: Fast non-cryptographic hashing for internal hash maps.
 
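An informal way to check the hedged speedup claims on your own documents. This is a sketch: `extract` as the method name and the no-argument constructor are assumptions, and numbers vary with hardware and document size:

```python
import time
from rapid_textrank import BaseTextRank

text = open("document.txt").read()  # any reasonably long document
extractor = BaseTextRank()          # default config (assumed optional)

t0 = time.perf_counter()
result = extractor.extract(text)    # assumed entry point
elapsed = time.perf_counter() - t0
print(f"{len(result.as_tuples())} phrases in {elapsed * 1000:.2f} ms")
```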
@@ -498,12 +509,24 @@ Import name is `rapid_textrank`.
 pip install rapid_textrank[spacy]
 ```
 
+```python
+import spacy
+import rapid_textrank.spacy_component  # registers the pipeline factory
+
+nlp = spacy.load("en_core_web_sm")
+nlp.add_pipe("rapid_textrank")
+
+doc = nlp("Machine learning is a subset of artificial intelligence.")
+for phrase in doc._.phrases[:5]:
+    print(f"{phrase.text}: {phrase.score:.4f}")
+```
+
 ### From Source
 
 Requirements: Rust 1.70+, Python 3.9+
 
 ```bash
-git clone https://github.com/textranker/rapid_textrank
+git clone https://github.com/xang1234/rapid-textrank
 cd rapid_textrank
 pip install maturin
 maturin develop --release
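
spaCy factories can take a `config` dict at `add_pipe` time; whether this component accepts settings that way, and under which keys, is an assumption (the keys below are hypothetical, mirroring `TextRankConfig`), so check `spacy_component.py` for the actual factory signature:

```python
import spacy
import rapid_textrank.spacy_component  # registers the "rapid_textrank" factory

nlp = spacy.load("en_core_web_sm")
# Hypothetical config keys mirroring TextRankConfig fields.
nlp.add_pipe("rapid_textrank", config={"top_n": 5, "window_size": 3})

doc = nlp("Graph-based ranking extracts key phrases from raw text.")
print([(p.text, round(p.score, 4)) for p in doc._.phrases])
```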
{rapid_textrank-0.0.1 → rapid_textrank-0.1.1}/README.md
@@ -6,16 +6,16 @@
 
 **High-performance TextRank implementation in Rust with Python bindings.**
 
-Extract keywords and key phrases from text 10-100x faster than pure Python implementations, with support for multiple algorithm variants and 18 languages.
+Extract keywords and key phrases from text up to 10-100x faster than pure Python implementations (depending on document size and tokenization), with support for multiple algorithm variants and 18 languages.
 
 ## Features
 
-- **Fast**: 10-100x faster than pure Python implementations
+- **Fast**: Up to 10-100x faster than pure Python implementations (see benchmarks)
 - **Multiple algorithms**: TextRank, PositionRank, and BiasedTextRank variants
-- **Unicode-aware**: Proper handling of CJK, emoji, and other scripts
+- **Unicode-aware**: Proper handling of CJK and other scripts (emoji are ignored by the built-in tokenizer)
 - **Multi-language**: Stopword support for 18 languages
 - **Dual API**: Native Python classes + JSON interface for batch processing
-- **Zero Python overhead**: Computation happens entirely in Rust (no GIL)
+- **Rust core**: Computation happens in Rust (the Python GIL is currently held during extraction)
 
 ## Quick Start
 
@@ -56,7 +56,7 @@ TextRank is a graph-based ranking algorithm for keyword extraction, inspired by
 
 2. **Run PageRank**: The algorithm iteratively distributes "importance" through the graph. Words connected to many important words become important themselves.
 
-3. **Extract phrases**: Adjacent high-scoring words are combined into key phrases. Scores are aggregated (sum, mean, or max).
+3. **Extract phrases**: High-scoring words are grouped into noun chunks (POS-filtered) to form key phrases. Scores are aggregated (sum, mean, or max).
 
 ```
 Text: "Machine learning enables systems to learn from data"
@@ -182,12 +182,16 @@ config = TextRankConfig(
     damping=0.85,              # PageRank damping factor (0-1)
     max_iterations=100,        # Maximum PageRank iterations
     convergence_threshold=1e-6,# Convergence threshold
-    window_size=4,             # Co-occurrence window size
+    window_size=3,             # Co-occurrence window size
     top_n=10,                  # Number of results
     min_phrase_length=1,       # Minimum words in a phrase
     max_phrase_length=4,       # Maximum words in a phrase
     score_aggregation="sum",   # How to combine word scores: "sum", "mean", "max", "rms"
-    language="en"              # Language for stopwords
+    language="en",             # Language for stopwords
+    include_pos=["NOUN","ADJ","PROPN","VERB"],  # POS tags to include in the graph
+    use_pos_in_nodes=True,     # If True, graph nodes are lemma+POS
+    phrase_grouping="scrubbed_text",  # "lemma" or "scrubbed_text"
+    stopwords=["custom", "terms"]     # Additional stopwords (extends built-in list)
 )
 
 extractor = BaseTextRank(config=config)
@@ -217,7 +221,7 @@ tuples = result.as_tuples() # [(text, score), ...]
 
 ### JSON Interface
 
-For processing large documents or integrating with spaCy, use the JSON interface. This accepts pre-tokenized data to avoid re-tokenizing in Rust.
+For processing large documents or integrating with spaCy, use the JSON interface. This accepts pre-tokenized data to avoid re-tokenizing in Rust. Stopword handling can use each token's `is_stopword` field and/or a `config.language` plus `config.stopwords` (additional words that extend the built-in list). Language codes follow the Supported Languages table below.
 
 ```python
 from rapid_textrank import extract_from_json, extract_batch_from_json
@@ -238,13 +242,13 @@ doc = {
         },
         # ... more tokens
     ],
-    "config": {"top_n": 10}
+    "config": {"top_n": 10, "language": "en", "stopwords": ["nlp", "transformers"]}
 }
 
 result_json = extract_from_json(json.dumps(doc))
 result = json.loads(result_json)
 
-# Batch processing (parallel in Rust)
+# Batch processing (Rust core; per-document processing is sequential)
 docs = [doc1, doc2, doc3]
 results_json = extract_batch_from_json(json.dumps(docs))
 results = json.loads(results_json)
@@ -252,7 +256,7 @@ results = json.loads(results_json)
 
 ## Supported Languages
 
-Stopword filtering is available for 18 languages:
+Stopword filtering is available for 18 languages. Use these codes for the `language` parameter in all APIs (including JSON config):
 
 | Code | Language | Code | Language | Code | Language |
 |------|----------|------|----------|------|----------|
@@ -263,6 +267,13 @@ Stopword filtering is available for 18 languages:
 | `hu` | Hungarian | `tr` | Turkish | `pl` | Polish |
 | `ar` | Arabic | `zh` | Chinese | `ja` | Japanese |
 
+You can inspect the built-in stopword list with:
+
+```python
+import rapid_textrank as rt
+rt.get_stopwords("en")
+```
+
 ## Performance
 
 rapid_textrank achieves significant speedups through Rust's performance characteristics and careful algorithm implementation.
@@ -439,11 +450,11 @@ The performance advantage comes from several factors:
 
 2. **String Interning**: Repeated words share a single allocation via `StringPool`, reducing memory usage 10-100x for typical documents.
 
-3. **Parallel Processing**: Rayon provides data parallelism for batch processing without explicit thread management.
+3. **Parallel Processing**: Rayon provides data parallelism in internal graph construction without explicit thread management.
 
 4. **Link-Time Optimization (LTO)**: Release builds use full LTO with single codegen unit for maximum inlining.
 
-5. **No GIL**: All computation happens in Rust. Python's Global Interpreter Lock is released during extraction.
+5. **Rust core**: Most computation happens in Rust, minimizing Python-level overhead.
 
 6. **FxHash**: Fast non-cryptographic hashing for internal hash maps.
 
@@ -463,12 +474,24 @@ Import name is `rapid_textrank`.
 pip install rapid_textrank[spacy]
 ```
 
+```python
+import spacy
+import rapid_textrank.spacy_component  # registers the pipeline factory
+
+nlp = spacy.load("en_core_web_sm")
+nlp.add_pipe("rapid_textrank")
+
+doc = nlp("Machine learning is a subset of artificial intelligence.")
+for phrase in doc._.phrases[:5]:
+    print(f"{phrase.text}: {phrase.score:.4f}")
+```
+
 ### From Source
 
 Requirements: Rust 1.70+, Python 3.9+
 
 ```bash
-git clone https://github.com/textranker/rapid_textrank
+git clone https://github.com/xang1234/rapid-textrank
 cd rapid_textrank
 pip install maturin
 maturin develop --release