rapid-textrank 0.0.1__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. rapid_textrank-0.1.0/.beads/issues.jsonl +0 -0
  2. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/.gitignore +3 -0
  3. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/Cargo.lock +1 -1
  4. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/Cargo.toml +1 -1
  5. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/PKG-INFO +32 -13
  6. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/README.md +31 -12
  7. rapid_textrank-0.1.0/notebooks/01_quickstart.ipynb +487 -0
  8. rapid_textrank-0.1.0/notebooks/02_algorithm_variants.ipynb +710 -0
  9. rapid_textrank-0.1.0/notebooks/03_explain_algorithm.ipynb +725 -0
  10. rapid_textrank-0.1.0/notebooks/04_benchmarks.ipynb +981 -0
  11. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/pyproject.toml +1 -1
  12. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/python/rapid_textrank/__init__.py +2 -0
  13. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/nlp/stopwords.rs +35 -0
  14. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/nlp/tokenizer.rs +32 -0
  15. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/phrase/chunker.rs +11 -3
  16. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/python/json.rs +38 -3
  17. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/python/mod.rs +1 -0
  18. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/python/native.rs +26 -4
  19. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/types.rs +4 -0
  20. rapid_textrank-0.0.1/.beads/issues.jsonl +0 -15
  21. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/.beads/.gitignore +0 -0
  22. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/.beads/README.md +0 -0
  23. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/.beads/config.yaml +0 -0
  24. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/.beads/interactions.jsonl +0 -0
  25. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/.beads/metadata.json +0 -0
  26. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/.gitattributes +0 -0
  27. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/.github/workflows/CI.yml +0 -0
  28. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/.github/workflows/publish-pypi.yml +0 -0
  29. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/.github/workflows/publish-testpypi.yml +0 -0
  30. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/AGENTS.md +0 -0
  31. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/CLAUDE.md +0 -0
  32. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/LICENSE +0 -0
  33. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/benches/benchmark.rs +0 -0
  34. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/python/rapid_textrank/spacy_component.py +0 -0
  35. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/python/tests/test_api.py +0 -0
  36. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/errors.rs +0 -0
  37. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/graph/builder.rs +0 -0
  38. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/graph/csr.rs +0 -0
  39. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/graph/mod.rs +0 -0
  40. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/lib.rs +0 -0
  41. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/nlp/mod.rs +0 -0
  42. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/pagerank/mod.rs +0 -0
  43. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/pagerank/personalized.rs +0 -0
  44. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/pagerank/standard.rs +0 -0
  45. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/phrase/dedup.rs +0 -0
  46. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/phrase/extraction.rs +0 -0
  47. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/phrase/mod.rs +0 -0
  48. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/summarizer/mod.rs +0 -0
  49. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/summarizer/selector.rs +0 -0
  50. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/summarizer/unit_vector.rs +0 -0
  51. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/variants/biased_textrank.rs +0 -0
  52. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/variants/mod.rs +0 -0
  53. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/variants/position_rank.rs +0 -0
  54. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/src/variants/topic_rank.rs +0 -0
  55. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/tests/integration_tests.rs +0 -0
  56. {rapid_textrank-0.0.1 → rapid_textrank-0.1.0}/tests/property_tests.rs +0 -0
File without changes
@@ -15,7 +15,10 @@ dist/
15
15
  build/
16
16
  .eggs/
17
17
  *.egg
18
+
19
+ # Jupyter notebooks (but keep notebooks/ directory for examples)
18
20
  *.ipynb
21
+ !notebooks/*.ipynb
19
22
 
20
23
  # Virtual environments
21
24
  .venv/
@@ -579,7 +579,7 @@ dependencies = [
579
579
 
580
580
  [[package]]
581
581
  name = "rapid_textrank"
582
- version = "0.0.1"
582
+ version = "0.1.0"
583
583
  dependencies = [
584
584
  "approx",
585
585
  "criterion",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "rapid_textrank"
3
- version = "0.0.1"
3
+ version = "0.1.0"
4
4
  edition = "2021"
5
5
  authors = ["TextRanker Contributors"]
6
6
  description = "High-performance TextRank implementation with Python bindings"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rapid_textrank
3
- Version: 0.0.1
3
+ Version: 0.1.0
4
4
  Classifier: Development Status :: 4 - Beta
5
5
  Classifier: Intended Audience :: Developers
6
6
  Classifier: Intended Audience :: Science/Research
@@ -41,16 +41,16 @@ Project-URL: Repository, https://github.com/xang1234/rapid-textrank
41
41
 
42
42
  **High-performance TextRank implementation in Rust with Python bindings.**
43
43
 
44
- Extract keywords and key phrases from text 10-100x faster than pure Python implementations, with support for multiple algorithm variants and 18 languages.
44
+ Extract keywords and key phrases from text up to 10-100x faster than pure Python implementations (depending on document size and tokenization), with support for multiple algorithm variants and 18 languages.
45
45
 
46
46
  ## Features
47
47
 
48
- - **Fast**: 10-100x faster than pure Python implementations
48
+ - **Fast**: Up to 10-100x faster than pure Python implementations (see benchmarks)
49
49
  - **Multiple algorithms**: TextRank, PositionRank, and BiasedTextRank variants
50
- - **Unicode-aware**: Proper handling of CJK, emoji, and other scripts
50
+ - **Unicode-aware**: Proper handling of CJK and other scripts (emoji are ignored by the built-in tokenizer)
51
51
  - **Multi-language**: Stopword support for 18 languages
52
52
  - **Dual API**: Native Python classes + JSON interface for batch processing
53
- - **Zero Python overhead**: Computation happens entirely in Rust (no GIL)
53
+ - **Rust core**: Computation happens in Rust (the Python GIL is currently held during extraction)
54
54
 
55
55
  ## Quick Start
56
56
 
@@ -91,7 +91,7 @@ TextRank is a graph-based ranking algorithm for keyword extraction, inspired by
91
91
 
92
92
  2. **Run PageRank**: The algorithm iteratively distributes "importance" through the graph. Words connected to many important words become important themselves.
93
93
 
94
- 3. **Extract phrases**: Adjacent high-scoring words are combined into key phrases. Scores are aggregated (sum, mean, or max).
94
+ 3. **Extract phrases**: High-scoring words are grouped into noun chunks (POS-filtered) to form key phrases. Scores are aggregated (sum, mean, or max).
95
95
 
96
96
  ```
97
97
  Text: "Machine learning enables systems to learn from data"
@@ -252,7 +252,7 @@ tuples = result.as_tuples() # [(text, score), ...]
252
252
 
253
253
  ### JSON Interface
254
254
 
255
- For processing large documents or integrating with spaCy, use the JSON interface. This accepts pre-tokenized data to avoid re-tokenizing in Rust.
255
+ For processing large documents or integrating with spaCy, use the JSON interface. This accepts pre-tokenized data to avoid re-tokenizing in Rust. Stopword handling can use each token's `is_stopword` field and/or a `config.language` plus `config.stopwords` (additional words that extend the built-in list). Language codes follow the Supported Languages table below.
256
256
 
257
257
  ```python
258
258
  from rapid_textrank import extract_from_json, extract_batch_from_json
@@ -273,13 +273,13 @@ doc = {
273
273
  },
274
274
  # ... more tokens
275
275
  ],
276
- "config": {"top_n": 10}
276
+ "config": {"top_n": 10, "language": "en", "stopwords": ["nlp", "transformers"]}
277
277
  }
278
278
 
279
279
  result_json = extract_from_json(json.dumps(doc))
280
280
  result = json.loads(result_json)
281
281
 
282
- # Batch processing (parallel in Rust)
282
+ # Batch processing (Rust core; per-document processing is sequential)
283
283
  docs = [doc1, doc2, doc3]
284
284
  results_json = extract_batch_from_json(json.dumps(docs))
285
285
  results = json.loads(results_json)
@@ -287,7 +287,7 @@ results = json.loads(results_json)
287
287
 
288
288
  ## Supported Languages
289
289
 
290
- Stopword filtering is available for 18 languages:
290
+ Stopword filtering is available for 18 languages. Use these codes for the `language` parameter in all APIs (including JSON config):
291
291
 
292
292
  | Code | Language | Code | Language | Code | Language |
293
293
  |------|----------|------|----------|------|----------|
@@ -298,6 +298,13 @@ Stopword filtering is available for 18 languages:
298
298
  | `hu` | Hungarian | `tr` | Turkish | `pl` | Polish |
299
299
  | `ar` | Arabic | `zh` | Chinese | `ja` | Japanese |
300
300
 
301
+ You can inspect the built-in stopword list with:
302
+
303
+ ```python
304
+ import rapid_textrank as rt
305
+ rt.get_stopwords("en")
306
+ ```
307
+
301
308
  ## Performance
302
309
 
303
310
  rapid_textrank achieves significant speedups through Rust's performance characteristics and careful algorithm implementation.
@@ -474,11 +481,11 @@ The performance advantage comes from several factors:
474
481
 
475
482
  2. **String Interning**: Repeated words share a single allocation via `StringPool`, reducing memory usage 10-100x for typical documents.
476
483
 
477
- 3. **Parallel Processing**: Rayon provides data parallelism for batch processing without explicit thread management.
484
+ 3. **Parallel Processing**: Rayon provides data parallelism in internal graph construction without explicit thread management.
478
485
 
479
486
  4. **Link-Time Optimization (LTO)**: Release builds use full LTO with single codegen unit for maximum inlining.
480
487
 
481
- 5. **No GIL**: All computation happens in Rust. Python's Global Interpreter Lock is released during extraction.
488
+ 5. **Rust core**: Most computation happens in Rust, minimizing Python-level overhead.
482
489
 
483
490
  6. **FxHash**: Fast non-cryptographic hashing for internal hash maps.
484
491
 
@@ -498,12 +505,24 @@ Import name is `rapid_textrank`.
498
505
  pip install rapid_textrank[spacy]
499
506
  ```
500
507
 
508
+ ```python
509
+ import spacy
510
+ import rapid_textrank.spacy_component # registers the pipeline factory
511
+
512
+ nlp = spacy.load("en_core_web_sm")
513
+ nlp.add_pipe("rapid_textrank")
514
+
515
+ doc = nlp("Machine learning is a subset of artificial intelligence.")
516
+ for phrase in doc._.phrases[:5]:
517
+ print(f"{phrase.text}: {phrase.score:.4f}")
518
+ ```
519
+
501
520
  ### From Source
502
521
 
503
522
  Requirements: Rust 1.70+, Python 3.9+
504
523
 
505
524
  ```bash
506
- git clone https://github.com/textranker/rapid_textrank
525
+ git clone https://github.com/xang1234/rapid-textrank
507
526
  cd rapid_textrank
508
527
  pip install maturin
509
528
  maturin develop --release
@@ -6,16 +6,16 @@
6
6
 
7
7
  **High-performance TextRank implementation in Rust with Python bindings.**
8
8
 
9
- Extract keywords and key phrases from text 10-100x faster than pure Python implementations, with support for multiple algorithm variants and 18 languages.
9
+ Extract keywords and key phrases from text up to 10-100x faster than pure Python implementations (depending on document size and tokenization), with support for multiple algorithm variants and 18 languages.
10
10
 
11
11
  ## Features
12
12
 
13
- - **Fast**: 10-100x faster than pure Python implementations
13
+ - **Fast**: Up to 10-100x faster than pure Python implementations (see benchmarks)
14
14
  - **Multiple algorithms**: TextRank, PositionRank, and BiasedTextRank variants
15
- - **Unicode-aware**: Proper handling of CJK, emoji, and other scripts
15
+ - **Unicode-aware**: Proper handling of CJK and other scripts (emoji are ignored by the built-in tokenizer)
16
16
  - **Multi-language**: Stopword support for 18 languages
17
17
  - **Dual API**: Native Python classes + JSON interface for batch processing
18
- - **Zero Python overhead**: Computation happens entirely in Rust (no GIL)
18
+ - **Rust core**: Computation happens in Rust (the Python GIL is currently held during extraction)
19
19
 
20
20
  ## Quick Start
21
21
 
@@ -56,7 +56,7 @@ TextRank is a graph-based ranking algorithm for keyword extraction, inspired by
56
56
 
57
57
  2. **Run PageRank**: The algorithm iteratively distributes "importance" through the graph. Words connected to many important words become important themselves.
58
58
 
59
- 3. **Extract phrases**: Adjacent high-scoring words are combined into key phrases. Scores are aggregated (sum, mean, or max).
59
+ 3. **Extract phrases**: High-scoring words are grouped into noun chunks (POS-filtered) to form key phrases. Scores are aggregated (sum, mean, or max).
60
60
 
61
61
  ```
62
62
  Text: "Machine learning enables systems to learn from data"
@@ -217,7 +217,7 @@ tuples = result.as_tuples() # [(text, score), ...]
217
217
 
218
218
  ### JSON Interface
219
219
 
220
- For processing large documents or integrating with spaCy, use the JSON interface. This accepts pre-tokenized data to avoid re-tokenizing in Rust.
220
+ For processing large documents or integrating with spaCy, use the JSON interface. This accepts pre-tokenized data to avoid re-tokenizing in Rust. Stopword handling can use each token's `is_stopword` field and/or a `config.language` plus `config.stopwords` (additional words that extend the built-in list). Language codes follow the Supported Languages table below.
221
221
 
222
222
  ```python
223
223
  from rapid_textrank import extract_from_json, extract_batch_from_json
@@ -238,13 +238,13 @@ doc = {
238
238
  },
239
239
  # ... more tokens
240
240
  ],
241
- "config": {"top_n": 10}
241
+ "config": {"top_n": 10, "language": "en", "stopwords": ["nlp", "transformers"]}
242
242
  }
243
243
 
244
244
  result_json = extract_from_json(json.dumps(doc))
245
245
  result = json.loads(result_json)
246
246
 
247
- # Batch processing (parallel in Rust)
247
+ # Batch processing (Rust core; per-document processing is sequential)
248
248
  docs = [doc1, doc2, doc3]
249
249
  results_json = extract_batch_from_json(json.dumps(docs))
250
250
  results = json.loads(results_json)
@@ -252,7 +252,7 @@ results = json.loads(results_json)
252
252
 
253
253
  ## Supported Languages
254
254
 
255
- Stopword filtering is available for 18 languages:
255
+ Stopword filtering is available for 18 languages. Use these codes for the `language` parameter in all APIs (including JSON config):
256
256
 
257
257
  | Code | Language | Code | Language | Code | Language |
258
258
  |------|----------|------|----------|------|----------|
@@ -263,6 +263,13 @@ Stopword filtering is available for 18 languages:
263
263
  | `hu` | Hungarian | `tr` | Turkish | `pl` | Polish |
264
264
  | `ar` | Arabic | `zh` | Chinese | `ja` | Japanese |
265
265
 
266
+ You can inspect the built-in stopword list with:
267
+
268
+ ```python
269
+ import rapid_textrank as rt
270
+ rt.get_stopwords("en")
271
+ ```
272
+
266
273
  ## Performance
267
274
 
268
275
  rapid_textrank achieves significant speedups through Rust's performance characteristics and careful algorithm implementation.
@@ -439,11 +446,11 @@ The performance advantage comes from several factors:
439
446
 
440
447
  2. **String Interning**: Repeated words share a single allocation via `StringPool`, reducing memory usage 10-100x for typical documents.
441
448
 
442
- 3. **Parallel Processing**: Rayon provides data parallelism for batch processing without explicit thread management.
449
+ 3. **Parallel Processing**: Rayon provides data parallelism in internal graph construction without explicit thread management.
443
450
 
444
451
  4. **Link-Time Optimization (LTO)**: Release builds use full LTO with single codegen unit for maximum inlining.
445
452
 
446
- 5. **No GIL**: All computation happens in Rust. Python's Global Interpreter Lock is released during extraction.
453
+ 5. **Rust core**: Most computation happens in Rust, minimizing Python-level overhead.
447
454
 
448
455
  6. **FxHash**: Fast non-cryptographic hashing for internal hash maps.
449
456
 
@@ -463,12 +470,24 @@ Import name is `rapid_textrank`.
463
470
  pip install rapid_textrank[spacy]
464
471
  ```
465
472
 
473
+ ```python
474
+ import spacy
475
+ import rapid_textrank.spacy_component # registers the pipeline factory
476
+
477
+ nlp = spacy.load("en_core_web_sm")
478
+ nlp.add_pipe("rapid_textrank")
479
+
480
+ doc = nlp("Machine learning is a subset of artificial intelligence.")
481
+ for phrase in doc._.phrases[:5]:
482
+ print(f"{phrase.text}: {phrase.score:.4f}")
483
+ ```
484
+
466
485
  ### From Source
467
486
 
468
487
  Requirements: Rust 1.70+, Python 3.9+
469
488
 
470
489
  ```bash
471
- git clone https://github.com/textranker/rapid_textrank
490
+ git clone https://github.com/xang1234/rapid-textrank
472
491
  cd rapid_textrank
473
492
  pip install maturin
474
493
  maturin develop --release