rapid-textrank 0.1.0.tar.gz → 0.1.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/Cargo.lock +1 -1
  2. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/Cargo.toml +1 -1
  3. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/PKG-INFO +7 -3
  4. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/README.md +6 -2
  5. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/notebooks/01_quickstart.ipynb +15 -20
  6. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/notebooks/02_algorithm_variants.ipynb +23 -23
  7. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/notebooks/03_explain_algorithm.ipynb +9 -9
  8. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/notebooks/04_benchmarks.ipynb +22 -22
  9. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/pyproject.toml +1 -1
  10. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/python/rapid_textrank/spacy_component.py +29 -2
  11. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/graph/builder.rs +32 -15
  12. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/nlp/tokenizer.rs +6 -10
  13. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/phrase/extraction.rs +75 -34
  14. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/python/json.rs +33 -12
  15. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/python/native.rs +10 -3
  16. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/types.rs +66 -2
  17. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/variants/biased_textrank.rs +32 -11
  18. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/variants/position_rank.rs +2 -1
  19. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.beads/.gitignore +0 -0
  20. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.beads/README.md +0 -0
  21. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.beads/config.yaml +0 -0
  22. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.beads/interactions.jsonl +0 -0
  23. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.beads/issues.jsonl +0 -0
  24. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.beads/metadata.json +0 -0
  25. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.gitattributes +0 -0
  26. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.github/workflows/CI.yml +0 -0
  27. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.github/workflows/publish-pypi.yml +0 -0
  28. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.github/workflows/publish-testpypi.yml +0 -0
  29. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.gitignore +0 -0
  30. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/AGENTS.md +0 -0
  31. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/CLAUDE.md +0 -0
  32. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/LICENSE +0 -0
  33. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/benches/benchmark.rs +0 -0
  34. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/python/rapid_textrank/__init__.py +0 -0
  35. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/python/tests/test_api.py +0 -0
  36. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/errors.rs +0 -0
  37. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/graph/csr.rs +0 -0
  38. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/graph/mod.rs +0 -0
  39. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/lib.rs +0 -0
  40. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/nlp/mod.rs +0 -0
  41. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/nlp/stopwords.rs +0 -0
  42. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/pagerank/mod.rs +0 -0
  43. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/pagerank/personalized.rs +0 -0
  44. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/pagerank/standard.rs +0 -0
  45. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/phrase/chunker.rs +0 -0
  46. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/phrase/dedup.rs +0 -0
  47. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/phrase/mod.rs +0 -0
  48. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/python/mod.rs +0 -0
  49. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/summarizer/mod.rs +0 -0
  50. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/summarizer/selector.rs +0 -0
  51. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/summarizer/unit_vector.rs +0 -0
  52. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/variants/mod.rs +0 -0
  53. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/variants/topic_rank.rs +0 -0
  54. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/tests/integration_tests.rs +0 -0
  55. {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/tests/property_tests.rs +0 -0
{rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/Cargo.lock
@@ -579,7 +579,7 @@ dependencies = [
 
 [[package]]
 name = "rapid_textrank"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "approx",
  "criterion",
{rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rapid_textrank"
-version = "0.1.0"
+version = "0.1.1"
 edition = "2021"
 authors = ["TextRanker Contributors"]
 description = "High-performance TextRank implementation with Python bindings"
{rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rapid_textrank
-Version: 0.1.0
+Version: 0.1.1
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
@@ -217,12 +217,16 @@ config = TextRankConfig(
     damping=0.85,              # PageRank damping factor (0-1)
     max_iterations=100,        # Maximum PageRank iterations
     convergence_threshold=1e-6,# Convergence threshold
-    window_size=4,             # Co-occurrence window size
+    window_size=3,             # Co-occurrence window size
     top_n=10,                  # Number of results
     min_phrase_length=1,       # Minimum words in a phrase
     max_phrase_length=4,       # Maximum words in a phrase
     score_aggregation="sum",   # How to combine word scores: "sum", "mean", "max", "rms"
-    language="en"              # Language for stopwords
+    language="en",             # Language for stopwords
+    include_pos=["NOUN","ADJ","PROPN","VERB"],  # POS tags to include in the graph
+    use_pos_in_nodes=True,            # If True, graph nodes are lemma+POS
+    phrase_grouping="scrubbed_text",  # "lemma" or "scrubbed_text"
+    stopwords=["custom", "terms"]     # Additional stopwords (extends built-in list)
 )
 
 extractor = BaseTextRank(config=config)
{rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/README.md
@@ -182,12 +182,16 @@ config = TextRankConfig(
     damping=0.85,              # PageRank damping factor (0-1)
     max_iterations=100,        # Maximum PageRank iterations
     convergence_threshold=1e-6,# Convergence threshold
-    window_size=4,             # Co-occurrence window size
+    window_size=3,             # Co-occurrence window size
     top_n=10,                  # Number of results
     min_phrase_length=1,       # Minimum words in a phrase
     max_phrase_length=4,       # Maximum words in a phrase
     score_aggregation="sum",   # How to combine word scores: "sum", "mean", "max", "rms"
-    language="en"              # Language for stopwords
+    language="en",             # Language for stopwords
+    include_pos=["NOUN","ADJ","PROPN","VERB"],  # POS tags to include in the graph
+    use_pos_in_nodes=True,            # If True, graph nodes are lemma+POS
+    phrase_grouping="scrubbed_text",  # "lemma" or "scrubbed_text"
+    stopwords=["custom", "terms"]     # Additional stopwords (extends built-in list)
 )
 
 extractor = BaseTextRank(config=config)
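
The README and PKG-INFO hunks above document the 0.1.1 configuration surface: window_size now defaults to 3, and four fields are new (include_pos, use_pos_in_nodes, phrase_grouping, stopwords). A minimal sketch of the new fields in use; the extraction call is written here as `extract`, which is an assumption, since the method name itself is not visible in this diff:

    from rapid_textrank import TextRankConfig, BaseTextRank

    config = TextRankConfig(
        window_size=3,                    # new 0.1.1 default (was 4)
        include_pos=["NOUN", "ADJ", "PROPN", "VERB"],
        use_pos_in_nodes=True,            # graph nodes become lemma+POS
        phrase_grouping="scrubbed_text",  # or "lemma"
        stopwords=["custom", "terms"],    # extends the built-in list
    )
    extractor = BaseTextRank(config=config)
    # `extract` is an assumed method name; the call site is not shown in this diff.
    result = extractor.extract("Machine learning is a subfield of artificial intelligence.")
    for p in result.phrases[:5]:
        print(f"{p.rank}. {p.text}: {p.score:.4f}")
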
{rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/notebooks/01_quickstart.ipynb
@@ -193,17 +193,11 @@
    "source": [
     "from rapid_textrank import TextRankConfig, BaseTextRank\n",
     "\n",
-    "# Create a custom configuration\n",
+    "# Create a custom configuration (only overriding a few defaults)\n",
     "config = TextRankConfig(\n",
-    "    damping=0.85,              # PageRank damping factor (0-1)\n",
-    "    max_iterations=100,        # Maximum PageRank iterations\n",
-    "    convergence_threshold=1e-6,# Stop when scores change less than this\n",
-    "    window_size=4,             # Co-occurrence window size\n",
-    "    top_n=10,                  # Number of results\n",
-    "    min_phrase_length=1,       # Minimum words in a phrase\n",
-    "    max_phrase_length=4,       # Maximum words in a phrase\n",
-    "    score_aggregation=\"sum\",   # How to combine word scores: \"sum\", \"mean\", \"max\", \"rms\"\n",
-    "    language=\"en\"              # Language for stopwords\n",
+    "    top_n=10,\n",
+    "    score_aggregation=\"sum\",\n",
+    "    language=\"en\",\n",
     ")\n",
     "\n",
     "# Create an extractor with the config\n",
@@ -214,9 +208,10 @@
     "\n",
     "print(f\"Converged: {result.converged}\")\n",
     "print(f\"Iterations: {result.iterations}\")\n",
-    "print(f\"\\nTop phrases:\")\n",
+    "print(f\"\n",
+    "Top phrases:\")\n",
     "for p in result.phrases[:5]:\n",
-    "    print(f\"  {p.rank}. {p.text}: {p.score:.4f}\")"
+    "    print(f\"  {p.rank}. {p.text}: {p.score:.4f}\")\n"
    ]
   },
   {
@@ -317,7 +312,7 @@
     "output_type": "stream",
     "text": [
      "German keywords:\n",
-     "  1. Teilgebiet der künstlichen Intelligenz: 0.1860\n",
+     "  1. Teilgebiet der k\u00fcnstlichen Intelligenz: 0.1860\n",
      "  2. aus Erfahrung zu lernen: 0.1768\n",
      "  3. Netze mit vielen Schichten: 0.1184\n"
    ]
@@ -326,9 +321,9 @@
    "source": [
     "# German example\n",
     "german_text = \"\"\"\n",
-    "Maschinelles Lernen ist ein Teilgebiet der künstlichen Intelligenz.\n",
+    "Maschinelles Lernen ist ein Teilgebiet der k\u00fcnstlichen Intelligenz.\n",
     "Deep Learning verwendet neuronale Netze mit vielen Schichten.\n",
-    "Diese Technologie ermöglicht es Computern, aus Erfahrung zu lernen.\n",
+    "Diese Technologie erm\u00f6glicht es Computern, aus Erfahrung zu lernen.\n",
     "\"\"\"\n",
     "\n",
     "keywords_de = extract_keywords(german_text, top_n=5, language=\"de\")\n",
@@ -350,7 +345,7 @@
     "text": [
      "French keywords:\n",
      "  1. branche de l'intelligence artificielle: 0.1906\n",
-     "  2. l'analyse de données complexes: 0.1764\n",
+     "  2. l'analyse de donn\u00e9es complexes: 0.1764\n",
      "  3. de nombreux secteurs industriels: 0.1250\n"
    ]
   }
@@ -359,7 +354,7 @@
     "# French example\n",
     "french_text = \"\"\"\n",
     "L'apprentissage automatique est une branche de l'intelligence artificielle.\n",
-    "Les réseaux de neurones profonds permettent l'analyse de données complexes.\n",
+    "Les r\u00e9seaux de neurones profonds permettent l'analyse de donn\u00e9es complexes.\n",
     "Ces technologies transforment de nombreux secteurs industriels.\n",
     "\"\"\"\n",
     "\n",
@@ -395,10 +390,10 @@
     "Note: you may need to restart the kernel to use updated packages.\n",
     "Collecting en-core-web-sm==3.8.0\n",
     "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)\n",
-    "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m37.8 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0meta \u001b[36m0:00:01\u001b[0m\n",
+    "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m37.8 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0meta \u001b[36m0:00:01\u001b[0m\n",
     "\u001b[?25hInstalling collected packages: en-core-web-sm\n",
     "Successfully installed en-core-web-sm-3.8.0\n",
-    "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
+    "\u001b[38;5;2m\u2714 Download and installation successful\u001b[0m\n",
     "You can now load the package via spacy.load('en_core_web_sm')\n"
    ]
   }
@@ -484,4 +479,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
{rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/notebooks/02_algorithm_variants.ipynb
@@ -118,7 +118,7 @@
     "\n",
     "Based on [Florescu & Caragea (2017)](https://aclanthology.org/P17-1102/), PositionRank weights words by their position in the document.\n",
     "\n",
-    "**Key insight:** In many documents (papers, news articles, reports), important terms appear early—in titles, abstracts, or introductory paragraphs.\n",
+    "**Key insight:** In many documents (papers, news articles, reports), important terms appear early\u2014in titles, abstracts, or introductory paragraphs.\n",
     "\n",
     "**How it differs from BaseTextRank:**\n",
     "- Words appearing early get higher initial importance\n",
@@ -555,7 +555,7 @@
     "    ],\n",
     "    \"config\": {\n",
     "        \"top_n\": 5,\n",
-    "        \"window_size\": 4,\n",
+    "        \"window_size\": 3,\n",
     "        \"damping\": 0.85\n",
     "    }\n",
     "}\n",
@@ -565,7 +565,7 @@
     "\n",
     "print(\"Single document result:\")\n",
     "for phrase in result[\"phrases\"]:\n",
-    "    print(f\"  {phrase['text']}: {phrase['score']:.4f}\")"
+    "    print(f\"  {phrase['text']}: {phrase['score']:.4f}\")\n"
    ]
   },
   {
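
The two cells above show the shape of a single-document JSON request (a token list plus a "config" object) and of the result (a dict with a "phrases" list). A sketch of driving it from Python; both the entry-point name and the token structure are placeholders, since neither is visible in this diff:

    import json

    request = {
        # Token structure is a placeholder; only the "config" keys and the
        # result's "phrases"/"text"/"score" fields appear in the diff above.
        "tokens": [],
        "config": {"top_n": 5, "window_size": 3, "damping": 0.85},
    }
    payload = json.dumps(request)
    # `extract_from_json` is a hypothetical name for the JSON entry point
    # implemented in src/python/json.rs:
    # result = json.loads(extract_from_json(payload))
    # for phrase in result["phrases"]:
    #     print(f"  {phrase['text']}: {phrase['score']:.4f}")
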
@@ -639,27 +639,27 @@
     "\n",
     "```\n",
     "                    START\n",
-    "                      │\n",
-    "                      ▼\n",
-    "         ┌─────────────────────────┐\n",
-    "         │ Do you have specific    │\n",
-    "         │ topics to focus on?     │\n",
-    "         └─────────────────────────┘\n",
-    "              │           │\n",
+    "                      \u2502\n",
+    "                      \u25bc\n",
+    "         \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n",
+    "         \u2502 Do you have specific    \u2502\n",
+    "         \u2502 topics to focus on?     \u2502\n",
+    "         \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n",
+    "              \u2502           \u2502\n",
     "             YES          NO\n",
-    "              │           │\n",
-    "              ▼           ▼\n",
-    "      ┌──────────────┐  ┌─────────────────────────┐\n",
-    "      │ BiasedTextRank│  │ Is key info at the      │\n",
-    "      │               │  │ beginning of the doc?   │\n",
-    "      └──────────────┘  └─────────────────────────┘\n",
-    "                             │          │\n",
+    "              \u2502           \u2502\n",
+    "              \u25bc           \u25bc\n",
+    "      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n",
+    "      \u2502 BiasedTextRank\u2502  \u2502 Is key info at the      \u2502\n",
+    "      \u2502               \u2502  \u2502 beginning of the doc?   \u2502\n",
+    "      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518  \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n",
+    "                             \u2502          \u2502\n",
     "                            YES         NO\n",
-    "                             │          │\n",
-    "                             ▼          ▼\n",
-    "                     ┌──────────────┐ ┌──────────────┐\n",
-    "                     │ PositionRank │ │ BaseTextRank │\n",
-    "                     └──────────────┘ └──────────────┘\n",
+    "                             \u2502          \u2502\n",
+    "                             \u25bc          \u25bc\n",
+    "                     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510 \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n",
+    "                     \u2502 PositionRank \u2502 \u2502 BaseTextRank \u2502\n",
+    "                     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518 \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n",
     "```\n",
     "\n",
     "### Recommendations by Document Type\n",
@@ -707,4 +707,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
{rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/notebooks/03_explain_algorithm.ipynb
@@ -179,7 +179,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def build_cooccurrence_graph(tokens, window_size=4):\n",
+    "def build_cooccurrence_graph(tokens, window_size=3):\n",
     "    \"\"\"Build a co-occurrence graph from tokens.\"\"\"\n",
     "    G = nx.Graph()\n",
     "    \n",
@@ -205,11 +205,11 @@
     "    return G\n",
     "\n",
     "# Build graph with default window size\n",
-    "G = build_cooccurrence_graph(tokens, window_size=4)\n",
+    "G = build_cooccurrence_graph(tokens, window_size=3)\n",
     "\n",
     "print(f\"Graph Statistics:\")\n",
     "print(f\"  Nodes: {G.number_of_nodes()}\")\n",
-    "print(f\"  Edges: {G.number_of_edges()}\")"
+    "print(f\"  Edges: {G.number_of_edges()}\")\n"
    ]
   },
   {
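
The two hunks above only touch the default window size; the loop body of build_cooccurrence_graph is elided by the diff. A plausible completion, assuming a standard sliding-window scheme and the edge 'weight' attribute the plotting cell below reads:

    import networkx as nx

    def build_cooccurrence_graph(tokens, window_size=3):
        """Build a co-occurrence graph from tokens."""
        G = nx.Graph()
        G.add_nodes_from(tokens)
        for i, u in enumerate(tokens):
            # Connect each token to the following tokens inside the window.
            for v in tokens[i + 1 : i + window_size]:
                if u == v:
                    continue
                if G.has_edge(u, v):
                    G[u][v]["weight"] += 1
                else:
                    G.add_edge(u, v, weight=1)
        return G

    G = build_cooccurrence_graph(["graph", "based", "ranking", "of", "graph", "nodes"])
    print(G.number_of_nodes(), G.number_of_edges())
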
@@ -241,10 +241,10 @@
     "edge_labels = {(u, v): G[u][v]['weight'] for u, v in G.edges()}\n",
     "nx.draw_networkx_edge_labels(G, pos, edge_labels, font_size=8, ax=ax)\n",
     "\n",
-    "ax.set_title(\"Co-occurrence Graph (window_size=4)\", fontsize=14, fontweight='bold')\n",
+    "ax.set_title(\"Co-occurrence Graph (window_size=3)\", fontsize=14, fontweight='bold')\n",
     "ax.axis('off')\n",
     "plt.tight_layout()\n",
-    "plt.show()"
+    "plt.show()\n"
    ]
   },
   {
@@ -345,13 +345,13 @@
     "    return scores, history\n",
     "\n",
     "# Run PageRank\n",
-    "G = build_cooccurrence_graph(tokens, window_size=4)\n",
+    "G = build_cooccurrence_graph(tokens, window_size=3)\n",
     "scores, history = pagerank_with_history(G)\n",
     "\n",
     "print(f\"PageRank converged in {len(history)-1} iterations\")\n",
     "print(f\"\\nFinal scores (sorted by importance):\")\n",
     "for word, score in sorted(scores.items(), key=lambda x: -x[1]):\n",
-    "    print(f\"  {word:<15} {score:.4f}\")"
+    "    print(f\"  {word:<15} {score:.4f}\")\n"
    ]
   },
   {
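
The body of pagerank_with_history is outside this diff; a minimal power-iteration sketch that is consistent with how the notebook uses its return values (a score dict plus one history entry per iteration):

    def pagerank_with_history(G, damping=0.85, max_iter=100, tol=1e-6):
        # Weighted PageRank over an undirected co-occurrence graph,
        # keeping a snapshot of the scores after every iteration.
        nodes = list(G)
        n = len(nodes)
        scores = {v: 1.0 / n for v in nodes}
        history = [dict(scores)]
        for _ in range(max_iter):
            new = {}
            for v in nodes:
                rank = 0.0
                for u in G.neighbors(v):
                    # Distribute u's score proportionally to edge weights.
                    strength = sum(d["weight"] for _, _, d in G.edges(u, data=True))
                    rank += G[u][v]["weight"] / strength * scores[u]
                new[v] = (1 - damping) / n + damping * rank
            history.append(dict(new))
            if sum(abs(new[v] - scores[v]) for v in nodes) < tol:
                scores = new
                break
            scores = new
        return scores, history
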
@@ -391,7 +391,7 @@
     "# Visualize graph with node size proportional to score\n",
     "fig, ax = plt.subplots(figsize=(12, 8))\n",
     "\n",
-    "G = build_cooccurrence_graph(tokens, window_size=4)\n",
+    "G = build_cooccurrence_graph(tokens, window_size=3)\n",
     "pos = nx.spring_layout(G, k=2, iterations=50, seed=42)\n",
     "\n",
     "# Node sizes based on PageRank scores\n",
@@ -414,7 +414,7 @@
     "          fontsize=14, fontweight='bold')\n",
     "ax.axis('off')\n",
     "plt.tight_layout()\n",
-    "plt.show()"
+    "plt.show()\n"
    ]
   },
   {
{rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/notebooks/04_benchmarks.ipynb
@@ -27,7 +27,7 @@
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n",
-     "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
+     "\u001b[38;5;2m\u2714 Download and installation successful\u001b[0m\n",
      "You can now load the package via spacy.load('en_core_web_sm')\n"
    ]
   }
@@ -240,20 +240,20 @@
     "\n",
     "SMALL TEXT (~16 words)\n",
     "--------------------------------------------------\n",
-    "rapid_textrank:     4.44 ms (±1.38)\n",
-    "pytextrank:         7.54 ms (±1.00)\n",
+    "rapid_textrank:     4.44 ms (\u00b11.38)\n",
+    "pytextrank:         7.54 ms (\u00b11.00)\n",
     "Speedup:      1.7x faster\n",
     "\n",
     "MEDIUM TEXT (~100 words)\n",
     "--------------------------------------------------\n",
-    "rapid_textrank:     3.08 ms (±0.32)\n",
-    "pytextrank:        66.14 ms (±104.73)\n",
+    "rapid_textrank:     3.08 ms (\u00b10.32)\n",
+    "pytextrank:        66.14 ms (\u00b1104.73)\n",
     "Speedup:     21.4x faster\n",
     "\n",
     "LARGE TEXT (~660 words)\n",
     "--------------------------------------------------\n",
-    "rapid_textrank:     4.12 ms (±0.83)\n",
-    "pytextrank:       184.31 ms (±26.20)\n",
+    "rapid_textrank:     4.12 ms (\u00b10.83)\n",
+    "pytextrank:       184.31 ms (\u00b126.20)\n",
     "Speedup:     44.7x faster\n"
    ]
   }
@@ -272,11 +272,11 @@
     "\n",
     "    # Benchmark rapid_textrank\n",
     "    rust_results = benchmark_rapid_textrank(text)\n",
-    "    print(f\"rapid_textrank: {rust_results['mean']:>8.2f} ms (±{rust_results['std']:.2f})\")\n",
+    "    print(f\"rapid_textrank: {rust_results['mean']:>8.2f} ms (\u00b1{rust_results['std']:.2f})\")\n",
     "\n",
     "    # Benchmark pytextrank\n",
     "    py_results = benchmark_pytextrank(text)\n",
-    "    print(f\"pytextrank:     {py_results['mean']:>8.2f} ms (±{py_results['std']:.2f})\")\n",
+    "    print(f\"pytextrank:     {py_results['mean']:>8.2f} ms (\u00b1{py_results['std']:.2f})\")\n",
     "\n",
     "    speedup = py_results['mean'] / rust_results['mean']\n",
     "    print(f\"Speedup: {speedup:>8.1f}x faster\")\n",
@@ -496,20 +496,20 @@
     "\n",
     "SMALL TEXT (21 tokens)\n",
     "--------------------------------------------------\n",
-    "rapid_textrank (JSON API):    0.037 ms (±0.010)\n",
-    "pytextrank (extraction):      1.596 ms (±0.582)\n",
+    "rapid_textrank (JSON API):    0.037 ms (\u00b10.010)\n",
+    "pytextrank (extraction):      1.596 ms (\u00b10.582)\n",
     "Speedup:     42.9x faster\n",
     "\n",
     "MEDIUM TEXT (138 tokens)\n",
     "--------------------------------------------------\n",
-    "rapid_textrank (JSON API):    0.231 ms (±0.101)\n",
-    "pytextrank (extraction):      2.881 ms (±0.472)\n",
+    "rapid_textrank (JSON API):    0.231 ms (\u00b10.101)\n",
+    "pytextrank (extraction):      2.881 ms (\u00b10.472)\n",
     "Speedup:     12.5x faster\n",
     "\n",
     "LARGE TEXT (838 tokens)\n",
     "--------------------------------------------------\n",
-    "rapid_textrank (JSON API):    0.869 ms (±0.068)\n",
-    "pytextrank (extraction):      9.171 ms (±0.891)\n",
+    "rapid_textrank (JSON API):    0.869 ms (\u00b10.068)\n",
+    "pytextrank (extraction):      9.171 ms (\u00b10.891)\n",
     "Speedup:     10.5x faster\n"
    ]
   }
@@ -529,10 +529,10 @@
     "    print(\"-\" * 50)\n",
     "    \n",
     "    rapid_res = benchmark_rapid_extraction_only(tokens)\n",
-    "    print(f\"rapid_textrank (JSON API): {rapid_res['mean']:>8.3f} ms (±{rapid_res['std']:.3f})\")\n",
+    "    print(f\"rapid_textrank (JSON API): {rapid_res['mean']:>8.3f} ms (\u00b1{rapid_res['std']:.3f})\")\n",
     "    \n",
     "    py_res = benchmark_pytextrank_extraction_only(doc)\n",
-    "    print(f\"pytextrank (extraction):   {py_res['mean']:>8.3f} ms (±{py_res['std']:.3f})\")\n",
+    "    print(f\"pytextrank (extraction):   {py_res['mean']:>8.3f} ms (\u00b1{py_res['std']:.3f})\")\n",
     "    \n",
     "    speedup = py_res['mean'] / rapid_res['mean'] if rapid_res['mean'] > 0 else float('inf')\n",
     "    print(f\"Speedup: {speedup:>8.1f}x faster\")\n",
@@ -647,9 +647,9 @@
     "Benchmarking batch processing (100 documents)...\n",
     "============================================================\n",
-    "rapid_textrank (batch JSON):      18.50 ms (±0.37)\n",
+    "rapid_textrank (batch JSON):      18.50 ms (\u00b10.37)\n",
     "  Per document:      0.185 ms\n",
-    "pytextrank (sequential):        3431.75 ms (±849.32)\n",
+    "pytextrank (sequential):        3431.75 ms (\u00b1849.32)\n",
     "  Per document:     34.317 ms\n",
     "\n",
     "Speedup:    185.5x faster\n"
@@ -697,11 +697,11 @@
     "print(\"=\" * 60)\n",
     "\n",
     "rapid_mean, rapid_std = benchmark_rapid_batch(batch_tokens)\n",
-    "print(f\"rapid_textrank (batch JSON): {rapid_mean:>10.2f} ms (±{rapid_std:.2f})\")\n",
+    "print(f\"rapid_textrank (batch JSON): {rapid_mean:>10.2f} ms (\u00b1{rapid_std:.2f})\")\n",
     "print(f\"  Per document: {rapid_mean/num_docs:>10.3f} ms\")\n",
     "\n",
     "py_mean, py_std = benchmark_pytextrank_batch(batch_texts)\n",
-    "print(f\"pytextrank (sequential):     {py_mean:>10.2f} ms (±{py_std:.2f})\")\n",
+    "print(f\"pytextrank (sequential):     {py_mean:>10.2f} ms (\u00b1{py_std:.2f})\")\n",
     "print(f\"  Per document: {py_mean/num_docs:>10.3f} ms\")\n",
     "\n",
     "speedup = py_mean / rapid_mean\n",
@@ -978,4 +978,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
{rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "rapid_textrank"
-version = "0.1.0"
+version = "0.1.1"
 description = "High-performance TextRank implementation with Python bindings"
 readme = { file = "README.md", content-type = "text/markdown" }
 license = "MIT"
{rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/python/rapid_textrank/spacy_component.py
@@ -79,11 +79,16 @@ if SPACY_AVAILABLE:
             "damping": 0.85,
             "max_iterations": 100,
             "convergence_threshold": 1e-6,
-            "window_size": 4,
+            "window_size": 3,
             "top_n": 10,
             "min_phrase_length": 1,
             "max_phrase_length": 4,
             "score_aggregation": "sum",
+            "include_pos": ["ADJ", "NOUN", "PROPN", "VERB"],
+            "use_pos_in_nodes": True,
+            "phrase_grouping": "scrubbed_text",
+            "language": "en",
+            "stopwords": None,
         },
     )
     def create_rapid_textrank(
@@ -97,6 +102,11 @@ if SPACY_AVAILABLE:
         min_phrase_length: int,
         max_phrase_length: int,
         score_aggregation: str,
+        include_pos: Optional[List[str]],
+        use_pos_in_nodes: bool,
+        phrase_grouping: str,
+        language: str,
+        stopwords: Optional[List[str]],
     ):
         """Create a RustTextRank pipeline component."""
         return RustTextRank(
@@ -110,6 +120,11 @@ if SPACY_AVAILABLE:
             min_phrase_length=min_phrase_length,
             max_phrase_length=max_phrase_length,
             score_aggregation=score_aggregation,
+            include_pos=include_pos,
+            use_pos_in_nodes=use_pos_in_nodes,
+            phrase_grouping=phrase_grouping,
+            language=language,
+            stopwords=stopwords,
         )
 
     class RustTextRank:
@@ -135,11 +150,16 @@ if SPACY_AVAILABLE:
            damping: float = 0.85,
            max_iterations: int = 100,
            convergence_threshold: float = 1e-6,
-           window_size: int = 4,
+           window_size: int = 3,
            top_n: int = 10,
            min_phrase_length: int = 1,
            max_phrase_length: int = 4,
            score_aggregation: str = "sum",
+           include_pos: Optional[List[str]] = None,
+           use_pos_in_nodes: bool = True,
+           phrase_grouping: str = "scrubbed_text",
+           language: str = "en",
+           stopwords: Optional[List[str]] = None,
        ):
            self.nlp = nlp
            self.name = name
@@ -152,7 +172,14 @@ if SPACY_AVAILABLE:
                "min_phrase_length": min_phrase_length,
                "max_phrase_length": max_phrase_length,
                "score_aggregation": score_aggregation,
+               "use_pos_in_nodes": use_pos_in_nodes,
+               "phrase_grouping": phrase_grouping,
+               "language": language,
            }
+           if include_pos is not None:
+               self.config["include_pos"] = include_pos
+           if stopwords is not None:
+               self.config["stopwords"] = stopwords
 
            # Register custom extensions
            if not Doc.has_extension("phrases"):
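
Taken together, the spacy_component.py hunks mean the component accepts the five new settings both at construction and through the factory config. A usage sketch; the factory string "rapid_textrank" is inferred from create_rapid_textrank and the @Language.factory decorator sits outside these hunks, so treat the name as an assumption:

    import spacy

    nlp = spacy.load("en_core_web_sm")
    # Factory name assumed; the decorator registering create_rapid_textrank
    # is not visible in this diff.
    nlp.add_pipe(
        "rapid_textrank",
        config={
            "window_size": 3,  # new 0.1.1 default
            "include_pos": ["ADJ", "NOUN", "PROPN", "VERB"],
            "phrase_grouping": "scrubbed_text",
            "stopwords": ["custom", "terms"],
        },
    )
    doc = nlp("Machine learning is a subfield of artificial intelligence.")
    for phrase in doc._.phrases[:5]:  # Doc extension registered by the component
        print(phrase)
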