rapid-textrank 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/Cargo.lock +1 -1
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/Cargo.toml +1 -1
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/PKG-INFO +7 -3
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/README.md +6 -2
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/notebooks/01_quickstart.ipynb +15 -20
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/notebooks/02_algorithm_variants.ipynb +23 -23
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/notebooks/03_explain_algorithm.ipynb +9 -9
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/notebooks/04_benchmarks.ipynb +22 -22
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/pyproject.toml +1 -1
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/python/rapid_textrank/spacy_component.py +29 -2
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/graph/builder.rs +32 -15
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/nlp/tokenizer.rs +6 -10
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/phrase/extraction.rs +75 -34
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/python/json.rs +33 -12
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/python/native.rs +10 -3
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/types.rs +66 -2
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/variants/biased_textrank.rs +32 -11
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/variants/position_rank.rs +2 -1
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.beads/.gitignore +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.beads/README.md +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.beads/config.yaml +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.beads/interactions.jsonl +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.beads/issues.jsonl +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.beads/metadata.json +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.gitattributes +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.github/workflows/CI.yml +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.github/workflows/publish-pypi.yml +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.github/workflows/publish-testpypi.yml +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/.gitignore +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/AGENTS.md +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/CLAUDE.md +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/LICENSE +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/benches/benchmark.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/python/rapid_textrank/__init__.py +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/python/tests/test_api.py +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/errors.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/graph/csr.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/graph/mod.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/lib.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/nlp/mod.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/nlp/stopwords.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/pagerank/mod.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/pagerank/personalized.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/pagerank/standard.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/phrase/chunker.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/phrase/dedup.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/phrase/mod.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/python/mod.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/summarizer/mod.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/summarizer/selector.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/summarizer/unit_vector.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/variants/mod.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/src/variants/topic_rank.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/tests/integration_tests.rs +0 -0
- {rapid_textrank-0.1.0 → rapid_textrank-0.1.1}/tests/property_tests.rs +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rapid_textrank
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Classifier: Development Status :: 4 - Beta
|
|
5
5
|
Classifier: Intended Audience :: Developers
|
|
6
6
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -217,12 +217,16 @@ config = TextRankConfig(
|
|
|
217
217
|
damping=0.85, # PageRank damping factor (0-1)
|
|
218
218
|
max_iterations=100, # Maximum PageRank iterations
|
|
219
219
|
convergence_threshold=1e-6,# Convergence threshold
|
|
220
|
-
window_size=
|
|
220
|
+
window_size=3, # Co-occurrence window size
|
|
221
221
|
top_n=10, # Number of results
|
|
222
222
|
min_phrase_length=1, # Minimum words in a phrase
|
|
223
223
|
max_phrase_length=4, # Maximum words in a phrase
|
|
224
224
|
score_aggregation="sum", # How to combine word scores: "sum", "mean", "max", "rms"
|
|
225
|
-
language="en"
|
|
225
|
+
language="en", # Language for stopwords
|
|
226
|
+
include_pos=["NOUN","ADJ","PROPN","VERB"], # POS tags to include in the graph
|
|
227
|
+
use_pos_in_nodes=True, # If True, graph nodes are lemma+POS
|
|
228
|
+
phrase_grouping="scrubbed_text", # "lemma" or "scrubbed_text"
|
|
229
|
+
stopwords=["custom", "terms"] # Additional stopwords (extends built-in list)
|
|
226
230
|
)
|
|
227
231
|
|
|
228
232
|
extractor = BaseTextRank(config=config)
|
|
@@ -182,12 +182,16 @@ config = TextRankConfig(
|
|
|
182
182
|
damping=0.85, # PageRank damping factor (0-1)
|
|
183
183
|
max_iterations=100, # Maximum PageRank iterations
|
|
184
184
|
convergence_threshold=1e-6,# Convergence threshold
|
|
185
|
-
window_size=
|
|
185
|
+
window_size=3, # Co-occurrence window size
|
|
186
186
|
top_n=10, # Number of results
|
|
187
187
|
min_phrase_length=1, # Minimum words in a phrase
|
|
188
188
|
max_phrase_length=4, # Maximum words in a phrase
|
|
189
189
|
score_aggregation="sum", # How to combine word scores: "sum", "mean", "max", "rms"
|
|
190
|
-
language="en"
|
|
190
|
+
language="en", # Language for stopwords
|
|
191
|
+
include_pos=["NOUN","ADJ","PROPN","VERB"], # POS tags to include in the graph
|
|
192
|
+
use_pos_in_nodes=True, # If True, graph nodes are lemma+POS
|
|
193
|
+
phrase_grouping="scrubbed_text", # "lemma" or "scrubbed_text"
|
|
194
|
+
stopwords=["custom", "terms"] # Additional stopwords (extends built-in list)
|
|
191
195
|
)
|
|
192
196
|
|
|
193
197
|
extractor = BaseTextRank(config=config)
|
|
@@ -193,17 +193,11 @@
|
|
|
193
193
|
"source": [
|
|
194
194
|
"from rapid_textrank import TextRankConfig, BaseTextRank\n",
|
|
195
195
|
"\n",
|
|
196
|
-
"# Create a custom configuration\n",
|
|
196
|
+
"# Create a custom configuration (only overriding a few defaults)\n",
|
|
197
197
|
"config = TextRankConfig(\n",
|
|
198
|
-
"
|
|
199
|
-
"
|
|
200
|
-
"
|
|
201
|
-
" window_size=4, # Co-occurrence window size\n",
|
|
202
|
-
" top_n=10, # Number of results\n",
|
|
203
|
-
" min_phrase_length=1, # Minimum words in a phrase\n",
|
|
204
|
-
" max_phrase_length=4, # Maximum words in a phrase\n",
|
|
205
|
-
" score_aggregation=\"sum\", # How to combine word scores: \"sum\", \"mean\", \"max\", \"rms\"\n",
|
|
206
|
-
" language=\"en\" # Language for stopwords\n",
|
|
198
|
+
" top_n=10,\n",
|
|
199
|
+
" score_aggregation=\"sum\",\n",
|
|
200
|
+
" language=\"en\",\n",
|
|
207
201
|
")\n",
|
|
208
202
|
"\n",
|
|
209
203
|
"# Create an extractor with the config\n",
|
|
@@ -214,9 +208,10 @@
|
|
|
214
208
|
"\n",
|
|
215
209
|
"print(f\"Converged: {result.converged}\")\n",
|
|
216
210
|
"print(f\"Iterations: {result.iterations}\")\n",
|
|
217
|
-
"print(f\"
|
|
211
|
+
"print(f\"\n",
|
|
212
|
+
"Top phrases:\")\n",
|
|
218
213
|
"for p in result.phrases[:5]:\n",
|
|
219
|
-
" print(f\" {p.rank}. {p.text}: {p.score:.4f}\")"
|
|
214
|
+
" print(f\" {p.rank}. {p.text}: {p.score:.4f}\")\n"
|
|
220
215
|
]
|
|
221
216
|
},
|
|
222
217
|
{
|
|
@@ -317,7 +312,7 @@
|
|
|
317
312
|
"output_type": "stream",
|
|
318
313
|
"text": [
|
|
319
314
|
"German keywords:\n",
|
|
320
|
-
" 1. Teilgebiet der
|
|
315
|
+
" 1. Teilgebiet der k\u00fcnstlichen Intelligenz: 0.1860\n",
|
|
321
316
|
" 2. aus Erfahrung zu lernen: 0.1768\n",
|
|
322
317
|
" 3. Netze mit vielen Schichten: 0.1184\n"
|
|
323
318
|
]
|
|
@@ -326,9 +321,9 @@
|
|
|
326
321
|
"source": [
|
|
327
322
|
"# German example\n",
|
|
328
323
|
"german_text = \"\"\"\n",
|
|
329
|
-
"Maschinelles Lernen ist ein Teilgebiet der
|
|
324
|
+
"Maschinelles Lernen ist ein Teilgebiet der k\u00fcnstlichen Intelligenz.\n",
|
|
330
325
|
"Deep Learning verwendet neuronale Netze mit vielen Schichten.\n",
|
|
331
|
-
"Diese Technologie
|
|
326
|
+
"Diese Technologie erm\u00f6glicht es Computern, aus Erfahrung zu lernen.\n",
|
|
332
327
|
"\"\"\"\n",
|
|
333
328
|
"\n",
|
|
334
329
|
"keywords_de = extract_keywords(german_text, top_n=5, language=\"de\")\n",
|
|
@@ -350,7 +345,7 @@
|
|
|
350
345
|
"text": [
|
|
351
346
|
"French keywords:\n",
|
|
352
347
|
" 1. branche de l'intelligence artificielle: 0.1906\n",
|
|
353
|
-
" 2. l'analyse de
|
|
348
|
+
" 2. l'analyse de donn\u00e9es complexes: 0.1764\n",
|
|
354
349
|
" 3. de nombreux secteurs industriels: 0.1250\n"
|
|
355
350
|
]
|
|
356
351
|
}
|
|
@@ -359,7 +354,7 @@
|
|
|
359
354
|
"# French example\n",
|
|
360
355
|
"french_text = \"\"\"\n",
|
|
361
356
|
"L'apprentissage automatique est une branche de l'intelligence artificielle.\n",
|
|
362
|
-
"Les
|
|
357
|
+
"Les r\u00e9seaux de neurones profonds permettent l'analyse de donn\u00e9es complexes.\n",
|
|
363
358
|
"Ces technologies transforment de nombreux secteurs industriels.\n",
|
|
364
359
|
"\"\"\"\n",
|
|
365
360
|
"\n",
|
|
@@ -395,10 +390,10 @@
|
|
|
395
390
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
|
396
391
|
"Collecting en-core-web-sm==3.8.0\n",
|
|
397
392
|
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)\n",
|
|
398
|
-
"\u001b[2K \u001b[90m
|
|
393
|
+
"\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m37.8 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0meta \u001b[36m0:00:01\u001b[0m\n",
|
|
399
394
|
"\u001b[?25hInstalling collected packages: en-core-web-sm\n",
|
|
400
395
|
"Successfully installed en-core-web-sm-3.8.0\n",
|
|
401
|
-
"\u001b[38;5;2m
|
|
396
|
+
"\u001b[38;5;2m\u2714 Download and installation successful\u001b[0m\n",
|
|
402
397
|
"You can now load the package via spacy.load('en_core_web_sm')\n"
|
|
403
398
|
]
|
|
404
399
|
}
|
|
@@ -484,4 +479,4 @@
|
|
|
484
479
|
},
|
|
485
480
|
"nbformat": 4,
|
|
486
481
|
"nbformat_minor": 5
|
|
487
|
-
}
|
|
482
|
+
}
|
|
@@ -118,7 +118,7 @@
|
|
|
118
118
|
"\n",
|
|
119
119
|
"Based on [Florescu & Caragea (2017)](https://aclanthology.org/P17-1102/), PositionRank weights words by their position in the document.\n",
|
|
120
120
|
"\n",
|
|
121
|
-
"**Key insight:** In many documents (papers, news articles, reports), important terms appear early
|
|
121
|
+
"**Key insight:** In many documents (papers, news articles, reports), important terms appear early\u2014in titles, abstracts, or introductory paragraphs.\n",
|
|
122
122
|
"\n",
|
|
123
123
|
"**How it differs from BaseTextRank:**\n",
|
|
124
124
|
"- Words appearing early get higher initial importance\n",
|
|
@@ -555,7 +555,7 @@
|
|
|
555
555
|
" ],\n",
|
|
556
556
|
" \"config\": {\n",
|
|
557
557
|
" \"top_n\": 5,\n",
|
|
558
|
-
" \"window_size\":
|
|
558
|
+
" \"window_size\": 3,\n",
|
|
559
559
|
" \"damping\": 0.85\n",
|
|
560
560
|
" }\n",
|
|
561
561
|
"}\n",
|
|
@@ -565,7 +565,7 @@
|
|
|
565
565
|
"\n",
|
|
566
566
|
"print(\"Single document result:\")\n",
|
|
567
567
|
"for phrase in result[\"phrases\"]:\n",
|
|
568
|
-
" print(f\" {phrase['text']}: {phrase['score']:.4f}\")"
|
|
568
|
+
" print(f\" {phrase['text']}: {phrase['score']:.4f}\")\n"
|
|
569
569
|
]
|
|
570
570
|
},
|
|
571
571
|
{
|
|
@@ -639,27 +639,27 @@
|
|
|
639
639
|
"\n",
|
|
640
640
|
"```\n",
|
|
641
641
|
" START\n",
|
|
642
|
-
"
|
|
643
|
-
"
|
|
644
|
-
"
|
|
645
|
-
"
|
|
646
|
-
"
|
|
647
|
-
"
|
|
648
|
-
"
|
|
642
|
+
" \u2502\n",
|
|
643
|
+
" \u25bc\n",
|
|
644
|
+
" \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n",
|
|
645
|
+
" \u2502 Do you have specific \u2502\n",
|
|
646
|
+
" \u2502 topics to focus on? \u2502\n",
|
|
647
|
+
" \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n",
|
|
648
|
+
" \u2502 \u2502\n",
|
|
649
649
|
" YES NO\n",
|
|
650
|
-
"
|
|
651
|
-
"
|
|
652
|
-
"
|
|
653
|
-
"
|
|
654
|
-
"
|
|
655
|
-
"
|
|
656
|
-
"
|
|
650
|
+
" \u2502 \u2502\n",
|
|
651
|
+
" \u25bc \u25bc\n",
|
|
652
|
+
" \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510 \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n",
|
|
653
|
+
" \u2502 BiasedTextRank\u2502 \u2502 Is key info at the \u2502\n",
|
|
654
|
+
" \u2502 \u2502 \u2502 beginning of the doc? \u2502\n",
|
|
655
|
+
" \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518 \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n",
|
|
656
|
+
" \u2502 \u2502\n",
|
|
657
657
|
" YES NO\n",
|
|
658
|
-
"
|
|
659
|
-
"
|
|
660
|
-
"
|
|
661
|
-
"
|
|
662
|
-
"
|
|
658
|
+
" \u2502 \u2502\n",
|
|
659
|
+
" \u25bc \u25bc\n",
|
|
660
|
+
" \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510 \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n",
|
|
661
|
+
" \u2502 PositionRank \u2502 \u2502 BaseTextRank \u2502\n",
|
|
662
|
+
" \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518 \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n",
|
|
663
663
|
"```\n",
|
|
664
664
|
"\n",
|
|
665
665
|
"### Recommendations by Document Type\n",
|
|
@@ -707,4 +707,4 @@
|
|
|
707
707
|
},
|
|
708
708
|
"nbformat": 4,
|
|
709
709
|
"nbformat_minor": 5
|
|
710
|
-
}
|
|
710
|
+
}
|
|
@@ -179,7 +179,7 @@
|
|
|
179
179
|
"metadata": {},
|
|
180
180
|
"outputs": [],
|
|
181
181
|
"source": [
|
|
182
|
-
"def build_cooccurrence_graph(tokens, window_size=
|
|
182
|
+
"def build_cooccurrence_graph(tokens, window_size=3):\n",
|
|
183
183
|
" \"\"\"Build a co-occurrence graph from tokens.\"\"\"\n",
|
|
184
184
|
" G = nx.Graph()\n",
|
|
185
185
|
" \n",
|
|
@@ -205,11 +205,11 @@
|
|
|
205
205
|
" return G\n",
|
|
206
206
|
"\n",
|
|
207
207
|
"# Build graph with default window size\n",
|
|
208
|
-
"G = build_cooccurrence_graph(tokens, window_size=
|
|
208
|
+
"G = build_cooccurrence_graph(tokens, window_size=3)\n",
|
|
209
209
|
"\n",
|
|
210
210
|
"print(f\"Graph Statistics:\")\n",
|
|
211
211
|
"print(f\" Nodes: {G.number_of_nodes()}\")\n",
|
|
212
|
-
"print(f\" Edges: {G.number_of_edges()}\")"
|
|
212
|
+
"print(f\" Edges: {G.number_of_edges()}\")\n"
|
|
213
213
|
]
|
|
214
214
|
},
|
|
215
215
|
{
|
|
@@ -241,10 +241,10 @@
|
|
|
241
241
|
"edge_labels = {(u, v): G[u][v]['weight'] for u, v in G.edges()}\n",
|
|
242
242
|
"nx.draw_networkx_edge_labels(G, pos, edge_labels, font_size=8, ax=ax)\n",
|
|
243
243
|
"\n",
|
|
244
|
-
"ax.set_title(\"Co-occurrence Graph (window_size=
|
|
244
|
+
"ax.set_title(\"Co-occurrence Graph (window_size=3)\", fontsize=14, fontweight='bold')\n",
|
|
245
245
|
"ax.axis('off')\n",
|
|
246
246
|
"plt.tight_layout()\n",
|
|
247
|
-
"plt.show()"
|
|
247
|
+
"plt.show()\n"
|
|
248
248
|
]
|
|
249
249
|
},
|
|
250
250
|
{
|
|
@@ -345,13 +345,13 @@
|
|
|
345
345
|
" return scores, history\n",
|
|
346
346
|
"\n",
|
|
347
347
|
"# Run PageRank\n",
|
|
348
|
-
"G = build_cooccurrence_graph(tokens, window_size=
|
|
348
|
+
"G = build_cooccurrence_graph(tokens, window_size=3)\n",
|
|
349
349
|
"scores, history = pagerank_with_history(G)\n",
|
|
350
350
|
"\n",
|
|
351
351
|
"print(f\"PageRank converged in {len(history)-1} iterations\")\n",
|
|
352
352
|
"print(f\"\\nFinal scores (sorted by importance):\")\n",
|
|
353
353
|
"for word, score in sorted(scores.items(), key=lambda x: -x[1]):\n",
|
|
354
|
-
" print(f\" {word:<15} {score:.4f}\")"
|
|
354
|
+
" print(f\" {word:<15} {score:.4f}\")\n"
|
|
355
355
|
]
|
|
356
356
|
},
|
|
357
357
|
{
|
|
@@ -391,7 +391,7 @@
|
|
|
391
391
|
"# Visualize graph with node size proportional to score\n",
|
|
392
392
|
"fig, ax = plt.subplots(figsize=(12, 8))\n",
|
|
393
393
|
"\n",
|
|
394
|
-
"G = build_cooccurrence_graph(tokens, window_size=
|
|
394
|
+
"G = build_cooccurrence_graph(tokens, window_size=3)\n",
|
|
395
395
|
"pos = nx.spring_layout(G, k=2, iterations=50, seed=42)\n",
|
|
396
396
|
"\n",
|
|
397
397
|
"# Node sizes based on PageRank scores\n",
|
|
@@ -414,7 +414,7 @@
|
|
|
414
414
|
" fontsize=14, fontweight='bold')\n",
|
|
415
415
|
"ax.axis('off')\n",
|
|
416
416
|
"plt.tight_layout()\n",
|
|
417
|
-
"plt.show()"
|
|
417
|
+
"plt.show()\n"
|
|
418
418
|
]
|
|
419
419
|
},
|
|
420
420
|
{
|
|
@@ -27,7 +27,7 @@
|
|
|
27
27
|
"output_type": "stream",
|
|
28
28
|
"text": [
|
|
29
29
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
|
30
|
-
"\u001b[38;5;2m
|
|
30
|
+
"\u001b[38;5;2m\u2714 Download and installation successful\u001b[0m\n",
|
|
31
31
|
"You can now load the package via spacy.load('en_core_web_sm')\n"
|
|
32
32
|
]
|
|
33
33
|
}
|
|
@@ -240,20 +240,20 @@
|
|
|
240
240
|
"\n",
|
|
241
241
|
"SMALL TEXT (~16 words)\n",
|
|
242
242
|
"--------------------------------------------------\n",
|
|
243
|
-
"rapid_textrank: 4.44 ms (
|
|
244
|
-
"pytextrank: 7.54 ms (
|
|
243
|
+
"rapid_textrank: 4.44 ms (\u00b11.38)\n",
|
|
244
|
+
"pytextrank: 7.54 ms (\u00b11.00)\n",
|
|
245
245
|
"Speedup: 1.7x faster\n",
|
|
246
246
|
"\n",
|
|
247
247
|
"MEDIUM TEXT (~100 words)\n",
|
|
248
248
|
"--------------------------------------------------\n",
|
|
249
|
-
"rapid_textrank: 3.08 ms (
|
|
250
|
-
"pytextrank: 66.14 ms (
|
|
249
|
+
"rapid_textrank: 3.08 ms (\u00b10.32)\n",
|
|
250
|
+
"pytextrank: 66.14 ms (\u00b1104.73)\n",
|
|
251
251
|
"Speedup: 21.4x faster\n",
|
|
252
252
|
"\n",
|
|
253
253
|
"LARGE TEXT (~660 words)\n",
|
|
254
254
|
"--------------------------------------------------\n",
|
|
255
|
-
"rapid_textrank: 4.12 ms (
|
|
256
|
-
"pytextrank: 184.31 ms (
|
|
255
|
+
"rapid_textrank: 4.12 ms (\u00b10.83)\n",
|
|
256
|
+
"pytextrank: 184.31 ms (\u00b126.20)\n",
|
|
257
257
|
"Speedup: 44.7x faster\n"
|
|
258
258
|
]
|
|
259
259
|
}
|
|
@@ -272,11 +272,11 @@
|
|
|
272
272
|
"\n",
|
|
273
273
|
" # Benchmark rapid_textrank\n",
|
|
274
274
|
" rust_results = benchmark_rapid_textrank(text)\n",
|
|
275
|
-
" print(f\"rapid_textrank: {rust_results['mean']:>8.2f} ms (
|
|
275
|
+
" print(f\"rapid_textrank: {rust_results['mean']:>8.2f} ms (\u00b1{rust_results['std']:.2f})\")\n",
|
|
276
276
|
"\n",
|
|
277
277
|
" # Benchmark pytextrank\n",
|
|
278
278
|
" py_results = benchmark_pytextrank(text)\n",
|
|
279
|
-
" print(f\"pytextrank: {py_results['mean']:>8.2f} ms (
|
|
279
|
+
" print(f\"pytextrank: {py_results['mean']:>8.2f} ms (\u00b1{py_results['std']:.2f})\")\n",
|
|
280
280
|
"\n",
|
|
281
281
|
" speedup = py_results['mean'] / rust_results['mean']\n",
|
|
282
282
|
" print(f\"Speedup: {speedup:>8.1f}x faster\")\n",
|
|
@@ -496,20 +496,20 @@
|
|
|
496
496
|
"\n",
|
|
497
497
|
"SMALL TEXT (21 tokens)\n",
|
|
498
498
|
"--------------------------------------------------\n",
|
|
499
|
-
"rapid_textrank (JSON API): 0.037 ms (
|
|
500
|
-
"pytextrank (extraction): 1.596 ms (
|
|
499
|
+
"rapid_textrank (JSON API): 0.037 ms (\u00b10.010)\n",
|
|
500
|
+
"pytextrank (extraction): 1.596 ms (\u00b10.582)\n",
|
|
501
501
|
"Speedup: 42.9x faster\n",
|
|
502
502
|
"\n",
|
|
503
503
|
"MEDIUM TEXT (138 tokens)\n",
|
|
504
504
|
"--------------------------------------------------\n",
|
|
505
|
-
"rapid_textrank (JSON API): 0.231 ms (
|
|
506
|
-
"pytextrank (extraction): 2.881 ms (
|
|
505
|
+
"rapid_textrank (JSON API): 0.231 ms (\u00b10.101)\n",
|
|
506
|
+
"pytextrank (extraction): 2.881 ms (\u00b10.472)\n",
|
|
507
507
|
"Speedup: 12.5x faster\n",
|
|
508
508
|
"\n",
|
|
509
509
|
"LARGE TEXT (838 tokens)\n",
|
|
510
510
|
"--------------------------------------------------\n",
|
|
511
|
-
"rapid_textrank (JSON API): 0.869 ms (
|
|
512
|
-
"pytextrank (extraction): 9.171 ms (
|
|
511
|
+
"rapid_textrank (JSON API): 0.869 ms (\u00b10.068)\n",
|
|
512
|
+
"pytextrank (extraction): 9.171 ms (\u00b10.891)\n",
|
|
513
513
|
"Speedup: 10.5x faster\n"
|
|
514
514
|
]
|
|
515
515
|
}
|
|
@@ -529,10 +529,10 @@
|
|
|
529
529
|
" print(\"-\" * 50)\n",
|
|
530
530
|
" \n",
|
|
531
531
|
" rapid_res = benchmark_rapid_extraction_only(tokens)\n",
|
|
532
|
-
" print(f\"rapid_textrank (JSON API): {rapid_res['mean']:>8.3f} ms (
|
|
532
|
+
" print(f\"rapid_textrank (JSON API): {rapid_res['mean']:>8.3f} ms (\u00b1{rapid_res['std']:.3f})\")\n",
|
|
533
533
|
" \n",
|
|
534
534
|
" py_res = benchmark_pytextrank_extraction_only(doc)\n",
|
|
535
|
-
" print(f\"pytextrank (extraction): {py_res['mean']:>8.3f} ms (
|
|
535
|
+
" print(f\"pytextrank (extraction): {py_res['mean']:>8.3f} ms (\u00b1{py_res['std']:.3f})\")\n",
|
|
536
536
|
" \n",
|
|
537
537
|
" speedup = py_res['mean'] / rapid_res['mean'] if rapid_res['mean'] > 0 else float('inf')\n",
|
|
538
538
|
" print(f\"Speedup: {speedup:>8.1f}x faster\")\n",
|
|
@@ -647,9 +647,9 @@
|
|
|
647
647
|
"text": [
|
|
648
648
|
"Benchmarking batch processing (100 documents)...\n",
|
|
649
649
|
"============================================================\n",
|
|
650
|
-
"rapid_textrank (batch JSON): 18.50 ms (
|
|
650
|
+
"rapid_textrank (batch JSON): 18.50 ms (\u00b10.37)\n",
|
|
651
651
|
" Per document: 0.185 ms\n",
|
|
652
|
-
"pytextrank (sequential): 3431.75 ms (
|
|
652
|
+
"pytextrank (sequential): 3431.75 ms (\u00b1849.32)\n",
|
|
653
653
|
" Per document: 34.317 ms\n",
|
|
654
654
|
"\n",
|
|
655
655
|
"Speedup: 185.5x faster\n"
|
|
@@ -697,11 +697,11 @@
|
|
|
697
697
|
"print(\"=\" * 60)\n",
|
|
698
698
|
"\n",
|
|
699
699
|
"rapid_mean, rapid_std = benchmark_rapid_batch(batch_tokens)\n",
|
|
700
|
-
"print(f\"rapid_textrank (batch JSON): {rapid_mean:>10.2f} ms (
|
|
700
|
+
"print(f\"rapid_textrank (batch JSON): {rapid_mean:>10.2f} ms (\u00b1{rapid_std:.2f})\")\n",
|
|
701
701
|
"print(f\" Per document: {rapid_mean/num_docs:>10.3f} ms\")\n",
|
|
702
702
|
"\n",
|
|
703
703
|
"py_mean, py_std = benchmark_pytextrank_batch(batch_texts)\n",
|
|
704
|
-
"print(f\"pytextrank (sequential): {py_mean:>10.2f} ms (
|
|
704
|
+
"print(f\"pytextrank (sequential): {py_mean:>10.2f} ms (\u00b1{py_std:.2f})\")\n",
|
|
705
705
|
"print(f\" Per document: {py_mean/num_docs:>10.3f} ms\")\n",
|
|
706
706
|
"\n",
|
|
707
707
|
"speedup = py_mean / rapid_mean\n",
|
|
@@ -978,4 +978,4 @@
|
|
|
978
978
|
},
|
|
979
979
|
"nbformat": 4,
|
|
980
980
|
"nbformat_minor": 5
|
|
981
|
-
}
|
|
981
|
+
}
|
|
@@ -4,7 +4,7 @@ build-backend = "maturin"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "rapid_textrank"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.1.1"
|
|
8
8
|
description = "High-performance TextRank implementation with Python bindings"
|
|
9
9
|
readme = { file = "README.md", content-type = "text/markdown" }
|
|
10
10
|
license = "MIT"
|
|
@@ -79,11 +79,16 @@ if SPACY_AVAILABLE:
|
|
|
79
79
|
"damping": 0.85,
|
|
80
80
|
"max_iterations": 100,
|
|
81
81
|
"convergence_threshold": 1e-6,
|
|
82
|
-
"window_size":
|
|
82
|
+
"window_size": 3,
|
|
83
83
|
"top_n": 10,
|
|
84
84
|
"min_phrase_length": 1,
|
|
85
85
|
"max_phrase_length": 4,
|
|
86
86
|
"score_aggregation": "sum",
|
|
87
|
+
"include_pos": ["ADJ", "NOUN", "PROPN", "VERB"],
|
|
88
|
+
"use_pos_in_nodes": True,
|
|
89
|
+
"phrase_grouping": "scrubbed_text",
|
|
90
|
+
"language": "en",
|
|
91
|
+
"stopwords": None,
|
|
87
92
|
},
|
|
88
93
|
)
|
|
89
94
|
def create_rapid_textrank(
|
|
@@ -97,6 +102,11 @@ if SPACY_AVAILABLE:
|
|
|
97
102
|
min_phrase_length: int,
|
|
98
103
|
max_phrase_length: int,
|
|
99
104
|
score_aggregation: str,
|
|
105
|
+
include_pos: Optional[List[str]],
|
|
106
|
+
use_pos_in_nodes: bool,
|
|
107
|
+
phrase_grouping: str,
|
|
108
|
+
language: str,
|
|
109
|
+
stopwords: Optional[List[str]],
|
|
100
110
|
):
|
|
101
111
|
"""Create a RustTextRank pipeline component."""
|
|
102
112
|
return RustTextRank(
|
|
@@ -110,6 +120,11 @@ if SPACY_AVAILABLE:
|
|
|
110
120
|
min_phrase_length=min_phrase_length,
|
|
111
121
|
max_phrase_length=max_phrase_length,
|
|
112
122
|
score_aggregation=score_aggregation,
|
|
123
|
+
include_pos=include_pos,
|
|
124
|
+
use_pos_in_nodes=use_pos_in_nodes,
|
|
125
|
+
phrase_grouping=phrase_grouping,
|
|
126
|
+
language=language,
|
|
127
|
+
stopwords=stopwords,
|
|
113
128
|
)
|
|
114
129
|
|
|
115
130
|
class RustTextRank:
|
|
@@ -135,11 +150,16 @@ if SPACY_AVAILABLE:
|
|
|
135
150
|
damping: float = 0.85,
|
|
136
151
|
max_iterations: int = 100,
|
|
137
152
|
convergence_threshold: float = 1e-6,
|
|
138
|
-
window_size: int = 4,
|
|
153
|
+
window_size: int = 3,
|
|
139
154
|
top_n: int = 10,
|
|
140
155
|
min_phrase_length: int = 1,
|
|
141
156
|
max_phrase_length: int = 4,
|
|
142
157
|
score_aggregation: str = "sum",
|
|
158
|
+
include_pos: Optional[List[str]] = None,
|
|
159
|
+
use_pos_in_nodes: bool = True,
|
|
160
|
+
phrase_grouping: str = "scrubbed_text",
|
|
161
|
+
language: str = "en",
|
|
162
|
+
stopwords: Optional[List[str]] = None,
|
|
143
163
|
):
|
|
144
164
|
self.nlp = nlp
|
|
145
165
|
self.name = name
|
|
@@ -152,7 +172,14 @@ if SPACY_AVAILABLE:
|
|
|
152
172
|
"min_phrase_length": min_phrase_length,
|
|
153
173
|
"max_phrase_length": max_phrase_length,
|
|
154
174
|
"score_aggregation": score_aggregation,
|
|
175
|
+
"use_pos_in_nodes": use_pos_in_nodes,
|
|
176
|
+
"phrase_grouping": phrase_grouping,
|
|
177
|
+
"language": language,
|
|
155
178
|
}
|
|
179
|
+
if include_pos is not None:
|
|
180
|
+
self.config["include_pos"] = include_pos
|
|
181
|
+
if stopwords is not None:
|
|
182
|
+
self.config["stopwords"] = stopwords
|
|
156
183
|
|
|
157
184
|
# Register custom extensions
|
|
158
185
|
if not Doc.has_extension("phrases"):
|