text-summarizer-aweebtaku 1.0.2__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {text_summarizer_aweebtaku-1.0.2/text_summarizer_aweebtaku.egg-info → text_summarizer_aweebtaku-1.1.0}/PKG-INFO +10 -4
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/README.md +8 -3
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/requirements.txt +2 -1
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/setup.py +2 -2
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer/summarizer.py +62 -43
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0/text_summarizer_aweebtaku.egg-info}/PKG-INFO +10 -4
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer_aweebtaku.egg-info/requires.txt +1 -0
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/LICENSE +0 -0
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/MANIFEST.in +0 -0
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/setup.cfg +0 -0
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer/__init__.py +0 -0
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer/cli.py +0 -0
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer/data/tennis.csv +0 -0
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer/ui.py +0 -0
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer_aweebtaku.egg-info/SOURCES.txt +0 -0
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer_aweebtaku.egg-info/dependency_links.txt +0 -0
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer_aweebtaku.egg-info/entry_points.txt +0 -0
- {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer_aweebtaku.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: text-summarizer-aweebtaku
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: A text summarization tool using GloVe embeddings and PageRank algorithm
|
|
5
5
|
Home-page: https://github.com/AWeebTaku/Summarizer
|
|
6
6
|
Author: Your Name
|
|
@@ -22,6 +22,7 @@ Requires-Dist: numpy
|
|
|
22
22
|
Requires-Dist: nltk
|
|
23
23
|
Requires-Dist: scikit-learn
|
|
24
24
|
Requires-Dist: networkx
|
|
25
|
+
Requires-Dist: requests
|
|
25
26
|
Dynamic: author
|
|
26
27
|
Dynamic: author-email
|
|
27
28
|
Dynamic: classifier
|
|
@@ -60,8 +61,6 @@ A Python-based text summarization tool that uses GloVe word embeddings and PageR
|
|
|
60
61
|
pip install text-summarizer-aweebtaku
|
|
61
62
|
```
|
|
62
63
|
|
|
63
|
-
**Note:** This package includes the GloVe word embeddings file (~400MB), so the installation may take some time.
|
|
64
|
-
|
|
65
64
|
### Install from Source
|
|
66
65
|
|
|
67
66
|
1. Clone the repository:
|
|
@@ -75,7 +74,14 @@ cd Summarizer
|
|
|
75
74
|
pip install -e .
|
|
76
75
|
```
|
|
77
76
|
|
|
78
|
-
|
|
77
|
+
### Download GloVe Embeddings
|
|
78
|
+
|
|
79
|
+
**No manual download required!** The package will automatically download GloVe embeddings (100d, ~400MB) on first use and cache them in your home directory (`~/.text_summarizer/`).
|
|
80
|
+
|
|
81
|
+
If you prefer to use your own GloVe file, you can specify the path:
|
|
82
|
+
```python
|
|
83
|
+
summarizer = TextSummarizer(glove_path='path/to/your/glove.6B.100d.txt')
|
|
84
|
+
```
|
|
79
85
|
|
|
80
86
|
## Usage
|
|
81
87
|
|
|
@@ -24,8 +24,6 @@ A Python-based text summarization tool that uses GloVe word embeddings and PageR
|
|
|
24
24
|
pip install text-summarizer-aweebtaku
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
-
**Note:** This package includes the GloVe word embeddings file (~400MB), so the installation may take some time.
|
|
28
|
-
|
|
29
27
|
### Install from Source
|
|
30
28
|
|
|
31
29
|
1. Clone the repository:
|
|
@@ -39,7 +37,14 @@ cd Summarizer
|
|
|
39
37
|
pip install -e .
|
|
40
38
|
```
|
|
41
39
|
|
|
42
|
-
|
|
40
|
+
### Download GloVe Embeddings
|
|
41
|
+
|
|
42
|
+
**No manual download required!** The package will automatically download GloVe embeddings (100d, ~400MB) on first use and cache them in your home directory (`~/.text_summarizer/`).
|
|
43
|
+
|
|
44
|
+
If you prefer to use your own GloVe file, you can specify the path:
|
|
45
|
+
```python
|
|
46
|
+
summarizer = TextSummarizer(glove_path='path/to/your/glove.6B.100d.txt')
|
|
47
|
+
```
|
|
43
48
|
|
|
44
49
|
## Usage
|
|
45
50
|
|
|
@@ -8,7 +8,7 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:
|
|
|
8
8
|
|
|
9
9
|
setup(
|
|
10
10
|
name="text-summarizer-aweebtaku",
|
|
11
|
-
version="1.0.2",
|
|
11
|
+
version="1.1.0",
|
|
12
12
|
author="Your Name",
|
|
13
13
|
author_email="your.email@example.com",
|
|
14
14
|
description="A text summarization tool using GloVe embeddings and PageRank algorithm",
|
|
@@ -37,6 +37,6 @@ setup(
|
|
|
37
37
|
},
|
|
38
38
|
include_package_data=True,
|
|
39
39
|
package_data={
|
|
40
|
-
"
|
|
40
|
+
"textsummarizer": ["data/*.csv"],
|
|
41
41
|
},
|
|
42
42
|
)
|
{text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer/summarizer.py
RENAMED
|
@@ -2,14 +2,13 @@ import pandas as pd
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import nltk
|
|
4
4
|
import os
|
|
5
|
+
import requests
|
|
6
|
+
import zipfile
|
|
7
|
+
from pathlib import Path
|
|
5
8
|
from nltk.tokenize import sent_tokenize
|
|
6
9
|
from nltk.corpus import stopwords
|
|
7
10
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
8
11
|
import networkx as nx
|
|
9
|
-
import pkg_resources
|
|
10
|
-
import urllib.request
|
|
11
|
-
import zipfile
|
|
12
|
-
import shutil
|
|
13
12
|
|
|
14
13
|
# Download necessary NLTK data
|
|
15
14
|
# nltk.download('punkt_tab')
|
|
@@ -19,68 +18,88 @@ class TextSummarizer:
|
|
|
19
18
|
"""A class for summarizing text documents using GloVe embeddings and PageRank."""
|
|
20
19
|
|
|
21
20
|
def __init__(self, glove_path=None, num_sentences=5):
|
|
22
|
-
if glove_path is None:
|
|
23
|
-
# Try to find GloVe file in package data first
|
|
24
|
-
try:
|
|
25
|
-
glove_path = pkg_resources.resource_filename('text_summarizer', 'glove.6B.100d.txt/glove.6B.100d.txt')
|
|
26
|
-
except (FileNotFoundError, ModuleNotFoundError):
|
|
27
|
-
# Fallback to default path
|
|
28
|
-
glove_path = 'glove.6B.100d.txt/glove.6B.100d.txt'
|
|
29
|
-
|
|
30
|
-
# Download GloVe if it doesn't exist
|
|
31
|
-
if not os.path.exists(glove_path):
|
|
32
|
-
print("GloVe embeddings not found. Downloading...")
|
|
33
|
-
self._download_glove()
|
|
34
|
-
|
|
35
|
-
self.glove_path = glove_path
|
|
36
21
|
self.num_sentences = num_sentences
|
|
37
22
|
self.word_embeddings = {}
|
|
38
23
|
self.stop_words = set(stopwords.words('english'))
|
|
24
|
+
|
|
25
|
+
# Set default GloVe path
|
|
26
|
+
if glove_path is None:
|
|
27
|
+
glove_path = self._get_default_glove_path()
|
|
28
|
+
|
|
29
|
+
self.glove_path = glove_path
|
|
39
30
|
self._load_embeddings()
|
|
40
31
|
|
|
41
|
-
def _download_glove(self):
|
|
42
|
-
"""
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
32
|
+
def _get_default_glove_path(self):
|
|
33
|
+
"""Get the default path for GloVe embeddings."""
|
|
34
|
+
# Use user's home directory for data
|
|
35
|
+
home_dir = Path.home()
|
|
36
|
+
glove_dir = home_dir / '.text_summarizer'
|
|
37
|
+
glove_dir.mkdir(exist_ok=True)
|
|
38
|
+
return glove_dir / 'glove.6B.100d.txt'
|
|
39
|
+
|
|
40
|
+
def _download_glove_embeddings(self):
|
|
41
|
+
"""Download GloVe embeddings if not present."""
|
|
42
|
+
print("GloVe embeddings not found. Downloading from Stanford NLP...")
|
|
43
|
+
|
|
44
|
+
# Create directory if it doesn't exist
|
|
45
|
+
glove_file = Path(self.glove_path)
|
|
46
|
+
glove_file.parent.mkdir(exist_ok=True)
|
|
47
|
+
|
|
48
|
+
# Download the zip file
|
|
49
|
+
url = "https://nlp.stanford.edu/data/glove.6B.zip"
|
|
50
|
+
zip_path = glove_file.parent / "glove.6B.zip"
|
|
51
|
+
|
|
48
52
|
try:
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
+
print("Downloading GloVe embeddings (862 MB)...")
|
|
54
|
+
response = requests.get(url, stream=True)
|
|
55
|
+
response.raise_for_status()
|
|
56
|
+
|
|
57
|
+
total_size = int(response.headers.get('content-length', 0))
|
|
58
|
+
downloaded_size = 0
|
|
59
|
+
|
|
60
|
+
with open(zip_path, 'wb') as f:
|
|
61
|
+
for chunk in response.iter_content(chunk_size=8192):
|
|
62
|
+
if chunk:
|
|
63
|
+
f.write(chunk)
|
|
64
|
+
downloaded_size += len(chunk)
|
|
65
|
+
if total_size > 0:
|
|
66
|
+
progress = (downloaded_size / total_size) * 100
|
|
67
|
+
print(".1f", end='', flush=True)
|
|
68
|
+
|
|
69
|
+
print("\nDownload complete. Extracting...")
|
|
70
|
+
|
|
53
71
|
# Extract the specific file we need
|
|
54
|
-
print("Extracting GloVe embeddings...")
|
|
55
72
|
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
# Clean up
|
|
65
|
-
os.remove(zip_path)
|
|
66
|
-
print("GloVe embeddings downloaded successfully!")
|
|
67
|
-
|
|
73
|
+
zip_ref.extract('glove.6B.100d.txt', glove_file.parent)
|
|
74
|
+
|
|
75
|
+
# Clean up zip file
|
|
76
|
+
zip_path.unlink()
|
|
77
|
+
|
|
78
|
+
print(f"GloVe embeddings extracted to {self.glove_path}")
|
|
79
|
+
|
|
68
80
|
except Exception as e:
|
|
69
81
|
print(f"Failed to download GloVe embeddings: {e}")
|
|
70
|
-
print("Please download manually from:
|
|
82
|
+
print("Please download manually from: https://nlp.stanford.edu/data/glove.6B.zip")
|
|
71
83
|
raise
|
|
72
84
|
|
|
73
85
|
def _load_embeddings(self):
|
|
74
86
|
"""Load GloVe word embeddings from file."""
|
|
87
|
+
if not os.path.exists(self.glove_path):
|
|
88
|
+
self._download_glove_embeddings()
|
|
89
|
+
|
|
75
90
|
try:
|
|
91
|
+
print(f"Loading GloVe embeddings from {self.glove_path}...")
|
|
76
92
|
with open(self.glove_path, 'r', encoding='utf-8') as f:
|
|
77
93
|
for line in f:
|
|
78
94
|
values = line.split()
|
|
79
95
|
word = values[0]
|
|
80
96
|
coefs = np.asarray(values[1:], dtype='float32')
|
|
81
97
|
self.word_embeddings[word] = coefs
|
|
98
|
+
print(f"Loaded {len(self.word_embeddings)} word embeddings.")
|
|
82
99
|
except FileNotFoundError:
|
|
83
100
|
raise FileNotFoundError(f"GloVe file not found at {self.glove_path}")
|
|
101
|
+
except Exception as e:
|
|
102
|
+
raise Exception(f"Error loading GloVe embeddings: {e}")
|
|
84
103
|
|
|
85
104
|
def load_data(self):
|
|
86
105
|
"""Load data interactively."""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: text-summarizer-aweebtaku
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: A text summarization tool using GloVe embeddings and PageRank algorithm
|
|
5
5
|
Home-page: https://github.com/AWeebTaku/Summarizer
|
|
6
6
|
Author: Your Name
|
|
@@ -22,6 +22,7 @@ Requires-Dist: numpy
|
|
|
22
22
|
Requires-Dist: nltk
|
|
23
23
|
Requires-Dist: scikit-learn
|
|
24
24
|
Requires-Dist: networkx
|
|
25
|
+
Requires-Dist: requests
|
|
25
26
|
Dynamic: author
|
|
26
27
|
Dynamic: author-email
|
|
27
28
|
Dynamic: classifier
|
|
@@ -60,8 +61,6 @@ A Python-based text summarization tool that uses GloVe word embeddings and PageR
|
|
|
60
61
|
pip install text-summarizer-aweebtaku
|
|
61
62
|
```
|
|
62
63
|
|
|
63
|
-
**Note:** This package includes the GloVe word embeddings file (~400MB), so the installation may take some time.
|
|
64
|
-
|
|
65
64
|
### Install from Source
|
|
66
65
|
|
|
67
66
|
1. Clone the repository:
|
|
@@ -75,7 +74,14 @@ cd Summarizer
|
|
|
75
74
|
pip install -e .
|
|
76
75
|
```
|
|
77
76
|
|
|
78
|
-
|
|
77
|
+
### Download GloVe Embeddings
|
|
78
|
+
|
|
79
|
+
**No manual download required!** The package will automatically download GloVe embeddings (100d, ~400MB) on first use and cache them in your home directory (`~/.text_summarizer/`).
|
|
80
|
+
|
|
81
|
+
If you prefer to use your own GloVe file, you can specify the path:
|
|
82
|
+
```python
|
|
83
|
+
summarizer = TextSummarizer(glove_path='path/to/your/glove.6B.100d.txt')
|
|
84
|
+
```
|
|
79
85
|
|
|
80
86
|
## Usage
|
|
81
87
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.1.0}/text_summarizer/data/tennis.csv
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|