text-summarizer-aweebtaku 1.2.6__py3-none-any.whl → 1.2.7__py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- text_summarizer/__init__.py +3 -3
- text_summarizer/cli.py +96 -96
- text_summarizer/create_shortcuts.py +63 -63
- text_summarizer/data/tennis.csv +9 -0
- text_summarizer/summarizer.py +322 -322
- text_summarizer/ui.py +379 -379
- {text_summarizer_aweebtaku-1.2.6.dist-info → text_summarizer_aweebtaku-1.2.7.dist-info}/METADATA +217 -206
- text_summarizer_aweebtaku-1.2.7.dist-info/RECORD +13 -0
- {text_summarizer_aweebtaku-1.2.6.dist-info → text_summarizer_aweebtaku-1.2.7.dist-info}/WHEEL +1 -1
- {text_summarizer_aweebtaku-1.2.6.dist-info → text_summarizer_aweebtaku-1.2.7.dist-info/licenses}/LICENSE +20 -20
- text_summarizer_aweebtaku-1.2.6.dist-info/RECORD +0 -12
- {text_summarizer_aweebtaku-1.2.6.dist-info → text_summarizer_aweebtaku-1.2.7.dist-info}/entry_points.txt +0 -0
- {text_summarizer_aweebtaku-1.2.6.dist-info → text_summarizer_aweebtaku-1.2.7.dist-info}/top_level.txt +0 -0
text_summarizer/summarizer.py
CHANGED
@@ -1,323 +1,323 @@
The hunk removes all 322 lines of the file and re-adds them with identical text (line 323 is the only unchanged context line), so the file contents appear once below:

import pandas as pd
import numpy as np
import nltk
import os
import zipfile
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Union
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# Download necessary NLTK data
# nltk.download('punkt_tab')
# nltk.download('stopwords')

class TextSummarizer:
    """A class for summarizing text documents using GloVe embeddings and PageRank."""

    def __init__(self, glove_path: Optional[str] = None, num_sentences: int = 5):
        self.num_sentences = num_sentences
        self.word_embeddings: Dict[str, np.ndarray] = {}
        self.stop_words: set = set(stopwords.words('english'))

        # Set default GloVe path
        if glove_path is None:
            glove_path = self._get_default_glove_path()

        self.glove_path = glove_path
        self._load_embeddings()

    def _get_default_glove_path(self):
        """Get the default path for GloVe embeddings."""
        # Use user's home directory for data
        home_dir = Path.home()
        glove_dir = home_dir / '.text_summarizer'
        glove_dir.mkdir(exist_ok=True)
        return glove_dir / 'glove.6B.100d.txt'

    def _download_glove_embeddings(self):
        """Download GloVe embeddings if not present with improved error handling."""
        import requests

        print("GloVe embeddings not found. Downloading from Stanford NLP...")

        # Create directory if it doesn't exist
        glove_file = Path(self.glove_path)
        glove_file.parent.mkdir(exist_ok=True)

        # Download the zip file
        url = "https://nlp.stanford.edu/data/glove.6B.zip"
        zip_path = glove_file.parent / "glove.6B.zip"

        headers = {
            'User-Agent': 'TextSummarizer/1.1.0 (https://github.com/AWeebTaku/Summarizer)',
        }

        try:
            print("Downloading GloVe embeddings (862 MB)...")
            with requests.get(url, headers=headers, stream=True, timeout=30) as response:
                response.raise_for_status()

                total_size = int(response.headers.get('content-length', 0))
                downloaded_size = 0

                with open(zip_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            downloaded_size += len(chunk)
                            if total_size > 0:
                                progress = (downloaded_size / total_size) * 100
                                print(".1f", end='', flush=True)

            print("\nDownload complete. Extracting...")

            # Extract the specific file we need
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extract('glove.6B.100d.txt', glove_file.parent)

            # Verify extraction
            if not glove_file.exists():
                raise FileNotFoundError("Failed to extract GloVe file from zip")

            # Clean up zip file
            zip_path.unlink()

            print(f"GloVe embeddings extracted to {self.glove_path}")

        except requests.exceptions.RequestException as e:
            print(f"Network error during download: {e}")
            raise Exception(f"Failed to download GloVe embeddings: {e}")
        except zipfile.BadZipFile as e:
            print(f"Invalid zip file downloaded: {e}")
            if zip_path.exists():
                zip_path.unlink()
            raise Exception("Downloaded file is not a valid zip archive")
        except Exception as e:
            print(f"Unexpected error during download: {e}")
            if zip_path.exists():
                zip_path.unlink()
            raise

    def _load_embeddings(self):
        """Load GloVe word embeddings from file with optimized memory usage."""
        if not os.path.exists(self.glove_path):
            self._download_glove_embeddings()

        try:
            print(f"Loading GloVe embeddings from {self.glove_path}...")
            word_count = 0

            with open(self.glove_path, 'r', encoding='utf-8', errors='ignore') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue

                    try:
                        values = line.split()
                        if len(values) < 101:  # word + 100 dimensions
                            continue

                        word = values[0]
                        coefs = np.asarray(values[1:101], dtype='float32')  # Only take first 100 dims
                        self.word_embeddings[word] = coefs
                        word_count += 1

                        # Progress update every 50k words
                        if word_count % 50000 == 0:
                            print(f"Loaded {word_count} words...")

                    except (ValueError, IndexError) as e:
                        # Skip malformed lines
                        continue

            print(f"Successfully loaded {len(self.word_embeddings)} word embeddings.")

            if len(self.word_embeddings) == 0:
                raise ValueError("No valid embeddings found in GloVe file")

        except FileNotFoundError:
            raise FileNotFoundError(f"GloVe file not found at {self.glove_path}")
        except Exception as e:
            raise Exception(f"Error loading GloVe embeddings: {e}")

    def preprocess_sentences(self, df: pd.DataFrame) -> List[Dict]:
        """Tokenize articles into sentences and store metadata."""
        all_sentences_data = []
        sentence_counter_global = 0
        for _, article_row in df.iterrows():
            article_id = article_row['article_id']
            article_text = article_row['article_text']
            article_sentences = sent_tokenize(article_text)
            for sent_idx, sentence_text in enumerate(article_sentences):
                all_sentences_data.append({
                    'global_sentence_idx': sentence_counter_global,
                    'article_id': article_id,
                    'sentence_text': sentence_text,
                    'original_article_sentence_idx': sent_idx
                })
                sentence_counter_global += 1
        return all_sentences_data

    def clean_sentences(self, sentences):
        """Clean sentences: remove non-alphabetic, lowercase, remove stopwords."""
        if not sentences:
            return []

        # Use pandas for efficient string operations
        clean_sentences = pd.Series(sentences).str.replace(r"[^a-zA-Z\s]", " ", regex=True)
        clean_sentences = clean_sentences.str.lower()
        clean_sentences = clean_sentences.apply(self._remove_stopwords)
        return clean_sentences.tolist()

    def _remove_stopwords(self, sentence):
        """Remove stopwords from a sentence string."""
        if not isinstance(sentence, str):
            return ""
        words = sentence.split()
        filtered_words = [word for word in words if word not in self.stop_words]
        return " ".join(filtered_words)

    def compute_sentence_vectors(self, clean_sentences):
        """Compute sentence vectors using GloVe embeddings with vectorized operations."""
        if not clean_sentences:
            return []

        sentence_vectors = []
        for sentence in clean_sentences:
            words = sentence.split()
            if words:
                # Get embeddings for all words in sentence
                vectors = []
                for word in words:
                    embedding = self.word_embeddings.get(word, np.zeros(100, dtype=np.float32))
                    vectors.append(embedding)

                if vectors:
                    # Use mean of word vectors
                    v = np.mean(vectors, axis=0)
                else:
                    v = np.zeros(100, dtype=np.float32)
            else:
                v = np.zeros(100, dtype=np.float32)
            sentence_vectors.append(v)

        return sentence_vectors

    def compute_similarity_matrix(self, sentence_vectors):
        """Compute cosine similarity matrix using vectorized operations."""
        if not sentence_vectors:
            return np.array([])

        # Convert to numpy array for vectorized operations
        vectors = np.array(sentence_vectors)
        n = len(vectors)

        # Normalize vectors for faster cosine similarity
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        norms[norms == 0] = 1  # Avoid division by zero
        normalized_vectors = vectors / norms

        # Compute cosine similarity matrix using matrix multiplication
        sim_mat = np.dot(normalized_vectors, normalized_vectors.T)

        # Ensure diagonal is zero (no self-similarity)
        np.fill_diagonal(sim_mat, 0)

        return sim_mat

    def rank_sentences(self, sim_mat):
        """Rank sentences using PageRank with optimized parameters."""
        if sim_mat.size == 0:
            return {}

        try:
            # Create graph from similarity matrix
            nx_graph = nx.from_numpy_array(sim_mat)

            # Use optimized PageRank parameters
            scores = nx.pagerank(
                nx_graph,
                alpha=0.85,  # Damping factor
                max_iter=100,
                tol=1e-6
            )

            return scores
        except Exception as e:
            print(f"Warning: PageRank failed, using uniform scores: {e}")
            # Fallback: return uniform scores
            n = sim_mat.shape[0]
            return {i: 1.0/n for i in range(n)}

    def summarize_article(self, scored_sentences, article_id, df):
        """Generate summary for a specific article."""
        article_sentences = [s for s in scored_sentences if s['article_id'] == article_id]
        if not article_sentences:
            return None, None

        article_sentences.sort(key=lambda x: x['score'], reverse=True)
        top_sentences = article_sentences[:self.num_sentences]
        top_sentences.sort(key=lambda x: x['original_article_sentence_idx'])
        summary = " ".join([s['sentence_text'] for s in top_sentences])

        article_row = df[df['article_id'] == article_id]
        if not article_row.empty:
            article_text = article_row['article_text'].iloc[0]
            return article_text, summary
        return None, None

    def summarize_all_articles(self, scored_sentences, df):
        """Generate summaries for all articles."""
        summaries = {}
        for _, article_row in df.iterrows():
            article_id = article_row['article_id']
            article_text, summary = self.summarize_article(scored_sentences, article_id, df)
            if article_text and summary:
                summaries[article_id] = {'article': article_text, 'summary': summary}
        return summaries

    def run_summarization(self, df):
        """Run the full summarization pipeline."""
        sentences_data = self.preprocess_sentences(df)
        sentences = [s['sentence_text'] for s in sentences_data]
        clean_sentences = self.clean_sentences(sentences)
        sentence_vectors = self.compute_sentence_vectors(clean_sentences)
        sim_mat = self.compute_similarity_matrix(sentence_vectors)
        scores = self.rank_sentences(sim_mat)

        for i, sentence_data in enumerate(sentences_data):
            sentence_data['score'] = scores[i]

        return sentences_data

    def summarize_text(self, text: str, num_sentences: Optional[int] = None) -> str:
        """
        Summarize a single text document.

        Args:
            text (str): The text to summarize
            num_sentences (int, optional): Number of sentences in summary. Defaults to self.num_sentences.

        Returns:
            str: The summarized text
        """
        if not text or not text.strip():
            return ""

        if num_sentences is None:
            num_sentences = self.num_sentences

        # Create a temporary DataFrame
        df = pd.DataFrame([{'article_id': 1, 'article_text': text}])

        # Run summarization pipeline
        scored_sentences = self.run_summarization(df)

        # Get summary
        _, summary = self.summarize_article(scored_sentences, 1, df)

        return summary if summary else text