text-summarizer-aweebtaku 1.1.0-py3-none-any.whl → 1.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- text_summarizer/__init__.py
+++ text_summarizer/__init__.py
@@ -1,3 +1,3 @@
 from .summarizer import TextSummarizer
 
-__version__ = "1.0.0"
+__version__ = "1.2.1"
--- text_summarizer/summarizer.py
+++ text_summarizer/summarizer.py
@@ -2,9 +2,9 @@ import pandas as pd
 import numpy as np
 import nltk
 import os
-import requests
 import zipfile
 from pathlib import Path
+from typing import Dict, List, Tuple, Optional, Union
 from nltk.tokenize import sent_tokenize
 from nltk.corpus import stopwords
 from sklearn.metrics.pairwise import cosine_similarity
@@ -17,10 +17,10 @@ import networkx as nx
 class TextSummarizer:
     """A class for summarizing text documents using GloVe embeddings and PageRank."""
 
-    def __init__(self, glove_path=None, num_sentences=5):
+    def __init__(self, glove_path: Optional[str] = None, num_sentences: int = 5):
         self.num_sentences = num_sentences
-        self.word_embeddings = {}
-        self.stop_words = set(stopwords.words('english'))
+        self.word_embeddings: Dict[str, np.ndarray] = {}
+        self.stop_words: set = set(stopwords.words('english'))
 
         # Set default GloVe path
         if glove_path is None:
@@ -38,7 +38,9 @@ class TextSummarizer:
         return glove_dir / 'glove.6B.100d.txt'
 
     def _download_glove_embeddings(self):
-        """Download GloVe embeddings if not present."""
+        """Download GloVe embeddings if not present with improved error handling."""
+        import requests
+
         print("GloVe embeddings not found. Downloading from Stanford NLP...")
 
         # Create directory if it doesn't exist
@@ -49,22 +51,26 @@ class TextSummarizer:
         url = "https://nlp.stanford.edu/data/glove.6B.zip"
         zip_path = glove_file.parent / "glove.6B.zip"
 
+        headers = {
+            'User-Agent': 'TextSummarizer/1.1.0 (https://github.com/AWeebTaku/Summarizer)',
+        }
+
         try:
             print("Downloading GloVe embeddings (862 MB)...")
-            response = requests.get(url, stream=True)
-            response.raise_for_status()
+            with requests.get(url, headers=headers, stream=True, timeout=30) as response:
+                response.raise_for_status()
 
-            total_size = int(response.headers.get('content-length', 0))
-            downloaded_size = 0
+                total_size = int(response.headers.get('content-length', 0))
+                downloaded_size = 0
 
-            with open(zip_path, 'wb') as f:
-                for chunk in response.iter_content(chunk_size=8192):
-                    if chunk:
-                        f.write(chunk)
-                        downloaded_size += len(chunk)
-                        if total_size > 0:
-                            progress = (downloaded_size / total_size) * 100
-                            print(".1f", end='', flush=True)
+                with open(zip_path, 'wb') as f:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        if chunk:
+                            f.write(chunk)
+                            downloaded_size += len(chunk)
+                            if total_size > 0:
+                                progress = (downloaded_size / total_size) * 100
+                                print(".1f", end='', flush=True)
 
             print("\nDownload complete. Extracting...")
 
@@ -72,102 +78,73 @@ class TextSummarizer:
             with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                 zip_ref.extract('glove.6B.100d.txt', glove_file.parent)
 
+            # Verify extraction
+            if not glove_file.exists():
+                raise FileNotFoundError("Failed to extract GloVe file from zip")
+
             # Clean up zip file
             zip_path.unlink()
 
             print(f"GloVe embeddings extracted to {self.glove_path}")
 
+        except requests.exceptions.RequestException as e:
+            print(f"Network error during download: {e}")
+            raise Exception(f"Failed to download GloVe embeddings: {e}")
+        except zipfile.BadZipFile as e:
+            print(f"Invalid zip file downloaded: {e}")
+            if zip_path.exists():
+                zip_path.unlink()
+            raise Exception("Downloaded file is not a valid zip archive")
         except Exception as e:
-            print(f"Failed to download GloVe embeddings: {e}")
-            print("Please download manually from: https://nlp.stanford.edu/data/glove.6B.zip")
+            print(f"Unexpected error during download: {e}")
+            if zip_path.exists():
+                zip_path.unlink()
             raise
 
     def _load_embeddings(self):
-        """Load GloVe word embeddings from file."""
+        """Load GloVe word embeddings from file with optimized memory usage."""
         if not os.path.exists(self.glove_path):
             self._download_glove_embeddings()
 
         try:
             print(f"Loading GloVe embeddings from {self.glove_path}...")
-            with open(self.glove_path, 'r', encoding='utf-8') as f:
-                for line in f:
-                    values = line.split()
-                    word = values[0]
-                    coefs = np.asarray(values[1:], dtype='float32')
-                    self.word_embeddings[word] = coefs
-            print(f"Loaded {len(self.word_embeddings)} word embeddings.")
-        except FileNotFoundError:
-            raise FileNotFoundError(f"GloVe file not found at {self.glove_path}")
-        except Exception as e:
-            raise Exception(f"Error loading GloVe embeddings: {e}")
+            word_count = 0
 
-    def load_data(self):
-        """Load data interactively."""
-        while True:
-            choice = input("Enter 'P' to paste a single article,\n'U' to upload a CSV with multiple articles,\n'C' to create a new CSV with multiple articles: ").upper()
-            df = pd.DataFrame()
-            save_csv = True
-
-            if choice == 'P':
-                article_text = input("Paste your article text here:\n")
-                df = pd.DataFrame([{'article_id': 1, 'article_text': article_text}])
-                print('DataFrame created from single article.')
-                save_csv = False
-                break
-            elif choice == 'U':
-                print("You chose to load an existing CSV file. It should contain 'article_id' and 'article_text' columns.")
-                save_csv = False
-                while True:
-                    file_name = input("Enter the name of the CSV file (e.g., 'tennis.csv') or type 'cancel' to go back: ").strip()
-                    if file_name.lower() == 'cancel':
-                        break
-                    if os.path.exists(file_name) and file_name.lower().endswith('.csv'):
-                        try:
-                            df = pd.read_csv(file_name)
-                            print(f'CSV file "{file_name}" loaded successfully.')
-                            break
-                        except Exception as e:
-                            print(f"Error reading file '{file_name}': {e}")
-                    else:
-                        print(f"File '{file_name}' not found or is not a CSV. Please try again.")
-                if not df.empty:
-                    break
-            elif choice == 'C':
-                print("You've chosen to create a CSV with multiple articles. Enter 'done' for article ID when finished.")
-                articles_data = []
-                article_counter = 1
-                while True:
-                    article_id_input = input(f"Enter article ID for article {article_counter} (or 'done' to finish): ").strip()
-                    if article_id_input.lower() == 'done':
-                        break
-                    try:
-                        article_id = int(article_id_input)
-                    except ValueError:
-                        print("Invalid Article ID. Please enter a number or 'done'.")
+            with open(self.glove_path, 'r', encoding='utf-8', errors='ignore') as f:
+                for line_num, line in enumerate(f, 1):
+                    line = line.strip()
+                    if not line:
                         continue
-                    article_text = input("Enter article text:\n").strip()
-                    if not article_text:
-                        print("Article text cannot be empty. Please try again.")
+
+                    try:
+                        values = line.split()
+                        if len(values) < 101:  # word + 100 dimensions
+                            continue
+
+                        word = values[0]
+                        coefs = np.asarray(values[1:101], dtype='float32')  # Only take first 100 dims
+                        self.word_embeddings[word] = coefs
+                        word_count += 1
+
+                        # Progress update every 50k words
+                        if word_count % 50000 == 0:
+                            print(f"Loaded {word_count} words...")
+
+                    except (ValueError, IndexError) as e:
+                        # Skip malformed lines
                         continue
-                    articles_data.append({'article_id': article_id, 'article_text': article_text})
-                    article_counter += 1
-                if articles_data:
-                    df = pd.DataFrame(articles_data)
-                    print('DataFrame created from multiple articles.')
-                    break
-                else:
-                    print("No articles were entered. Please try again or choose another option.")
-            else:
-                print("Invalid choice. Please enter 'P', 'U', or 'C'.")
 
-        if not df.empty and save_csv:
-            df.to_csv('article.csv', index=False)
-            print('CSV file "article.csv" created/updated successfully.')
-        elif df.empty:
-            print("No DataFrame was created.")
-        return df
+            print(f"Successfully loaded {len(self.word_embeddings)} word embeddings.")
+
+            if len(self.word_embeddings) == 0:
+                raise ValueError("No valid embeddings found in GloVe file")
+
+        except FileNotFoundError:
+            raise FileNotFoundError(f"GloVe file not found at {self.glove_path}")
+        except Exception as e:
+            raise Exception(f"Error loading GloVe embeddings: {e}")
 
-    def preprocess_sentences(self, df):
+    def preprocess_sentences(self, df: pd.DataFrame) -> List[Dict]:
         """Tokenize articles into sentences and store metadata."""
         all_sentences_data = []
         sentence_counter_global = 0
@@ -187,46 +164,94 @@ class TextSummarizer:
 
     def clean_sentences(self, sentences):
         """Clean sentences: remove non-alphabetic, lowercase, remove stopwords."""
+        if not sentences:
+            return []
+
+        # Use pandas for efficient string operations
         clean_sentences = pd.Series(sentences).str.replace(r"[^a-zA-Z\s]", " ", regex=True)
         clean_sentences = clean_sentences.str.lower()
-        clean_sentences = clean_sentences.apply(lambda s: self._remove_stopwords(s.split()))
+        clean_sentences = clean_sentences.apply(self._remove_stopwords)
         return clean_sentences.tolist()
 
-    def _remove_stopwords(self, sen):
-        """Remove stopwords from a list of words."""
-        return " ".join([word for word in sen if word not in self.stop_words])
+    def _remove_stopwords(self, sentence):
+        """Remove stopwords from a sentence string."""
+        if not isinstance(sentence, str):
+            return ""
+        words = sentence.split()
+        filtered_words = [word for word in words if word not in self.stop_words]
+        return " ".join(filtered_words)
 
     def compute_sentence_vectors(self, clean_sentences):
-        """Compute sentence vectors using GloVe embeddings."""
+        """Compute sentence vectors using GloVe embeddings with vectorized operations."""
+        if not clean_sentences:
+            return []
+
         sentence_vectors = []
         for sentence in clean_sentences:
             words = sentence.split()
             if words:
-                vectors = [self.word_embeddings.get(w, np.zeros(100)) for w in words]
-                v = np.mean(vectors, axis=0)
+                # Get embeddings for all words in sentence
+                vectors = []
+                for word in words:
+                    embedding = self.word_embeddings.get(word, np.zeros(100, dtype=np.float32))
+                    vectors.append(embedding)
+
+                if vectors:
+                    # Use mean of word vectors
+                    v = np.mean(vectors, axis=0)
+                else:
+                    v = np.zeros(100, dtype=np.float32)
             else:
-                v = np.zeros(100)
+                v = np.zeros(100, dtype=np.float32)
             sentence_vectors.append(v)
+
         return sentence_vectors
 
     def compute_similarity_matrix(self, sentence_vectors):
-        """Compute cosine similarity matrix."""
-        n = len(sentence_vectors)
-        sim_mat = np.zeros((n, n))
-        for i in range(n):
-            for j in range(n):
-                if i != j:
-                    sim_mat[i][j] = cosine_similarity(
-                        sentence_vectors[i].reshape(1, -1),
-                        sentence_vectors[j].reshape(1, -1)
-                    )[0, 0]
+        """Compute cosine similarity matrix using vectorized operations."""
+        if not sentence_vectors:
+            return np.array([])
+
+        # Convert to numpy array for vectorized operations
+        vectors = np.array(sentence_vectors)
+        n = len(vectors)
+
+        # Normalize vectors for faster cosine similarity
+        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
+        norms[norms == 0] = 1  # Avoid division by zero
+        normalized_vectors = vectors / norms
+
+        # Compute cosine similarity matrix using matrix multiplication
+        sim_mat = np.dot(normalized_vectors, normalized_vectors.T)
+
+        # Ensure diagonal is zero (no self-similarity)
+        np.fill_diagonal(sim_mat, 0)
+
         return sim_mat
 
     def rank_sentences(self, sim_mat):
-        """Rank sentences using PageRank."""
-        nx_graph = nx.from_numpy_array(sim_mat)
-        scores = nx.pagerank(nx_graph)
-        return scores
+        """Rank sentences using PageRank with optimized parameters."""
+        if sim_mat.size == 0:
+            return {}
+
+        try:
+            # Create graph from similarity matrix
+            nx_graph = nx.from_numpy_array(sim_mat)
+
+            # Use optimized PageRank parameters
+            scores = nx.pagerank(
+                nx_graph,
+                alpha=0.85,  # Damping factor
+                max_iter=100,
+                tol=1e-6
+            )
+
+            return scores
+        except Exception as e:
+            print(f"Warning: PageRank failed, using uniform scores: {e}")
+            # Fallback: return uniform scores
+            n = sim_mat.shape[0]
+            return {i: 1.0/n for i in range(n)}
 
     def summarize_article(self, scored_sentences, article_id, df):
         """Generate summary for a specific article."""
@@ -267,4 +292,32 @@ class TextSummarizer:
         for i, sentence_data in enumerate(sentences_data):
             sentence_data['score'] = scores[i]
 
-        return sentences_data
+        return sentences_data
+
+    def summarize_text(self, text: str, num_sentences: Optional[int] = None) -> str:
+        """
+        Summarize a single text document.
+
+        Args:
+            text (str): The text to summarize
+            num_sentences (int, optional): Number of sentences in summary. Defaults to self.num_sentences.
+
+        Returns:
+            str: The summarized text
+        """
+        if not text or not text.strip():
+            return ""
+
+        if num_sentences is None:
+            num_sentences = self.num_sentences
+
+        # Create a temporary DataFrame
+        df = pd.DataFrame([{'article_id': 1, 'article_text': text}])
+
+        # Run summarization pipeline
+        scored_sentences = self.run_summarization(df)
+
+        # Get summary
+        _, summary = self.summarize_article(scored_sentences, 1, df)
+
+        return summary if summary else text
--- text_summarizer_aweebtaku-1.1.0.dist-info/METADATA
+++ text_summarizer_aweebtaku-1.2.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: text-summarizer-aweebtaku
-Version: 1.1.0
+Version: 1.2.1
 Summary: A text summarization tool using GloVe embeddings and PageRank algorithm
 Home-page: https://github.com/AWeebTaku/Summarizer
 Author: Your Name
@@ -109,20 +109,20 @@ text-summarizer-gui
 
 ```python
 from text_summarizer import TextSummarizer
-import pandas as pd
 
-# Initialize summarizer
-summarizer = TextSummarizer(glove_path='glove.6B.100d.txt')
+# Initialize summarizer (automatic GloVe download)
+summarizer = TextSummarizer(num_sentences=3)
 
-# Load data
-df = pd.DataFrame([{'article_id': 1, 'article_text': 'Your text here...'}])
+# Simple text summarization
+text = "Your long text here..."
+summary = summarizer.summarize_text(text)
+print(summary)
 
-# Run summarization
+# Advanced usage with DataFrame
+import pandas as pd
+df = pd.DataFrame([{'article_id': 1, 'article_text': text}])
 scored_sentences = summarizer.run_summarization(df)
-
-# Get summary for article ID 1
 article_text, summary = summarizer.summarize_article(scored_sentences, 1, df)
-print(summary)
 ```
 
 ## Data Format
--- /dev/null
+++ text_summarizer_aweebtaku-1.2.1.dist-info/RECORD
@@ -0,0 +1,11 @@
+text_summarizer/__init__.py,sha256=Dy8TQ4n2xYpHrFYOht_q0vVjpfGv9DNvao-dmOPUT-s,63
+text_summarizer/cli.py,sha256=rWbSpT1gJ8kVcsTQ-ov6AZkfy5uUz2taAXeSnDEy0Zw,3773
+text_summarizer/summarizer.py,sha256=80RamR76QFtOAZGdVGqy-Bi5xQb3WBbQ2pSYSnOnT5c,12733
+text_summarizer/ui.py,sha256=Ky40zcr-_0zh5I7Kh4Bc8hKrEBdOALe5G4i3ukDJWts,16638
+text_summarizer/data/tennis.csv,sha256=oEPZr4Dy6cmCDtdQ2QYJyJpERzQseuNJ53JP2XyIfBk,12943
+text_summarizer_aweebtaku-1.2.1.dist-info/licenses/LICENSE,sha256=q53YqEH5OACuJ8YmE3i9pND509hapVaOX42ix2AMkZ8,1085
+text_summarizer_aweebtaku-1.2.1.dist-info/METADATA,sha256=zEzfngmeV_kPi5aaeE8NdN5ZmUEeeeU1XziTfuY5K8I,5106
+text_summarizer_aweebtaku-1.2.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+text_summarizer_aweebtaku-1.2.1.dist-info/entry_points.txt,sha256=a8n647pYmETd5RzGoOBcfYtIxxNFNu7P5zctmhpldNY,117
+text_summarizer_aweebtaku-1.2.1.dist-info/top_level.txt,sha256=2s-4Uyii86k2iEeiIi0JghAXW47cEQ8qM_ONYPs9Gh8,16
+text_summarizer_aweebtaku-1.2.1.dist-info/RECORD,,
--- text_summarizer_aweebtaku-1.1.0.dist-info/RECORD
+++ /dev/null
@@ -1,11 +0,0 @@
-text_summarizer/__init__.py,sha256=juqSmwYQLqoiZpyLfxE1sJKoYLNAe_-a3_LOIUV6J6g,63
-text_summarizer/cli.py,sha256=rWbSpT1gJ8kVcsTQ-ov6AZkfy5uUz2taAXeSnDEy0Zw,3773
-text_summarizer/summarizer.py,sha256=YbQOZTxje6ZCwzjncVzGsu1mNn8drLi0Nr4neZpIwQw,11838
-text_summarizer/ui.py,sha256=Ky40zcr-_0zh5I7Kh4Bc8hKrEBdOALe5G4i3ukDJWts,16638
-text_summarizer/data/tennis.csv,sha256=oEPZr4Dy6cmCDtdQ2QYJyJpERzQseuNJ53JP2XyIfBk,12943
-text_summarizer_aweebtaku-1.1.0.dist-info/licenses/LICENSE,sha256=q53YqEH5OACuJ8YmE3i9pND509hapVaOX42ix2AMkZ8,1085
-text_summarizer_aweebtaku-1.1.0.dist-info/METADATA,sha256=urXNWLQtvIgvOPBB1uWeQdhegQX7PeLy6z4yvkVf7TA,5039
-text_summarizer_aweebtaku-1.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-text_summarizer_aweebtaku-1.1.0.dist-info/entry_points.txt,sha256=a8n647pYmETd5RzGoOBcfYtIxxNFNu7P5zctmhpldNY,117
-text_summarizer_aweebtaku-1.1.0.dist-info/top_level.txt,sha256=2s-4Uyii86k2iEeiIi0JghAXW47cEQ8qM_ONYPs9Gh8,16
-text_summarizer_aweebtaku-1.1.0.dist-info/RECORD,,