text-summarizer-aweebtaku 1.0.2__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. {text_summarizer_aweebtaku-1.0.2/text_summarizer_aweebtaku.egg-info → text_summarizer_aweebtaku-1.2.0}/PKG-INFO +19 -13
  2. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/README.md +17 -12
  3. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/requirements.txt +2 -1
  4. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/setup.py +2 -2
  5. text_summarizer_aweebtaku-1.2.0/text_summarizer/summarizer.py +323 -0
  6. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0/text_summarizer_aweebtaku.egg-info}/PKG-INFO +19 -13
  7. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/text_summarizer_aweebtaku.egg-info/requires.txt +1 -0
  8. text_summarizer_aweebtaku-1.0.2/text_summarizer/summarizer.py +0 -251
  9. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/LICENSE +0 -0
  10. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/MANIFEST.in +0 -0
  11. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/setup.cfg +0 -0
  12. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/text_summarizer/__init__.py +0 -0
  13. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/text_summarizer/cli.py +0 -0
  14. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/text_summarizer/data/tennis.csv +0 -0
  15. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/text_summarizer/ui.py +0 -0
  16. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/text_summarizer_aweebtaku.egg-info/SOURCES.txt +0 -0
  17. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/text_summarizer_aweebtaku.egg-info/dependency_links.txt +0 -0
  18. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/text_summarizer_aweebtaku.egg-info/entry_points.txt +0 -0
  19. {text_summarizer_aweebtaku-1.0.2 → text_summarizer_aweebtaku-1.2.0}/text_summarizer_aweebtaku.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: text-summarizer-aweebtaku
- Version: 1.0.2
+ Version: 1.2.0
  Summary: A text summarization tool using GloVe embeddings and PageRank algorithm
  Home-page: https://github.com/AWeebTaku/Summarizer
  Author: Your Name
@@ -22,6 +22,7 @@ Requires-Dist: numpy
  Requires-Dist: nltk
  Requires-Dist: scikit-learn
  Requires-Dist: networkx
+ Requires-Dist: requests
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier
@@ -60,8 +61,6 @@ A Python-based text summarization tool that uses GloVe word embeddings and PageR
  pip install text-summarizer-aweebtaku
  ```

- **Note:** This package includes the GloVe word embeddings file (~400MB), so the installation may take some time.
-
  ### Install from Source

  1. Clone the repository:
@@ -75,7 +74,14 @@ cd Summarizer
  pip install -e .
  ```

- **Note:** The GloVe word embeddings are included in the package, so no additional download is required.
+ ### Download GloVe Embeddings
+
+ **No manual download required!** The package will automatically download GloVe embeddings (100d, ~400MB) on first use and cache them in your home directory (`~/.text_summarizer/`).
+
+ If you prefer to use your own GloVe file, you can specify the path:
+ ```python
+ summarizer = TextSummarizer(glove_path='path/to/your/glove.6B.100d.txt')
+ ```

  ## Usage

@@ -103,20 +109,20 @@ text-summarizer-gui

  ```python
  from text_summarizer import TextSummarizer
- import pandas as pd

- # Initialize summarizer
- summarizer = TextSummarizer(glove_path='glove.6B.100d.txt')
+ # Initialize summarizer (automatic GloVe download)
+ summarizer = TextSummarizer(num_sentences=3)

- # Load data
- df = pd.DataFrame([{'article_id': 1, 'article_text': 'Your text here...'}])
+ # Simple text summarization
+ text = "Your long text here..."
+ summary = summarizer.summarize_text(text)
+ print(summary)

- # Run summarization
+ # Advanced usage with DataFrame
+ import pandas as pd
+ df = pd.DataFrame([{'article_id': 1, 'article_text': text}])
  scored_sentences = summarizer.run_summarization(df)
-
- # Get summary for article ID 1
  article_text, summary = summarizer.summarize_article(scored_sentences, 1, df)
- print(summary)
  ```

  ## Data Format
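
The long description above cuts off at its "## Data Format" section. As a hedged sketch of what that format implies (an editor's example, not part of the diff), batch input uses the two-column shape referenced throughout this diff, `article_id` and `article_text`, together with the `run_summarization` / `summarize_all_articles` API:

```python
import pandas as pd
from text_summarizer import TextSummarizer

# A DataFrame (or CSV) with 'article_id' and 'article_text' columns,
# the shape used by run_summarization and the bundled data/tennis.csv
df = pd.DataFrame([
    {"article_id": 1, "article_text": "First article. It has several sentences."},
    {"article_id": 2, "article_text": "Second article. It also has several sentences."},
])

summarizer = TextSummarizer(num_sentences=2)
scored = summarizer.run_summarization(df)

# summarize_all_articles returns {article_id: {'article': ..., 'summary': ...}}
for article_id, result in summarizer.summarize_all_articles(scored, df).items():
    print(article_id, result["summary"])
```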
@@ -24,8 +24,6 @@ A Python-based text summarization tool that uses GloVe word embeddings and PageR
  pip install text-summarizer-aweebtaku
  ```

- **Note:** This package includes the GloVe word embeddings file (~400MB), so the installation may take some time.
-
  ### Install from Source

  1. Clone the repository:
@@ -39,7 +37,14 @@ cd Summarizer
  pip install -e .
  ```

- **Note:** The GloVe word embeddings are included in the package, so no additional download is required.
+ ### Download GloVe Embeddings
+
+ **No manual download required!** The package will automatically download GloVe embeddings (100d, ~400MB) on first use and cache them in your home directory (`~/.text_summarizer/`).
+
+ If you prefer to use your own GloVe file, you can specify the path:
+ ```python
+ summarizer = TextSummarizer(glove_path='path/to/your/glove.6B.100d.txt')
+ ```

  ## Usage

@@ -67,20 +72,20 @@ text-summarizer-gui

  ```python
  from text_summarizer import TextSummarizer
- import pandas as pd

- # Initialize summarizer
- summarizer = TextSummarizer(glove_path='glove.6B.100d.txt')
+ # Initialize summarizer (automatic GloVe download)
+ summarizer = TextSummarizer(num_sentences=3)

- # Load data
- df = pd.DataFrame([{'article_id': 1, 'article_text': 'Your text here...'}])
+ # Simple text summarization
+ text = "Your long text here..."
+ summary = summarizer.summarize_text(text)
+ print(summary)

- # Run summarization
+ # Advanced usage with DataFrame
+ import pandas as pd
+ df = pd.DataFrame([{'article_id': 1, 'article_text': text}])
  scored_sentences = summarizer.run_summarization(df)
-
- # Get summary for article ID 1
  article_text, summary = summarizer.summarize_article(scored_sentences, 1, df)
- print(summary)
  ```

  ## Data Format
@@ -2,4 +2,5 @@ pandas
  numpy
  nltk
  scikit-learn
- networkx
+ networkx
+ requests
@@ -8,7 +8,7 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:

  setup(
      name="text-summarizer-aweebtaku",
-     version="1.0.2",
+     version="1.2.0",
      author="Your Name",
      author_email="your.email@example.com",
      description="A text summarization tool using GloVe embeddings and PageRank algorithm",
@@ -37,6 +37,6 @@ setup(
      },
      include_package_data=True,
      package_data={
-         "text_summarizer": ["data/*.csv"],
+         "textsummarizer": ["data/*.csv"],
      },
  )
@@ -0,0 +1,323 @@
+ import pandas as pd
+ import numpy as np
+ import nltk
+ import os
+ import zipfile
+ from pathlib import Path
+ from typing import Dict, List, Tuple, Optional, Union
+ from nltk.tokenize import sent_tokenize
+ from nltk.corpus import stopwords
+ from sklearn.metrics.pairwise import cosine_similarity
+ import networkx as nx
+
+ # Download necessary NLTK data
+ # nltk.download('punkt_tab')
+ # nltk.download('stopwords')
+
+ class TextSummarizer:
+     """A class for summarizing text documents using GloVe embeddings and PageRank."""
+
+     def __init__(self, glove_path: Optional[str] = None, num_sentences: int = 5):
+         self.num_sentences = num_sentences
+         self.word_embeddings: Dict[str, np.ndarray] = {}
+         self.stop_words: set = set(stopwords.words('english'))
+
+         # Set default GloVe path
+         if glove_path is None:
+             glove_path = self._get_default_glove_path()
+
+         self.glove_path = glove_path
+         self._load_embeddings()
+
+     def _get_default_glove_path(self):
+         """Get the default path for GloVe embeddings."""
+         # Use user's home directory for data
+         home_dir = Path.home()
+         glove_dir = home_dir / '.text_summarizer'
+         glove_dir.mkdir(exist_ok=True)
+         return glove_dir / 'glove.6B.100d.txt'
+
+     def _download_glove_embeddings(self):
+         """Download GloVe embeddings if not present with improved error handling."""
+         import requests
+
+         print("GloVe embeddings not found. Downloading from Stanford NLP...")
+
+         # Create directory if it doesn't exist
+         glove_file = Path(self.glove_path)
+         glove_file.parent.mkdir(exist_ok=True)
+
+         # Download the zip file
+         url = "https://nlp.stanford.edu/data/glove.6B.zip"
+         zip_path = glove_file.parent / "glove.6B.zip"
+
+         headers = {
+             'User-Agent': 'TextSummarizer/1.1.0 (https://github.com/AWeebTaku/Summarizer)',
+         }
+
+         try:
+             print("Downloading GloVe embeddings (862 MB)...")
+             with requests.get(url, headers=headers, stream=True, timeout=30) as response:
+                 response.raise_for_status()
+
+                 total_size = int(response.headers.get('content-length', 0))
+                 downloaded_size = 0
+
+                 with open(zip_path, 'wb') as f:
+                     for chunk in response.iter_content(chunk_size=8192):
+                         if chunk:
+                             f.write(chunk)
+                             downloaded_size += len(chunk)
+                             if total_size > 0:
+                                 progress = (downloaded_size / total_size) * 100
+                                 print(f"\rDownloading: {progress:.1f}%", end='', flush=True)
+
+             print("\nDownload complete. Extracting...")
+
+             # Extract the specific file we need
+             with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                 zip_ref.extract('glove.6B.100d.txt', glove_file.parent)
+
+             # Verify extraction
+             if not glove_file.exists():
+                 raise FileNotFoundError("Failed to extract GloVe file from zip")
+
+             # Clean up zip file
+             zip_path.unlink()
+
+             print(f"GloVe embeddings extracted to {self.glove_path}")
+
+         except requests.exceptions.RequestException as e:
+             print(f"Network error during download: {e}")
+             raise Exception(f"Failed to download GloVe embeddings: {e}")
+         except zipfile.BadZipFile as e:
+             print(f"Invalid zip file downloaded: {e}")
+             if zip_path.exists():
+                 zip_path.unlink()
+             raise Exception("Downloaded file is not a valid zip archive")
+         except Exception as e:
+             print(f"Unexpected error during download: {e}")
+             if zip_path.exists():
+                 zip_path.unlink()
+             raise
+
+     def _load_embeddings(self):
+         """Load GloVe word embeddings from file with optimized memory usage."""
+         if not os.path.exists(self.glove_path):
+             self._download_glove_embeddings()
+
+         try:
+             print(f"Loading GloVe embeddings from {self.glove_path}...")
+             word_count = 0
+
+             with open(self.glove_path, 'r', encoding='utf-8', errors='ignore') as f:
+                 for line_num, line in enumerate(f, 1):
+                     line = line.strip()
+                     if not line:
+                         continue
+
+                     try:
+                         values = line.split()
+                         if len(values) < 101:  # word + 100 dimensions
+                             continue
+
+                         word = values[0]
+                         coefs = np.asarray(values[1:101], dtype='float32')  # Only take first 100 dims
+                         self.word_embeddings[word] = coefs
+                         word_count += 1
+
+                         # Progress update every 50k words
+                         if word_count % 50000 == 0:
+                             print(f"Loaded {word_count} words...")
+
+                     except (ValueError, IndexError) as e:
+                         # Skip malformed lines
+                         continue
+
+             print(f"Successfully loaded {len(self.word_embeddings)} word embeddings.")
+
+             if len(self.word_embeddings) == 0:
+                 raise ValueError("No valid embeddings found in GloVe file")
+
+         except FileNotFoundError:
+             raise FileNotFoundError(f"GloVe file not found at {self.glove_path}")
+         except Exception as e:
+             raise Exception(f"Error loading GloVe embeddings: {e}")
+
+     def preprocess_sentences(self, df: pd.DataFrame) -> List[Dict]:
+         """Tokenize articles into sentences and store metadata."""
+         all_sentences_data = []
+         sentence_counter_global = 0
+         for _, article_row in df.iterrows():
+             article_id = article_row['article_id']
+             article_text = article_row['article_text']
+             article_sentences = sent_tokenize(article_text)
+             for sent_idx, sentence_text in enumerate(article_sentences):
+                 all_sentences_data.append({
+                     'global_sentence_idx': sentence_counter_global,
+                     'article_id': article_id,
+                     'sentence_text': sentence_text,
+                     'original_article_sentence_idx': sent_idx
+                 })
+                 sentence_counter_global += 1
+         return all_sentences_data
+
+     def clean_sentences(self, sentences):
+         """Clean sentences: remove non-alphabetic, lowercase, remove stopwords."""
+         if not sentences:
+             return []
+
+         # Use pandas for efficient string operations
+         clean_sentences = pd.Series(sentences).str.replace(r"[^a-zA-Z\s]", " ", regex=True)
+         clean_sentences = clean_sentences.str.lower()
+         clean_sentences = clean_sentences.apply(self._remove_stopwords)
+         return clean_sentences.tolist()
+
+     def _remove_stopwords(self, sentence):
+         """Remove stopwords from a sentence string."""
+         if not isinstance(sentence, str):
+             return ""
+         words = sentence.split()
+         filtered_words = [word for word in words if word not in self.stop_words]
+         return " ".join(filtered_words)
+
+     def compute_sentence_vectors(self, clean_sentences):
+         """Compute sentence vectors using GloVe embeddings with vectorized operations."""
+         if not clean_sentences:
+             return []
+
+         sentence_vectors = []
+         for sentence in clean_sentences:
+             words = sentence.split()
+             if words:
+                 # Get embeddings for all words in sentence
+                 vectors = []
+                 for word in words:
+                     embedding = self.word_embeddings.get(word, np.zeros(100, dtype=np.float32))
+                     vectors.append(embedding)
+
+                 if vectors:
+                     # Use mean of word vectors
+                     v = np.mean(vectors, axis=0)
+                 else:
+                     v = np.zeros(100, dtype=np.float32)
+             else:
+                 v = np.zeros(100, dtype=np.float32)
+             sentence_vectors.append(v)
+
+         return sentence_vectors
+
+     def compute_similarity_matrix(self, sentence_vectors):
+         """Compute cosine similarity matrix using vectorized operations."""
+         if not sentence_vectors:
+             return np.array([])
+
+         # Convert to numpy array for vectorized operations
+         vectors = np.array(sentence_vectors)
+         n = len(vectors)
+
+         # Normalize vectors for faster cosine similarity
+         norms = np.linalg.norm(vectors, axis=1, keepdims=True)
+         norms[norms == 0] = 1  # Avoid division by zero
+         normalized_vectors = vectors / norms
+
+         # Compute cosine similarity matrix using matrix multiplication
+         sim_mat = np.dot(normalized_vectors, normalized_vectors.T)
+
+         # Ensure diagonal is zero (no self-similarity)
+         np.fill_diagonal(sim_mat, 0)
+
+         return sim_mat
+
+     def rank_sentences(self, sim_mat):
+         """Rank sentences using PageRank with optimized parameters."""
+         if sim_mat.size == 0:
+             return {}
+
+         try:
+             # Create graph from similarity matrix
+             nx_graph = nx.from_numpy_array(sim_mat)
+
+             # Use optimized PageRank parameters
+             scores = nx.pagerank(
+                 nx_graph,
+                 alpha=0.85,  # Damping factor
+                 max_iter=100,
+                 tol=1e-6
+             )
+
+             return scores
+         except Exception as e:
+             print(f"Warning: PageRank failed, using uniform scores: {e}")
+             # Fallback: return uniform scores
+             n = sim_mat.shape[0]
+             return {i: 1.0/n for i in range(n)}
+
+     def summarize_article(self, scored_sentences, article_id, df):
+         """Generate summary for a specific article."""
+         article_sentences = [s for s in scored_sentences if s['article_id'] == article_id]
+         if not article_sentences:
+             return None, None
+
+         article_sentences.sort(key=lambda x: x['score'], reverse=True)
+         top_sentences = article_sentences[:self.num_sentences]
+         top_sentences.sort(key=lambda x: x['original_article_sentence_idx'])
+         summary = " ".join([s['sentence_text'] for s in top_sentences])
+
+         article_row = df[df['article_id'] == article_id]
+         if not article_row.empty:
+             article_text = article_row['article_text'].iloc[0]
+             return article_text, summary
+         return None, None
+
+     def summarize_all_articles(self, scored_sentences, df):
+         """Generate summaries for all articles."""
+         summaries = {}
+         for _, article_row in df.iterrows():
+             article_id = article_row['article_id']
+             article_text, summary = self.summarize_article(scored_sentences, article_id, df)
+             if article_text and summary:
+                 summaries[article_id] = {'article': article_text, 'summary': summary}
+         return summaries
+
+     def run_summarization(self, df):
+         """Run the full summarization pipeline."""
+         sentences_data = self.preprocess_sentences(df)
+         sentences = [s['sentence_text'] for s in sentences_data]
+         clean_sentences = self.clean_sentences(sentences)
+         sentence_vectors = self.compute_sentence_vectors(clean_sentences)
+         sim_mat = self.compute_similarity_matrix(sentence_vectors)
+         scores = self.rank_sentences(sim_mat)
+
+         for i, sentence_data in enumerate(sentences_data):
+             sentence_data['score'] = scores[i]
+
+         return sentences_data
+
+     def summarize_text(self, text: str, num_sentences: Optional[int] = None) -> str:
+         """
+         Summarize a single text document.
+
+         Args:
+             text (str): The text to summarize
+             num_sentences (int, optional): Number of sentences in summary. Defaults to self.num_sentences.
+
+         Returns:
+             str: The summarized text
+         """
+         if not text or not text.strip():
+             return ""
+
+         if num_sentences is None:
+             num_sentences = self.num_sentences
+
+         # Create a temporary DataFrame
+         df = pd.DataFrame([{'article_id': 1, 'article_text': text}])
+
+         # Run summarization pipeline
+         scored_sentences = self.run_summarization(df)
+
+         # Get summary
+         _, summary = self.summarize_article(scored_sentences, 1, df)
+
+         return summary if summary else text
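
A minimal usage sketch of the new module above (an editor's example, not part of the diff): it assumes the package is installed and, on a first run, network access to fetch `glove.6B.zip`; afterwards the embeddings are read from the `~/.text_summarizer/` cache created by `_get_default_glove_path`.

```python
from pathlib import Path
from text_summarizer import TextSummarizer

# Cache location used by the new default-path logic
cache = Path.home() / ".text_summarizer" / "glove.6B.100d.txt"
print("embeddings cached:", cache.exists())

# First construction downloads and caches the embeddings if the file is missing;
# pass glove_path=... instead to reuse an existing glove.6B.100d.txt
summarizer = TextSummarizer(num_sentences=3)

text = "Your long text here. It should contain several sentences. The top-ranked ones are returned."
print(summarizer.summarize_text(text))
```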
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: text-summarizer-aweebtaku
- Version: 1.0.2
+ Version: 1.2.0
  Summary: A text summarization tool using GloVe embeddings and PageRank algorithm
  Home-page: https://github.com/AWeebTaku/Summarizer
  Author: Your Name
@@ -22,6 +22,7 @@ Requires-Dist: numpy
  Requires-Dist: nltk
  Requires-Dist: scikit-learn
  Requires-Dist: networkx
+ Requires-Dist: requests
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier
@@ -60,8 +61,6 @@ A Python-based text summarization tool that uses GloVe word embeddings and PageR
  pip install text-summarizer-aweebtaku
  ```

- **Note:** This package includes the GloVe word embeddings file (~400MB), so the installation may take some time.
-
  ### Install from Source

  1. Clone the repository:
@@ -75,7 +74,14 @@ cd Summarizer
  pip install -e .
  ```

- **Note:** The GloVe word embeddings are included in the package, so no additional download is required.
+ ### Download GloVe Embeddings
+
+ **No manual download required!** The package will automatically download GloVe embeddings (100d, ~400MB) on first use and cache them in your home directory (`~/.text_summarizer/`).
+
+ If you prefer to use your own GloVe file, you can specify the path:
+ ```python
+ summarizer = TextSummarizer(glove_path='path/to/your/glove.6B.100d.txt')
+ ```

  ## Usage

@@ -103,20 +109,20 @@ text-summarizer-gui

  ```python
  from text_summarizer import TextSummarizer
- import pandas as pd

- # Initialize summarizer
- summarizer = TextSummarizer(glove_path='glove.6B.100d.txt')
+ # Initialize summarizer (automatic GloVe download)
+ summarizer = TextSummarizer(num_sentences=3)

- # Load data
- df = pd.DataFrame([{'article_id': 1, 'article_text': 'Your text here...'}])
+ # Simple text summarization
+ text = "Your long text here..."
+ summary = summarizer.summarize_text(text)
+ print(summary)

- # Run summarization
+ # Advanced usage with DataFrame
+ import pandas as pd
+ df = pd.DataFrame([{'article_id': 1, 'article_text': text}])
  scored_sentences = summarizer.run_summarization(df)
-
- # Get summary for article ID 1
  article_text, summary = summarizer.summarize_article(scored_sentences, 1, df)
- print(summary)
  ```

  ## Data Format
@@ -1,251 +0,0 @@
- import pandas as pd
- import numpy as np
- import nltk
- import os
- from nltk.tokenize import sent_tokenize
- from nltk.corpus import stopwords
- from sklearn.metrics.pairwise import cosine_similarity
- import networkx as nx
- import pkg_resources
- import urllib.request
- import zipfile
- import shutil
-
- # Download necessary NLTK data
- # nltk.download('punkt_tab')
- # nltk.download('stopwords')
-
- class TextSummarizer:
-     """A class for summarizing text documents using GloVe embeddings and PageRank."""
-
-     def __init__(self, glove_path=None, num_sentences=5):
-         if glove_path is None:
-             # Try to find GloVe file in package data first
-             try:
-                 glove_path = pkg_resources.resource_filename('text_summarizer', 'glove.6B.100d.txt/glove.6B.100d.txt')
-             except (FileNotFoundError, ModuleNotFoundError):
-                 # Fallback to default path
-                 glove_path = 'glove.6B.100d.txt/glove.6B.100d.txt'
-
-         # Download GloVe if it doesn't exist
-         if not os.path.exists(glove_path):
-             print("GloVe embeddings not found. Downloading...")
-             self._download_glove()
-
-         self.glove_path = glove_path
-         self.num_sentences = num_sentences
-         self.word_embeddings = {}
-         self.stop_words = set(stopwords.words('english'))
-         self._load_embeddings()
-
-     def _download_glove(self):
-         """Download and extract GloVe embeddings."""
-         glove_dir = 'glove.6B.100d.txt'
-         glove_file = os.path.join(glove_dir, 'glove.6B.100d.txt')
-         zip_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
-         zip_path = 'glove.6B.zip'
-
-         try:
-             # Download the zip file
-             print(f"Downloading GloVe from {zip_url}...")
-             urllib.request.urlretrieve(zip_url, zip_path)
-
-             # Extract the specific file we need
-             print("Extracting GloVe embeddings...")
-             with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-                 # Create directory if it doesn't exist
-                 os.makedirs(glove_dir, exist_ok=True)
-                 # Extract only the 100d file
-                 zip_ref.extract('glove.6B.100d.txt', '.')
-                 # Move to the expected directory
-                 if os.path.exists('glove.6B.100d.txt'):
-                     shutil.move('glove.6B.100d.txt', glove_file)
-
-             # Clean up
-             os.remove(zip_path)
-             print("GloVe embeddings downloaded successfully!")
-
-         except Exception as e:
-             print(f"Failed to download GloVe embeddings: {e}")
-             print("Please download manually from: http://nlp.stanford.edu/data/glove.6B.zip")
-             raise
-
-     def _load_embeddings(self):
-         """Load GloVe word embeddings from file."""
-         try:
-             with open(self.glove_path, 'r', encoding='utf-8') as f:
-                 for line in f:
-                     values = line.split()
-                     word = values[0]
-                     coefs = np.asarray(values[1:], dtype='float32')
-                     self.word_embeddings[word] = coefs
-         except FileNotFoundError:
-             raise FileNotFoundError(f"GloVe file not found at {self.glove_path}")
-
-     def load_data(self):
-         """Load data interactively."""
-         while True:
-             choice = input("Enter 'P' to paste a single article,\n'U' to upload a CSV with multiple articles,\n'C' to create a new CSV with multiple articles: ").upper()
-             df = pd.DataFrame()
-             save_csv = True
-
-             if choice == 'P':
-                 article_text = input("Paste your article text here:\n")
-                 df = pd.DataFrame([{'article_id': 1, 'article_text': article_text}])
-                 print('DataFrame created from single article.')
-                 save_csv = False
-                 break
-             elif choice == 'U':
-                 print("You chose to load an existing CSV file. It should contain 'article_id' and 'article_text' columns.")
-                 save_csv = False
-                 while True:
-                     file_name = input("Enter the name of the CSV file (e.g., 'tennis.csv') or type 'cancel' to go back: ").strip()
-                     if file_name.lower() == 'cancel':
-                         break
-                     if os.path.exists(file_name) and file_name.lower().endswith('.csv'):
-                         try:
-                             df = pd.read_csv(file_name)
-                             print(f'CSV file "{file_name}" loaded successfully.')
-                             break
-                         except Exception as e:
-                             print(f"Error reading file '{file_name}': {e}")
-                     else:
-                         print(f"File '{file_name}' not found or is not a CSV. Please try again.")
-                 if not df.empty:
-                     break
-             elif choice == 'C':
-                 print("You've chosen to create a CSV with multiple articles. Enter 'done' for article ID when finished.")
-                 articles_data = []
-                 article_counter = 1
-                 while True:
-                     article_id_input = input(f"Enter article ID for article {article_counter} (or 'done' to finish): ").strip()
-                     if article_id_input.lower() == 'done':
-                         break
-                     try:
-                         article_id = int(article_id_input)
-                     except ValueError:
-                         print("Invalid Article ID. Please enter a number or 'done'.")
-                         continue
-                     article_text = input("Enter article text:\n").strip()
-                     if not article_text:
-                         print("Article text cannot be empty. Please try again.")
-                         continue
-                     articles_data.append({'article_id': article_id, 'article_text': article_text})
-                     article_counter += 1
-                 if articles_data:
-                     df = pd.DataFrame(articles_data)
-                     print('DataFrame created from multiple articles.')
-                     break
-                 else:
-                     print("No articles were entered. Please try again or choose another option.")
-             else:
-                 print("Invalid choice. Please enter 'P', 'U', or 'C'.")
-
-         if not df.empty and save_csv:
-             df.to_csv('article.csv', index=False)
-             print('CSV file "article.csv" created/updated successfully.')
-         elif df.empty:
-             print("No DataFrame was created.")
-         return df
-
-     def preprocess_sentences(self, df):
-         """Tokenize articles into sentences and store metadata."""
-         all_sentences_data = []
-         sentence_counter_global = 0
-         for _, article_row in df.iterrows():
-             article_id = article_row['article_id']
-             article_text = article_row['article_text']
-             article_sentences = sent_tokenize(article_text)
-             for sent_idx, sentence_text in enumerate(article_sentences):
-                 all_sentences_data.append({
-                     'global_sentence_idx': sentence_counter_global,
-                     'article_id': article_id,
-                     'sentence_text': sentence_text,
-                     'original_article_sentence_idx': sent_idx
-                 })
-                 sentence_counter_global += 1
-         return all_sentences_data
-
-     def clean_sentences(self, sentences):
-         """Clean sentences: remove non-alphabetic, lowercase, remove stopwords."""
-         clean_sentences = pd.Series(sentences).str.replace(r"[^a-zA-Z\s]", " ", regex=True)
-         clean_sentences = clean_sentences.str.lower()
-         clean_sentences = clean_sentences.apply(lambda s: self._remove_stopwords(s.split()))
-         return clean_sentences.tolist()
-
-     def _remove_stopwords(self, sen):
-         """Remove stopwords from a list of words."""
-         return " ".join([word for word in sen if word not in self.stop_words])
-
-     def compute_sentence_vectors(self, clean_sentences):
-         """Compute sentence vectors using GloVe embeddings."""
-         sentence_vectors = []
-         for sentence in clean_sentences:
-             words = sentence.split()
-             if words:
-                 vectors = [self.word_embeddings.get(w, np.zeros(100)) for w in words]
-                 v = np.mean(vectors, axis=0)
-             else:
-                 v = np.zeros(100)
-             sentence_vectors.append(v)
-         return sentence_vectors
-
-     def compute_similarity_matrix(self, sentence_vectors):
-         """Compute cosine similarity matrix."""
-         n = len(sentence_vectors)
-         sim_mat = np.zeros((n, n))
-         for i in range(n):
-             for j in range(n):
-                 if i != j:
-                     sim_mat[i][j] = cosine_similarity(
-                         sentence_vectors[i].reshape(1, -1),
-                         sentence_vectors[j].reshape(1, -1)
-                     )[0, 0]
-         return sim_mat
-
-     def rank_sentences(self, sim_mat):
-         """Rank sentences using PageRank."""
-         nx_graph = nx.from_numpy_array(sim_mat)
-         scores = nx.pagerank(nx_graph)
-         return scores
-
-     def summarize_article(self, scored_sentences, article_id, df):
-         """Generate summary for a specific article."""
-         article_sentences = [s for s in scored_sentences if s['article_id'] == article_id]
-         if not article_sentences:
-             return None, None
-
-         article_sentences.sort(key=lambda x: x['score'], reverse=True)
-         top_sentences = article_sentences[:self.num_sentences]
-         top_sentences.sort(key=lambda x: x['original_article_sentence_idx'])
-         summary = " ".join([s['sentence_text'] for s in top_sentences])
-
-         article_row = df[df['article_id'] == article_id]
-         if not article_row.empty:
-             article_text = article_row['article_text'].iloc[0]
-             return article_text, summary
-         return None, None
-
-     def summarize_all_articles(self, scored_sentences, df):
-         """Generate summaries for all articles."""
-         summaries = {}
-         for _, article_row in df.iterrows():
-             article_id = article_row['article_id']
-             article_text, summary = self.summarize_article(scored_sentences, article_id, df)
-             if article_text and summary:
-                 summaries[article_id] = {'article': article_text, 'summary': summary}
-         return summaries
-
-     def run_summarization(self, df):
-         """Run the full summarization pipeline."""
-         sentences_data = self.preprocess_sentences(df)
-         sentences = [s['sentence_text'] for s in sentences_data]
-         clean_sentences = self.clean_sentences(sentences)
-         sentence_vectors = self.compute_sentence_vectors(clean_sentences)
-         sim_mat = self.compute_similarity_matrix(sentence_vectors)
-         scores = self.rank_sentences(sim_mat)
-
-         for i, sentence_data in enumerate(sentences_data):
-             sentence_data['score'] = scores[i]
-
-         return sentences_data
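
One behavioral note on the rewrite shown above: the removed `compute_similarity_matrix` built the matrix with pairwise `sklearn` calls, while the 1.2.0 version normalizes the vectors once and uses a single matrix product. A small self-contained check (an editor's sketch on random vectors, not package code) showing the two approaches agree up to floating-point tolerance:

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
vectors = rng.normal(size=(5, 100)).astype(np.float32)

# Old approach: O(n^2) pairwise sklearn calls, diagonal left at zero
old = np.zeros((5, 5))
for i in range(5):
    for j in range(5):
        if i != j:
            old[i, j] = cosine_similarity(
                vectors[i].reshape(1, -1), vectors[j].reshape(1, -1)
            )[0, 0]

# New approach: normalize once, then a single matrix product
norms = np.linalg.norm(vectors, axis=1, keepdims=True)
norms[norms == 0] = 1  # avoid division by zero
normalized = vectors / norms
new = normalized @ normalized.T
np.fill_diagonal(new, 0)

print(np.allclose(old, new, atol=1e-5))  # True
```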