text-summarizer-aweebtaku 1.0.2__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- text_summarizer/summarizer.py +219 -107
- {text_summarizer_aweebtaku-1.0.2.dist-info → text_summarizer_aweebtaku-1.2.0.dist-info}/METADATA +19 -13
- {text_summarizer_aweebtaku-1.0.2.dist-info → text_summarizer_aweebtaku-1.2.0.dist-info}/RECORD +7 -7
- {text_summarizer_aweebtaku-1.0.2.dist-info → text_summarizer_aweebtaku-1.2.0.dist-info}/WHEEL +0 -0
- {text_summarizer_aweebtaku-1.0.2.dist-info → text_summarizer_aweebtaku-1.2.0.dist-info}/entry_points.txt +0 -0
- {text_summarizer_aweebtaku-1.0.2.dist-info → text_summarizer_aweebtaku-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {text_summarizer_aweebtaku-1.0.2.dist-info → text_summarizer_aweebtaku-1.2.0.dist-info}/top_level.txt +0 -0
text_summarizer/summarizer.py
CHANGED
@@ -2,11 +2,13 @@ import pandas as pd
 import numpy as np
 import nltk
 import os
+import zipfile
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional, Union
 from nltk.tokenize import sent_tokenize
 from nltk.corpus import stopwords
 from sklearn.metrics.pairwise import cosine_similarity
 import networkx as nx
-import pkg_resources
 
 # Download necessary NLTK data
 # nltk.download('punkt_tab')
@@ -15,100 +17,134 @@ import pkg_resources
 class TextSummarizer:
     """A class for summarizing text documents using GloVe embeddings and PageRank."""
 
-    def __init__(self, glove_path=None, num_sentences=5):
+    def __init__(self, glove_path: Optional[str] = None, num_sentences: int = 5):
+        self.num_sentences = num_sentences
+        self.word_embeddings: Dict[str, np.ndarray] = {}
+        self.stop_words: set = set(stopwords.words('english'))
+
+        # Set default GloVe path
         if glove_path is None:
-
-
-                glove_path = pkg_resources.resource_filename('text_summarizer', 'glove.6B.100d.txt/glove.6B.100d.txt')
-            except (FileNotFoundError, ModuleNotFoundError):
-                # Fallback to default path
-                glove_path = 'glove.6B.100d.txt/glove.6B.100d.txt'
-
+            glove_path = self._get_default_glove_path()
+
         self.glove_path = glove_path
-        self.num_sentences = num_sentences
-        self.word_embeddings = {}
-        self.stop_words = set(stopwords.words('english'))
         self._load_embeddings()
 
+    def _get_default_glove_path(self):
+        """Get the default path for GloVe embeddings."""
+        # Use user's home directory for data
+        home_dir = Path.home()
+        glove_dir = home_dir / '.text_summarizer'
+        glove_dir.mkdir(exist_ok=True)
+        return glove_dir / 'glove.6B.100d.txt'
+
+    def _download_glove_embeddings(self):
+        """Download GloVe embeddings if not present with improved error handling."""
+        import requests
+
+        print("GloVe embeddings not found. Downloading from Stanford NLP...")
+
+        # Create directory if it doesn't exist
+        glove_file = Path(self.glove_path)
+        glove_file.parent.mkdir(exist_ok=True)
+
+        # Download the zip file
+        url = "https://nlp.stanford.edu/data/glove.6B.zip"
+        zip_path = glove_file.parent / "glove.6B.zip"
+
+        headers = {
+            'User-Agent': 'TextSummarizer/1.1.0 (https://github.com/AWeebTaku/Summarizer)',
+        }
+
+        try:
+            print("Downloading GloVe embeddings (862 MB)...")
+            with requests.get(url, headers=headers, stream=True, timeout=30) as response:
+                response.raise_for_status()
+
+                total_size = int(response.headers.get('content-length', 0))
+                downloaded_size = 0
+
+                with open(zip_path, 'wb') as f:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        if chunk:
+                            f.write(chunk)
+                            downloaded_size += len(chunk)
+                            if total_size > 0:
+                                progress = (downloaded_size / total_size) * 100
+                                print(".1f", end='', flush=True)
+
+            print("\nDownload complete. Extracting...")
+
+            # Extract the specific file we need
+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                zip_ref.extract('glove.6B.100d.txt', glove_file.parent)
+
+            # Verify extraction
+            if not glove_file.exists():
+                raise FileNotFoundError("Failed to extract GloVe file from zip")
+
+            # Clean up zip file
+            zip_path.unlink()
+
+            print(f"GloVe embeddings extracted to {self.glove_path}")
+
+        except requests.exceptions.RequestException as e:
+            print(f"Network error during download: {e}")
+            raise Exception(f"Failed to download GloVe embeddings: {e}")
+        except zipfile.BadZipFile as e:
+            print(f"Invalid zip file downloaded: {e}")
+            if zip_path.exists():
+                zip_path.unlink()
+            raise Exception("Downloaded file is not a valid zip archive")
+        except Exception as e:
+            print(f"Unexpected error during download: {e}")
+            if zip_path.exists():
+                zip_path.unlink()
+            raise
+
     def _load_embeddings(self):
-        """Load GloVe word embeddings from file."""
+        """Load GloVe word embeddings from file with optimized memory usage."""
+        if not os.path.exists(self.glove_path):
+            self._download_glove_embeddings()
+
         try:
-
-
-                values = line.split()
-                word = values[0]
-                coefs = np.asarray(values[1:], dtype='float32')
-                self.word_embeddings[word] = coefs
-        except FileNotFoundError:
-            raise FileNotFoundError(f"GloVe file not found at {self.glove_path}")
+            print(f"Loading GloVe embeddings from {self.glove_path}...")
+            word_count = 0
 
-
-
-
-
-        df = pd.DataFrame()
-        save_csv = True
-
-        if choice == 'P':
-            article_text = input("Paste your article text here:\n")
-            df = pd.DataFrame([{'article_id': 1, 'article_text': article_text}])
-            print('DataFrame created from single article.')
-            save_csv = False
-            break
-        elif choice == 'U':
-            print("You chose to load an existing CSV file. It should contain 'article_id' and 'article_text' columns.")
-            save_csv = False
-            while True:
-                file_name = input("Enter the name of the CSV file (e.g., 'tennis.csv') or type 'cancel' to go back: ").strip()
-                if file_name.lower() == 'cancel':
-                    break
-                if os.path.exists(file_name) and file_name.lower().endswith('.csv'):
-                    try:
-                        df = pd.read_csv(file_name)
-                        print(f'CSV file "{file_name}" loaded successfully.')
-                        break
-                    except Exception as e:
-                        print(f"Error reading file '{file_name}': {e}")
-                else:
-                    print(f"File '{file_name}' not found or is not a CSV. Please try again.")
-            if not df.empty:
-                break
-        elif choice == 'C':
-            print("You've chosen to create a CSV with multiple articles. Enter 'done' for article ID when finished.")
-            articles_data = []
-            article_counter = 1
-            while True:
-                article_id_input = input(f"Enter article ID for article {article_counter} (or 'done' to finish): ").strip()
-                if article_id_input.lower() == 'done':
-                    break
-                try:
-                    article_id = int(article_id_input)
-                except ValueError:
-                    print("Invalid Article ID. Please enter a number or 'done'.")
+            with open(self.glove_path, 'r', encoding='utf-8', errors='ignore') as f:
+                for line_num, line in enumerate(f, 1):
+                    line = line.strip()
+                    if not line:
                         continue
-
-
-
+
+                    try:
+                        values = line.split()
+                        if len(values) < 101:  # word + 100 dimensions
+                            continue
+
+                        word = values[0]
+                        coefs = np.asarray(values[1:101], dtype='float32')  # Only take first 100 dims
+                        self.word_embeddings[word] = coefs
+                        word_count += 1
+
+                        # Progress update every 50k words
+                        if word_count % 50000 == 0:
+                            print(f"Loaded {word_count} words...")
+
+                    except (ValueError, IndexError) as e:
+                        # Skip malformed lines
                         continue
-                articles_data.append({'article_id': article_id, 'article_text': article_text})
-                article_counter += 1
-            if articles_data:
-                df = pd.DataFrame(articles_data)
-                print('DataFrame created from multiple articles.')
-                break
-            else:
-                print("No articles were entered. Please try again or choose another option.")
-        else:
-            print("Invalid choice. Please enter 'P', 'U', or 'C'.")
 
-
-        df.to_csv('article.csv', index=False)
-        print('CSV file "article.csv" created/updated successfully.')
-    elif df.empty:
-        print("No DataFrame was created.")
-    return df
+            print(f"Successfully loaded {len(self.word_embeddings)} word embeddings.")
 
-
+            if len(self.word_embeddings) == 0:
+                raise ValueError("No valid embeddings found in GloVe file")
+
+        except FileNotFoundError:
+            raise FileNotFoundError(f"GloVe file not found at {self.glove_path}")
+        except Exception as e:
+            raise Exception(f"Error loading GloVe embeddings: {e}")
+
+    def preprocess_sentences(self, df: pd.DataFrame) -> List[Dict]:
         """Tokenize articles into sentences and store metadata."""
         all_sentences_data = []
         sentence_counter_global = 0
@@ -128,46 +164,94 @@ class TextSummarizer:
 
     def clean_sentences(self, sentences):
         """Clean sentences: remove non-alphabetic, lowercase, remove stopwords."""
+        if not sentences:
+            return []
+
+        # Use pandas for efficient string operations
         clean_sentences = pd.Series(sentences).str.replace(r"[^a-zA-Z\s]", " ", regex=True)
         clean_sentences = clean_sentences.str.lower()
-        clean_sentences = clean_sentences.apply(
+        clean_sentences = clean_sentences.apply(self._remove_stopwords)
         return clean_sentences.tolist()
 
-    def _remove_stopwords(self,
-        """Remove stopwords from a
-
+    def _remove_stopwords(self, sentence):
+        """Remove stopwords from a sentence string."""
+        if not isinstance(sentence, str):
+            return ""
+        words = sentence.split()
+        filtered_words = [word for word in words if word not in self.stop_words]
+        return " ".join(filtered_words)
 
     def compute_sentence_vectors(self, clean_sentences):
-        """Compute sentence vectors using GloVe embeddings."""
+        """Compute sentence vectors using GloVe embeddings with vectorized operations."""
+        if not clean_sentences:
+            return []
+
         sentence_vectors = []
         for sentence in clean_sentences:
            words = sentence.split()
            if words:
-
-
+                # Get embeddings for all words in sentence
+                vectors = []
+                for word in words:
+                    embedding = self.word_embeddings.get(word, np.zeros(100, dtype=np.float32))
+                    vectors.append(embedding)
+
+                if vectors:
+                    # Use mean of word vectors
+                    v = np.mean(vectors, axis=0)
+                else:
+                    v = np.zeros(100, dtype=np.float32)
            else:
-                v = np.zeros(100)
+                v = np.zeros(100, dtype=np.float32)
            sentence_vectors.append(v)
+
        return sentence_vectors
 
     def compute_similarity_matrix(self, sentence_vectors):
-        """Compute cosine similarity matrix."""
-
-
-
-
-
-
-
-
-
+        """Compute cosine similarity matrix using vectorized operations."""
+        if not sentence_vectors:
+            return np.array([])
+
+        # Convert to numpy array for vectorized operations
+        vectors = np.array(sentence_vectors)
+        n = len(vectors)
+
+        # Normalize vectors for faster cosine similarity
+        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
+        norms[norms == 0] = 1  # Avoid division by zero
+        normalized_vectors = vectors / norms
+
+        # Compute cosine similarity matrix using matrix multiplication
+        sim_mat = np.dot(normalized_vectors, normalized_vectors.T)
+
+        # Ensure diagonal is zero (no self-similarity)
+        np.fill_diagonal(sim_mat, 0)
+
        return sim_mat
 
     def rank_sentences(self, sim_mat):
-        """Rank sentences using PageRank."""
-
-
-
+        """Rank sentences using PageRank with optimized parameters."""
+        if sim_mat.size == 0:
+            return {}
+
+        try:
+            # Create graph from similarity matrix
+            nx_graph = nx.from_numpy_array(sim_mat)
+
+            # Use optimized PageRank parameters
+            scores = nx.pagerank(
+                nx_graph,
+                alpha=0.85,  # Damping factor
+                max_iter=100,
+                tol=1e-6
+            )
+
+            return scores
+        except Exception as e:
+            print(f"Warning: PageRank failed, using uniform scores: {e}")
+            # Fallback: return uniform scores
+            n = sim_mat.shape[0]
+            return {i: 1.0/n for i in range(n)}
 
     def summarize_article(self, scored_sentences, article_id, df):
         """Generate summary for a specific article."""
@@ -208,4 +292,32 @@ class TextSummarizer:
         for i, sentence_data in enumerate(sentences_data):
             sentence_data['score'] = scores[i]
 
-        return sentences_data
+        return sentences_data
+
+    def summarize_text(self, text: str, num_sentences: Optional[int] = None) -> str:
+        """
+        Summarize a single text document.
+
+        Args:
+            text (str): The text to summarize
+            num_sentences (int, optional): Number of sentences in summary. Defaults to self.num_sentences.
+
+        Returns:
+            str: The summarized text
+        """
+        if not text or not text.strip():
+            return ""
+
+        if num_sentences is None:
+            num_sentences = self.num_sentences
+
+        # Create a temporary DataFrame
+        df = pd.DataFrame([{'article_id': 1, 'article_text': text}])
+
+        # Run summarization pipeline
+        scored_sentences = self.run_summarization(df)
+
+        # Get summary
+        _, summary = self.summarize_article(scored_sentences, 1, df)
+
+        return summary if summary else text
{text_summarizer_aweebtaku-1.0.2.dist-info → text_summarizer_aweebtaku-1.2.0.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: text-summarizer-aweebtaku
-Version: 1.0
+Version: 1.2.0
 Summary: A text summarization tool using GloVe embeddings and PageRank algorithm
 Home-page: https://github.com/AWeebTaku/Summarizer
 Author: Your Name
@@ -22,6 +22,7 @@ Requires-Dist: numpy
 Requires-Dist: nltk
 Requires-Dist: scikit-learn
 Requires-Dist: networkx
+Requires-Dist: requests
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
@@ -60,8 +61,6 @@ A Python-based text summarization tool that uses GloVe word embeddings and PageR
 pip install text-summarizer-aweebtaku
 ```
 
-**Note:** This package includes the GloVe word embeddings file (~400MB), so the installation may take some time.
-
 ### Install from Source
 
 1. Clone the repository:
@@ -75,7 +74,14 @@ cd Summarizer
 pip install -e .
 ```
 
-
+### Download GloVe Embeddings
+
+**No manual download required!** The package will automatically download GloVe embeddings (100d, ~400MB) on first use and cache them in your home directory (`~/.text_summarizer/`).
+
+If you prefer to use your own GloVe file, you can specify the path:
+```python
+summarizer = TextSummarizer(glove_path='path/to/your/glove.6B.100d.txt')
+```
 
 ## Usage
 
@@ -103,20 +109,20 @@ text-summarizer-gui
 
 ```python
 from text_summarizer import TextSummarizer
-import pandas as pd
 
-# Initialize summarizer
-summarizer = TextSummarizer(
+# Initialize summarizer (automatic GloVe download)
+summarizer = TextSummarizer(num_sentences=3)
 
-#
-
+# Simple text summarization
+text = "Your long text here..."
+summary = summarizer.summarize_text(text)
+print(summary)
 
-#
+# Advanced usage with DataFrame
+import pandas as pd
+df = pd.DataFrame([{'article_id': 1, 'article_text': text}])
 scored_sentences = summarizer.run_summarization(df)
-
-# Get summary for article ID 1
 article_text, summary = summarizer.summarize_article(scored_sentences, 1, df)
-print(summary)
 ```
 
 ## Data Format
{text_summarizer_aweebtaku-1.0.2.dist-info → text_summarizer_aweebtaku-1.2.0.dist-info}/RECORD
RENAMED
@@ -1,11 +1,11 @@
 text_summarizer/__init__.py,sha256=juqSmwYQLqoiZpyLfxE1sJKoYLNAe_-a3_LOIUV6J6g,63
 text_summarizer/cli.py,sha256=rWbSpT1gJ8kVcsTQ-ov6AZkfy5uUz2taAXeSnDEy0Zw,3773
-text_summarizer/summarizer.py,sha256=
+text_summarizer/summarizer.py,sha256=80RamR76QFtOAZGdVGqy-Bi5xQb3WBbQ2pSYSnOnT5c,12733
 text_summarizer/ui.py,sha256=Ky40zcr-_0zh5I7Kh4Bc8hKrEBdOALe5G4i3ukDJWts,16638
 text_summarizer/data/tennis.csv,sha256=oEPZr4Dy6cmCDtdQ2QYJyJpERzQseuNJ53JP2XyIfBk,12943
-text_summarizer_aweebtaku-1.0.
-text_summarizer_aweebtaku-1.0.
-text_summarizer_aweebtaku-1.0.
-text_summarizer_aweebtaku-1.0.
-text_summarizer_aweebtaku-1.0.
-text_summarizer_aweebtaku-1.0.
+text_summarizer_aweebtaku-1.2.0.dist-info/licenses/LICENSE,sha256=q53YqEH5OACuJ8YmE3i9pND509hapVaOX42ix2AMkZ8,1085
+text_summarizer_aweebtaku-1.2.0.dist-info/METADATA,sha256=bA4w6FulkE5sbMKauV8PXW8T0yQpL80R6TNwxhkY2wM,5106
+text_summarizer_aweebtaku-1.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+text_summarizer_aweebtaku-1.2.0.dist-info/entry_points.txt,sha256=a8n647pYmETd5RzGoOBcfYtIxxNFNu7P5zctmhpldNY,117
+text_summarizer_aweebtaku-1.2.0.dist-info/top_level.txt,sha256=2s-4Uyii86k2iEeiIi0JghAXW47cEQ8qM_ONYPs9Gh8,16
+text_summarizer_aweebtaku-1.2.0.dist-info/RECORD,,
{text_summarizer_aweebtaku-1.0.2.dist-info → text_summarizer_aweebtaku-1.2.0.dist-info}/WHEEL
RENAMED
File without changes

{text_summarizer_aweebtaku-1.0.2.dist-info → text_summarizer_aweebtaku-1.2.0.dist-info}/entry_points.txt
RENAMED
File without changes

{text_summarizer_aweebtaku-1.0.2.dist-info → text_summarizer_aweebtaku-1.2.0.dist-info}/licenses/LICENSE
RENAMED
File without changes

{text_summarizer_aweebtaku-1.0.2.dist-info → text_summarizer_aweebtaku-1.2.0.dist-info}/top_level.txt
RENAMED
File without changes