umpaper-fetch 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- auth/__init__.py +1 -0
- auth/chrome_fix.py +119 -0
- auth/um_authenticator.py +521 -0
- downloader/__init__.py +1 -0
- downloader/pdf_downloader.py +207 -0
- scraper/__init__.py +1 -0
- scraper/paper_scraper.py +316 -0
- umpaper_fetch/__init__.py +26 -0
- umpaper_fetch/auth/__init__.py +1 -0
- umpaper_fetch/auth/chrome_fix.py +119 -0
- umpaper_fetch/auth/um_authenticator.py +521 -0
- umpaper_fetch/cli.py +316 -0
- umpaper_fetch/downloader/__init__.py +1 -0
- umpaper_fetch/downloader/pdf_downloader.py +207 -0
- umpaper_fetch/scraper/__init__.py +1 -0
- umpaper_fetch/scraper/paper_scraper.py +316 -0
- umpaper_fetch/utils/__init__.py +1 -0
- umpaper_fetch/utils/logger.py +67 -0
- umpaper_fetch/utils/zip_creator.py +299 -0
- umpaper_fetch-1.0.0.dist-info/METADATA +462 -0
- umpaper_fetch-1.0.0.dist-info/RECORD +28 -0
- umpaper_fetch-1.0.0.dist-info/WHEEL +5 -0
- umpaper_fetch-1.0.0.dist-info/entry_points.txt +2 -0
- umpaper_fetch-1.0.0.dist-info/licenses/LICENSE +22 -0
- umpaper_fetch-1.0.0.dist-info/top_level.txt +5 -0
- utils/__init__.py +1 -0
- utils/logger.py +67 -0
- utils/zip_creator.py +299 -0
utils/zip_creator.py
ADDED
@@ -0,0 +1,299 @@
|
|
1
|
+
"""
|
2
|
+
ZIP Creator Module
|
3
|
+
|
4
|
+
Creates organized ZIP archives with proper file naming and structure.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import logging
|
8
|
+
import zipfile
|
9
|
+
from pathlib import Path
|
10
|
+
import os
|
11
|
+
|
12
|
+
|
13
|
+
class ZipCreator:
    """Creates organized ZIP archives from downloaded papers.

    Every file is stored as ``<subject_code>/Year_<yyyy>/<file name>`` (or
    ``<subject_code>/Unsorted/<file name>`` when no year can be read from the
    file name), and a ``<subject_code>_README.txt`` summary is written at the
    archive root.
    """

    def __init__(self):
        """Initialize the ZIP creator."""
        self.logger = logging.getLogger(__name__)

    def create_zip(self, file_paths, zip_path, subject_code):
        """
        Create a ZIP archive from downloaded files.

        Inputs that are not regular files are skipped with a warning, and a
        path listed more than once is only archived once. Two different files
        that would land on the same archive name get a ``_1``, ``_2``, ...
        suffix so that no entry shadows another.

        Args:
            file_paths (list): List of file paths (str or Path) to include
            zip_path (str or Path): Output ZIP file path
            subject_code (str): Subject code; used as the top-level folder
                name and as the README file name prefix

        Returns:
            str: Path to created ZIP file, or None when there is nothing to
                archive or the archive could not be written
        """
        zip_path = Path(zip_path)

        if not file_paths:
            self.logger.warning("No files to zip")
            return None

        sources = self._collect_existing_files(file_paths)
        if not sources:
            # Every input was missing: do not produce a README-only archive.
            self.logger.warning("No files to zip")
            return None

        self.logger.info(f"Creating ZIP archive: {zip_path.name}")

        # Organize files by year and semester
        organized_files = self._organize_files(sources)

        # Only clean up after a failure if this call actually opened (and
        # therefore truncated) the output file.
        archive_opened = False

        try:
            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=6) as zipf:
                archive_opened = True
                used_names = set()

                for source in sources:
                    archive_path = self._unique_archive_name(
                        self._get_archive_path(source, subject_code, organized_files),
                        used_names,
                    )
                    zipf.write(source, archive_path)
                    self.logger.debug(f"Added to ZIP: {archive_path}")

                # The README reports the files actually archived, not the
                # number of paths the caller asked for.
                readme_content = self._generate_readme(subject_code, sources)
                zipf.writestr(f"{subject_code}_README.txt", readme_content)

            # Verify ZIP was created successfully
            if zip_path.exists() and zip_path.stat().st_size > 0:
                self.logger.info(f"ZIP archive created successfully: {zip_path}")
                self.logger.info(f"Archive size: {zip_path.stat().st_size / (1024*1024):.2f} MB")
                return str(zip_path)

            self.logger.error("ZIP archive creation failed")
            return None

        except Exception as e:
            # Boundary handler: callers expect None rather than an exception.
            self.logger.error(f"Error creating ZIP archive: {e}")
            if archive_opened:
                self._discard_partial_archive(zip_path)
            return None

    def _collect_existing_files(self, file_paths):
        """Return the inputs that are regular files, as Paths, without repeats."""
        sources = []
        seen = set()

        for raw_path in file_paths:
            candidate = Path(raw_path)

            if not candidate.is_file():
                self.logger.warning(f"File not found, skipping: {candidate}")
                continue

            key = str(candidate)
            if key in seen:
                self.logger.debug(f"Duplicate input, skipping: {candidate}")
                continue

            seen.add(key)
            sources.append(candidate)

        return sources

    @staticmethod
    def _unique_archive_name(archive_path, used_names):
        """Return ``archive_path``, suffixed with ``_<n>`` if already taken.

        The chosen name is recorded in ``used_names`` before returning.
        """
        candidate = archive_path

        if candidate in used_names:
            folder, separator, filename = archive_path.rpartition('/')
            stem = Path(filename).stem
            suffix = Path(filename).suffix
            counter = 1
            while candidate in used_names:
                candidate = f"{folder}{separator}{stem}_{counter}{suffix}"
                counter += 1

        used_names.add(candidate)
        return candidate

    def _discard_partial_archive(self, zip_path):
        """Best-effort removal of an archive left behind by a failed write."""
        try:
            zip_path.unlink()
        except OSError as cleanup_error:
            self.logger.debug(f"Could not remove partial ZIP {zip_path}: {cleanup_error}")

    def _organize_files(self, file_paths):
        """
        Organize files by extracting year and semester information.

        Args:
            file_paths (list): List of file paths

        Returns:
            dict: Maps ``str(Path(file_path))`` to a dict with the keys
                'year', 'semester', 'paper_type' and 'original_name'
        """
        organized = {}

        for file_path in file_paths:
            file_path = Path(file_path)
            filename = file_path.name

            organized[str(file_path)] = {
                'year': self._extract_year_from_filename(filename),
                'semester': self._extract_semester_from_filename(filename),
                'paper_type': self._extract_paper_type_from_filename(filename),
                'original_name': filename
            }

        return organized

    def _get_archive_path(self, file_path, subject_code, organized_files):
        """
        Get the archive path for a file within the ZIP.

        Args:
            file_path (Path): Original file path
            subject_code (str): Subject code
            organized_files (dict): Organization information as returned by
                ``_organize_files``

        Returns:
            str: '/'-separated path within the ZIP archive, of the form
                SubjectCode/Year_<year>/filename or SubjectCode/Unsorted/filename
        """
        file_info = organized_files.get(str(file_path), {})

        year = file_info.get('year')
        # If no year info, put in "Unsorted" folder
        folder = f"Year_{year}" if year else "Unsorted"
        filename = file_info.get('original_name', file_path.name)

        return '/'.join([subject_code, folder, filename])

    def _extract_year_from_filename(self, filename):
        """Extract a four-digit year string from a filename, or None.

        Lookup order:
        1. a ``20xx`` that is not glued to a preceding letter or digit, so a
           code such as ``WIA2001`` does not win over a real ``_2019``;
        2. any ``20xx`` substring;
        3. ``Y`` followed by a two-digit year in the range 00-29, which is
           expanded to ``20yy``.
        """
        import re

        standalone = re.search(r'(?<![A-Za-z0-9])(20\d{2})', filename)
        if standalone:
            return standalone.group(1)

        embedded = re.search(r'(20\d{2})', filename)
        if embedded:
            return embedded.group(1)

        # Only 00-29 are recognised here, so the result is always 20yy.
        short_form = re.search(r'Y(0\d|1\d|2\d)', filename)
        if short_form:
            return f"20{short_form.group(1)}"

        return None

    def _extract_semester_from_filename(self, filename):
        """Extract the semester ('1', '2' or '3') from a filename, or None.

        Recognises ``Sem``/``Semester`` in any letter case, and a bare
        ``S``/``s`` only when it starts a token (not preceded by a letter) or
        is an upper-case ``S`` right after a lower-case letter (``FinalS1``).
        This keeps names such as ``Physics1`` or ``CS101`` from being read as
        semester 1. Whitespace, ``_`` or ``-`` may separate the marker from
        the digit, and the digit must not be followed by another digit.
        """
        import re

        semester_match = re.search(
            r'(?:(?i:sem(?:ester)?)|(?<![A-Za-z])[Ss]|(?<=[a-z])S)[\s_-]*([1-3])(?!\d)',
            filename,
        )
        if semester_match:
            return semester_match.group(1)

        return None

    def _extract_paper_type_from_filename(self, filename):
        """Extract paper type from filename.

        The first keyword found (case-insensitive substring match, checked in
        the order below) decides the type; 'Exam' is the fallback.
        """
        filename_lower = filename.lower()

        # 'mid' also covers 'midterm', so a separate entry is not needed.
        keyword_labels = (
            ('final', 'Final_Exam'),
            ('mid', 'Midterm_Exam'),
            ('quiz', 'Quiz'),
            ('test', 'Test'),
            ('assignment', 'Assignment'),
        )

        for keyword, label in keyword_labels:
            if keyword in filename_lower:
                return label

        return 'Exam'

    def _generate_readme(self, subject_code, file_paths):
        """Generate README content for the ZIP archive.

        Args:
            subject_code (str): Subject code shown in the README
            file_paths (list): Files included; only its length is used

        Returns:
            str: README text without leading or trailing whitespace
        """
        # Built from explicit lines so the text never depends on how this
        # source file happens to be indented.
        lines = [
            f"{subject_code} Past Year Papers",
            "===============================",
            "",
            f"This archive contains past year examination papers for {subject_code}.",
            "",
            "Archive Contents:",
            f"- Total files: {len(file_paths)}",
            f"- Subject: {subject_code}",
            f"- Downloaded: {self._get_current_timestamp()}",
            "",
            "File Organization:",
            "- Files are organized by Year only",
            "- Naming convention: Year_Semester_Type_Title.pdf",
            "",
            "Usage Instructions:",
            "1. Extract the archive to your desired location",
            "2. Papers are organized in folders by year",
            "3. Each year folder contains all papers for that year",
            "4. File names include the year, semester, and exam type for identification",
            "",
            "Notes:",
            "- All papers are in PDF format",
            "- Files maintain their original names with additional organization metadata",
            "- This archive was created using the UM Past Year Paper Downloader tool",
            "",
            "For questions or issues, please refer to the tool documentation.",
            "",
            "Generated by UM Past Year Paper Downloader",
            "==========================================",
        ]
        return "\n".join(lines).strip()

    def _get_current_timestamp(self):
        """Get current local timestamp as a 'YYYY-MM-DD HH:MM:SS' string."""
        from datetime import datetime
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def verify_zip(self, zip_path):
        """
        Verify that the ZIP file is valid and contains expected files.

        Args:
            zip_path (str or Path): Path to ZIP file

        Returns:
            bool: True if the file exists, every member passes
                ``ZipFile.testzip`` and the archive is not empty;
                False otherwise (including on any error)
        """
        try:
            zip_path = Path(zip_path)

            if not zip_path.exists():
                return False

            with zipfile.ZipFile(zip_path, 'r') as zipf:
                # testzip() returns the first corrupt member name, or None
                bad_file = zipf.testzip()
                if bad_file is not None:
                    self.logger.error(f"Corrupted file in ZIP: {bad_file}")
                    return False

                # Check if ZIP has content
                file_list = zipf.namelist()
                if not file_list:
                    self.logger.error("ZIP file is empty")
                    return False

                self.logger.info(f"ZIP verification successful: {len(file_list)} files")
                return True

        except Exception as e:
            # Boundary handler: this method reports failure as False.
            self.logger.error(f"Error verifying ZIP file: {e}")
            return False

    def extract_zip_info(self, zip_path):
        """
        Extract information about the ZIP archive.

        Args:
            zip_path (str or Path): Path to ZIP file

        Returns:
            dict: 'file_count', 'total_uncompressed_size_mb',
                'compressed_size_mb' (size of the ZIP file on disk),
                'compression_ratio' (percentage saved, 0 for an archive with
                no uncompressed data) and 'files' (member names);
                None if the archive cannot be read
        """
        try:
            zip_path = Path(zip_path)

            with zipfile.ZipFile(zip_path, 'r') as zipf:
                file_list = zipf.namelist()
                total_size = sum(info.file_size for info in zipf.infolist())

            compressed_size = zip_path.stat().st_size
            megabyte = 1024 * 1024

            return {
                'file_count': len(file_list),
                'total_uncompressed_size_mb': total_size / megabyte,
                'compressed_size_mb': compressed_size / megabyte,
                'compression_ratio': (1 - compressed_size / total_size) * 100 if total_size > 0 else 0,
                'files': file_list
            }

        except Exception as e:
            self.logger.error(f"Error extracting ZIP info: {e}")
            return None
|