umpaper-fetch 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
utils/zip_creator.py ADDED
@@ -0,0 +1,299 @@
1
+ """
2
+ ZIP Creator Module
3
+
4
+ Creates organized ZIP archives with proper file naming and structure.
5
+ """
6
+
7
+ import logging
8
+ import zipfile
9
+ from pathlib import Path
10
+ import os
11
+
12
+
13
class ZipCreator:
    """Creates organized ZIP archives from downloaded papers.

    Inside the archive every paper is stored as
    ``<subject_code>/Year_<YYYY>/<original file name>`` when a year can be
    recognised in its file name, and as
    ``<subject_code>/Unsorted/<original file name>`` otherwise.  A
    ``<subject_code>_README.txt`` describing the archive is written at the
    archive root.
    """

    def __init__(self):
        """Initialize the ZIP creator."""
        self.logger = logging.getLogger(__name__)

    def create_zip(self, file_paths, zip_path, subject_code):
        """
        Create a ZIP archive from downloaded files.

        Files that do not exist on disk are skipped with a warning.  When two
        files would end up at the same path inside the archive, the later one
        gets a numeric suffix (``name_1.pdf``) so no entry is shadowed.

        Args:
            file_paths (list): List of file paths to include
            zip_path (str or Path): Output ZIP file path
            subject_code (str): Subject code for organization

        Returns:
            str: Path to created ZIP file, or None when there was nothing to
            zip or the archive could not be created
        """
        zip_path = Path(zip_path)

        if not file_paths:
            self.logger.warning("No files to zip")
            return None

        self.logger.info("Creating ZIP archive: %s", zip_path.name)

        # Only clean up a broken archive if this call actually opened it.
        archive_started = False
        try:
            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=6) as zipf:
                archive_started = True

                # Year / semester / type information keyed by str(path)
                organized_files = self._organize_files(file_paths)

                added_files = []
                used_archive_paths = set()

                for file_path in file_paths:
                    file_path = Path(file_path)

                    if not file_path.exists():
                        self.logger.warning(f"File not found, skipping: {file_path}")
                        continue

                    archive_path = self._get_archive_path(file_path, subject_code, organized_files)
                    # Same-named files from different folders must not collide
                    archive_path = self._dedupe_archive_path(archive_path, used_archive_paths)
                    used_archive_paths.add(archive_path)

                    zipf.write(file_path, archive_path)
                    added_files.append(file_path)
                    self.logger.debug("Added to ZIP: %s", archive_path)

                if not added_files:
                    self.logger.warning("None of the requested files exist; archive holds only the README")

                # The README reports the files actually stored, not the ones requested
                readme_content = self._generate_readme(subject_code, added_files)
                zipf.writestr(f"{subject_code}_README.txt", readme_content)

            # Verify ZIP was created successfully
            if zip_path.exists() and zip_path.stat().st_size > 0:
                self.logger.info("ZIP archive created successfully: %s", zip_path)
                self.logger.info("Archive size: %.2f MB", zip_path.stat().st_size / (1024 * 1024))
                return str(zip_path)

            self.logger.error("ZIP archive creation failed")
            return None

        except Exception as e:
            self.logger.error(f"Error creating ZIP archive: {e}")
            if archive_started:
                self._remove_partial_archive(zip_path)
            return None

    def _remove_partial_archive(self, zip_path):
        """Best-effort removal of a half-written archive left by a failed run."""
        try:
            Path(zip_path).unlink()
        except OSError:
            # Nothing to remove, or not removable; the failure was already logged.
            pass

    @staticmethod
    def _dedupe_archive_path(archive_path, used_paths):
        """Return ``archive_path``, or a ``_N``-suffixed variant not in ``used_paths``."""
        if archive_path not in used_paths:
            return archive_path

        folder, separator, name = archive_path.rpartition('/')
        name_parts = Path(name)
        counter = 1
        while True:
            candidate = f"{folder}{separator}{name_parts.stem}_{counter}{name_parts.suffix}"
            if candidate not in used_paths:
                return candidate
            counter += 1

    def _organize_files(self, file_paths):
        """
        Organize files by extracting year and semester information.

        Args:
            file_paths (list): List of file paths

        Returns:
            dict: Maps ``str(Path(path))`` to a dict with the keys ``year``,
            ``semester``, ``paper_type`` and ``original_name``
        """
        organized = {}

        for file_path in file_paths:
            file_path = Path(file_path)
            filename = file_path.name

            organized[str(file_path)] = {
                'year': self._extract_year_from_filename(filename),
                'semester': self._extract_semester_from_filename(filename),
                'paper_type': self._extract_paper_type_from_filename(filename),
                'original_name': filename,
            }

        return organized

    def _get_archive_path(self, file_path, subject_code, organized_files):
        """
        Get the archive path for a file within the ZIP.

        Args:
            file_path (Path): Original file path
            subject_code (str): Subject code
            organized_files (dict): Organization information as returned by
                ``_organize_files``

        Returns:
            str: Path within the ZIP archive, always ``/``-separated
        """
        file_info = organized_files.get(str(file_path), {})

        # Hierarchy is SubjectCode/Year/filename; files without a
        # recognisable year are collected in "Unsorted".
        year = file_info.get('year')
        year_folder = f"Year_{year}" if year else "Unsorted"
        filename = file_info.get('original_name', file_path.name)

        return '/'.join([subject_code, year_folder, filename])

    def _extract_year_from_filename(self, filename):
        """
        Extract a four-digit year (as a string) from a file name.

        Candidates are tried from most to least reliable:

        1. ``20xx`` starting at a token boundary (not glued to a preceding
           letter or digit), so a course code such as ``WIA2001`` is not
           mistaken for the year in ``WIA2001_2019_S1.pdf``.
        2. ``Y`` followed by a four-digit ``20xx`` or a two-digit ``00``-``29``
           year, the latter expanded to ``20xx``.
        3. Any ``20xx`` anywhere in the name, as a last resort.

        Returns:
            str: The year, or None when none is found
        """
        import re

        match = re.search(r'(?<![A-Za-z0-9])(20\d{2})', filename)
        if match:
            return match.group(1)

        match = re.search(r'Y(20\d{2}|[0-2]\d)(?!\d)', filename)
        if match:
            year = match.group(1)
            # The pattern only admits two-digit years 00-29, all in the 2000s.
            return year if len(year) == 4 else f"20{year}"

        match = re.search(r'(20\d{2})', filename)
        if match:
            return match.group(1)

        return None

    def _extract_semester_from_filename(self, filename):
        """
        Extract the semester number from a file name.

        Recognises ``S1``, ``Sem1``, ``Sem_1``, ``Semester 1`` and the like
        (case-insensitive) for semesters 1 to 3.  The marker must start a
        token, so words that merely end in ``s<digit>`` (``class2``) do not
        count.

        Returns:
            str: ``'1'``, ``'2'`` or ``'3'``, or None when no marker is found
        """
        import re

        match = re.search(
            r'(?<![a-z0-9])(?:s|sem(?:ester)?[\s_-]*)([1-3])(?!\d)',
            filename.lower(),
        )
        return match.group(1) if match else None

    def _extract_paper_type_from_filename(self, filename):
        """
        Classify a paper by keywords in its file name.

        Returns:
            str: The label of the first matching keyword, ``'Exam'`` when no
            keyword matches
        """
        filename_lower = filename.lower()

        # Checked in order; 'mid' also covers 'midterm'.
        keyword_labels = (
            ('final', 'Final_Exam'),
            ('mid', 'Midterm_Exam'),
            ('quiz', 'Quiz'),
            ('test', 'Test'),
            ('assignment', 'Assignment'),
        )
        for keyword, label in keyword_labels:
            if keyword in filename_lower:
                return label

        return 'Exam'

    def _generate_readme(self, subject_code, file_paths):
        """
        Generate README content for the ZIP archive.

        Args:
            subject_code (str): Subject code shown in the title and summary
            file_paths (list): Files stored in the archive; only the count is used

        Returns:
            str: README text without leading or trailing whitespace
        """
        lines = [
            f"{subject_code} Past Year Papers",
            "===============================",
            "",
            f"This archive contains past year examination papers for {subject_code}.",
            "",
            "Archive Contents:",
            f"- Total files: {len(file_paths)}",
            f"- Subject: {subject_code}",
            f"- Downloaded: {self._get_current_timestamp()}",
            "",
            "File Organization:",
            "- Files are organized by Year only",
            "- Naming convention: Year_Semester_Type_Title.pdf",
            "",
            "Usage Instructions:",
            "1. Extract the archive to your desired location",
            "2. Papers are organized in folders by year",
            "3. Each year folder contains all papers for that year",
            "4. File names include the year, semester, and exam type for identification",
            "",
            "Notes:",
            "- All papers are in PDF format",
            "- Files maintain their original names with additional organization metadata",
            "- This archive was created using the UM Past Year Paper Downloader tool",
            "",
            "For questions or issues, please refer to the tool documentation.",
            "",
            "Generated by UM Past Year Paper Downloader",
            "==========================================",
        ]
        return "\n".join(lines).strip()

    def _get_current_timestamp(self):
        """Get the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        from datetime import datetime
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def verify_zip(self, zip_path):
        """
        Verify that the ZIP file is valid and contains at least one entry.

        Args:
            zip_path (str or Path): Path to ZIP file

        Returns:
            bool: True if ZIP is valid, False otherwise (missing file,
            unreadable archive, corrupted member or empty archive)
        """
        try:
            zip_path = Path(zip_path)

            if not zip_path.exists():
                return False

            with zipfile.ZipFile(zip_path, 'r') as zipf:
                # testzip() returns the name of the first bad member, or None
                bad_file = zipf.testzip()
                if bad_file:
                    self.logger.error(f"Corrupted file in ZIP: {bad_file}")
                    return False

                file_list = zipf.namelist()
                if not file_list:
                    self.logger.error("ZIP file is empty")
                    return False

                self.logger.info(f"ZIP verification successful: {len(file_list)} files")
                return True

        except Exception as e:
            self.logger.error(f"Error verifying ZIP file: {e}")
            return False

    def extract_zip_info(self, zip_path):
        """
        Extract information about the ZIP archive.

        Args:
            zip_path (str or Path): Path to ZIP file

        Returns:
            dict: ``file_count``, ``total_uncompressed_size_mb``,
            ``compressed_size_mb`` (size of the archive file on disk),
            ``compression_ratio`` (percent saved, 0 for an archive with no
            data) and ``files`` (entry names); None when the archive cannot
            be read
        """
        try:
            zip_path = Path(zip_path)

            with zipfile.ZipFile(zip_path, 'r') as zipf:
                file_list = zipf.namelist()
                total_size = sum(info.file_size for info in zipf.infolist())
                compressed_size = zip_path.stat().st_size

                if total_size > 0:
                    compression_ratio = (1 - compressed_size / total_size) * 100
                else:
                    compression_ratio = 0

                return {
                    'file_count': len(file_list),
                    'total_uncompressed_size_mb': total_size / (1024 * 1024),
                    'compressed_size_mb': compressed_size / (1024 * 1024),
                    'compression_ratio': compression_ratio,
                    'files': file_list,
                }

        except Exception as e:
            self.logger.error(f"Error extracting ZIP info: {e}")
            return None