umpaper_fetch-1.0.0-py3-none-any.whl

@@ -0,0 +1,316 @@
+ """
+ Paper Scraper Module
+
+ Handles searching for papers by subject code and extracting download URLs
+ and metadata from the UM exam paper repository.
+ """
+
+ import logging
+ import re
+ from urllib.parse import urljoin
+
+ from bs4 import BeautifulSoup
+
+
+ class PaperInfo:
+     """Data class for storing paper information."""
+
+     def __init__(self, title, download_url, year=None, semester=None, paper_type=None):
+         self.title = title
+         self.download_url = download_url
+         self.year = year
+         self.semester = semester
+         self.paper_type = paper_type
+         self.filename = self._generate_filename()
+
+     def _generate_filename(self):
+         """Generate a clean filename for the paper."""
+         # Drop the subject code and semester/year prefix from the title
+         # (e.g. "WIA1005 (Semester 1, 2024)") so they are not duplicated
+         # in the filename.
+         title_to_clean = re.sub(r'[A-Z]{2,4}\d{4}\s*\([^)]+\)\s*', '', self.title)
+
+         # Clean the remaining title.
+         clean_title = re.sub(r'[^\w\s-]', '', title_to_clean)
+         clean_title = re.sub(r'\s+', '_', clean_title.strip())
+
+         # Prefix year, semester, and paper type when available.
+         parts = []
+         if self.year:
+             parts.append(f"Y{self.year}")
+         if self.semester:
+             parts.append(f"S{self.semester}")
+         if self.paper_type:
+             parts.append(self.paper_type)
+
+         if parts:
+             filename = f"{'_'.join(parts)}_{clean_title}.pdf"
+         else:
+             filename = f"{clean_title}.pdf"
+
+         # Keep the filename to a manageable length.
+         if len(filename) > 100:
+             filename = filename[:95] + ".pdf"
+
+         return filename
+
+     def __str__(self):
+         return f"PaperInfo(title='{self.title}', year={self.year}, semester={self.semester})"
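+
+ # Worked example (hypothetical values): for a search hit titled
+ # "WIA1005 (Semester 1, 2024) Final Examination",
+ # PaperInfo("WIA1005 (Semester 1, 2024) Final Examination",
+ #           "https://example.org/paper.pdf", year="2024", semester="1",
+ #           paper_type="Final")
+ # strips the code/semester prefix and yields the filename
+ # "Y2024_S1_Final_Final_Examination.pdf".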
+
+
+ class PaperScraper:
+     """Scrapes exam papers from the UM repository."""
+
+     def __init__(self, session):
+         """Initialize the scraper with an authenticated session."""
+         self.session = session
+         self.base_url = "https://exampaper-um-edu-my.eu1.proxy.openathens.net"
+         self.search_url = f"{self.base_url}/cgi/search"
+         self.logger = logging.getLogger(__name__)
+
+     def search_papers(self, subject_code, max_results=100):
+         """
+         Search for papers by subject code.
+
+         Args:
+             subject_code (str): Subject code to search for
+             max_results (int): Maximum number of results to return
+
+         Returns:
+             list[PaperInfo]: List of paper information objects
+         """
+         self.logger.info(f"Searching for papers with subject code: {subject_code}")
+
+         papers = []
+
+         try:
+             # Search parameters mirror the repository's own search form.
+             search_params = {
+                 'q': subject_code,
+                 '_action_search': 'Search',
+                 '_order': 'bytitle',
+                 'basic_srchtype': 'ALL',
+                 '_satisfyall': 'ALL'
+             }
+
+             self.logger.info(f"Performing search with params: {search_params}")
+
+             # The form submits via GET, so do the same.
+             response = self.session.get(self.search_url, params=search_params, timeout=30)
+
+             if response.status_code != 200:
+                 self.logger.error(f"Search request failed: {response.status_code}")
+                 return papers
+
+             # Parse search results.
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Report how many results the repository claims to have.
+             results_text = soup.find('div', class_='ep_search_controls')
+             if results_text:
+                 text = results_text.get_text()
+                 self.logger.info(f"Search results info: {text}")
+
+                 match = re.search(r'(\d+)\s+of\s+(\d+)', text)
+                 if match:
+                     total_results = int(match.group(2))
+                     self.logger.info(f"Found {total_results} total results")
+                 else:
+                     self.logger.warning("Could not determine number of results")
+
+             papers = self._parse_search_results(soup, subject_code)
+
+             self.logger.info(f"Successfully extracted {len(papers)} papers for {subject_code}")
+
+         except Exception as e:
+             self.logger.error(f"Error searching for papers: {e}")
+
+         return papers[:max_results]
+
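+     # Usage sketch (assumes `session` is a requests.Session already
+     # authenticated against the OpenAthens proxy, e.g. by the package's
+     # login step, which lives outside this module):
+     #
+     #     scraper = PaperScraper(session)
+     #     for paper in scraper.search_papers("WIA1005", max_results=10):
+     #         print(paper.filename, paper.download_url)
+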
+     def _parse_search_results(self, soup, subject_code):
+         """Parse search results from HTML."""
+         papers = []
+
+         # Look for the results table.
+         results_table = soup.find('table', class_='ep_paginate_list')
+         if not results_table:
+             self.logger.warning("No results table found with class 'ep_paginate_list'")
+             return papers
+
+         # Find all result rows.
+         result_rows = results_table.find_all('tr', class_='ep_search_result')
+         self.logger.info(f"Found {len(result_rows)} result rows")
+
+         for i, row in enumerate(result_rows, 1):
+             try:
+                 self.logger.info(f"Processing result {i}...")
+                 paper_info = self._extract_paper_info_from_row(row, subject_code)
+                 if paper_info:
+                     papers.append(paper_info)
+                     self.logger.info(f"✅ Extracted: {paper_info.title}")
+                 else:
+                     self.logger.warning(f"❌ Could not extract info from result {i}")
+             except Exception as e:
+                 self.logger.warning(f"Error parsing result {i}: {e}")
+                 continue
+
+         return papers
+
+     def _extract_paper_info_from_row(self, row, subject_code):
+         """Extract paper information from a search result row."""
+         try:
+             # Get all cells in the row.
+             cells = row.find_all('td')
+             if len(cells) < 2:
+                 self.logger.warning("Row doesn't have enough cells")
+                 return None
+
+             # The main content is in the second cell.
+             content_cell = cells[1]
+
+             # Extract the title and basic info.
+             # Pattern: "WIA1005 (Semester X, YEAR) Title"
+             text_content = content_cell.get_text(strip=True)
+             self.logger.info(f"Row content: {text_content[:100]}...")
+
+             # Extract semester and year.
+             semester_year_match = re.search(r'\(Semester (\d+), (\d{4})\)', text_content)
+             if semester_year_match:
+                 semester = semester_year_match.group(1)
+                 year = semester_year_match.group(2)
+             else:
+                 semester = None
+                 year = None
+                 self.logger.warning("Could not extract semester/year info")
+
+             # Find the main paper link (usually the title link).
+             title_link = content_cell.find('a', href=True)
+             if title_link:
+                 title = title_link.get_text(strip=True)
+                 # Remove italic formatting markers.
+                 title = re.sub(r'[/*]', '', title)
+                 paper_url = urljoin(self.base_url, title_link.get('href'))
+                 self.logger.info(f"Found title link: {title}")
+             else:
+                 self.logger.warning("No title link found")
+                 return None
+
+             # Look for a direct PDF download link.
+             download_url = None
+
+             # Check the third cell (if it exists) for PDF links.
+             if len(cells) > 2:
+                 pdf_cell = cells[2]
+                 pdf_links = pdf_cell.find_all('a', href=True)
+                 for link in pdf_links:
+                     href = link.get('href')
+                     if href and '.pdf' in href.lower():
+                         download_url = urljoin(self.base_url, href)
+                         self.logger.info(f"Found direct PDF link: {download_url}")
+                         break
+
+             # If no direct PDF link was found, try the paper's own page.
+             if not download_url:
+                 self.logger.info("No direct PDF link found, checking paper page...")
+                 download_url = self._get_download_url(paper_url)
+
+             if not download_url:
+                 self.logger.warning(f"No download URL found for: {title}")
+                 return None
+
+             # Prefix the subject code; year/semester end up in the filename prefix.
+             clean_title = f"{subject_code} {title}"
+
+             paper_type = self._determine_paper_type(title)
+
+             return PaperInfo(
+                 title=clean_title,
+                 download_url=download_url,
+                 year=year,
+                 semester=semester,
+                 paper_type=paper_type
+             )
+
+         except Exception as e:
+             self.logger.warning(f"Error extracting paper info: {e}")
+             return None
+
+     def _determine_paper_type(self, title):
+         """Determine the type of paper from the title."""
+         title_lower = title.lower()
+
+         if 'final' in title_lower:
+             return 'Final'
+         elif 'mid' in title_lower or 'midterm' in title_lower:
+             return 'Midterm'
+         elif 'quiz' in title_lower:
+             return 'Quiz'
+         elif 'test' in title_lower:
+             return 'Test'
+         else:
+             return 'Exam'
+
+     def _get_download_url(self, paper_url):
+         """Get the actual PDF download URL from the paper page."""
+         try:
+             self.logger.info(f"Getting download URL from: {paper_url}")
+             response = self.session.get(paper_url, timeout=15)
+             if response.status_code != 200:
+                 self.logger.warning(f"Failed to access paper page: {response.status_code}")
+                 return None
+
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Method 1: look for direct PDF or download links.
+             download_links = soup.find_all('a', href=True)
+
+             for link in download_links:
+                 href = link.get('href')
+                 link_text = link.get_text(strip=True).lower()
+
+                 if href and ('.pdf' in href.lower() or
+                              'download' in href.lower() or
+                              'download' in link_text or
+                              'pdf' in link_text):
+                     download_url = urljoin(self.base_url, href)
+                     self.logger.info(f"Found download link: {download_url}")
+                     return download_url
+
+             # Method 2: repository-specific patterns. The UM repository often
+             # serves files at /id/eprint/XXXXX/1/filename.pdf.
+             eprint_links = soup.find_all('a', href=re.compile(r'/\d+/\d+/.*\.pdf$', re.I))
+             if eprint_links:
+                 download_url = urljoin(self.base_url, eprint_links[0].get('href'))
+                 self.logger.info(f"Found eprint PDF: {download_url}")
+                 return download_url
+
+             # Method 3: any link ending in .pdf.
+             pdf_links = soup.find_all('a', href=re.compile(r'\.pdf$', re.I))
+             if pdf_links:
+                 download_url = urljoin(self.base_url, pdf_links[0].get('href'))
+                 self.logger.info(f"Found PDF link: {download_url}")
+                 return download_url
+
+             # Method 4: embedded objects or iframes.
+             objects = soup.find_all(['object', 'embed', 'iframe'])
+             for obj in objects:
+                 src = obj.get('src') or obj.get('data')
+                 if src and '.pdf' in src.lower():
+                     download_url = urljoin(self.base_url, src)
+                     self.logger.info(f"Found embedded PDF: {download_url}")
+                     return download_url
+
+             self.logger.warning(f"No download URL found on page: {paper_url}")
+
+         except Exception as e:
+             self.logger.warning(f"Error getting download URL for {paper_url}: {e}")
+
+         return None
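+
+
+ if __name__ == "__main__":
+     # Minimal manual-test sketch. It assumes you already hold a
+     # requests.Session authenticated against the OpenAthens proxy;
+     # the login flow lives elsewhere in the package.
+     import requests
+
+     session = requests.Session()
+     scraper = PaperScraper(session)
+     for paper in scraper.search_papers("WIA1005", max_results=5):
+         print(paper, "->", paper.download_url)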
@@ -0,0 +1 @@
+ # Utils package
@@ -0,0 +1,67 @@
+ """
+ Logging Configuration Module
+
+ Sets up logging for the UM Past Year Paper Downloader.
+ """
+
+ import logging
+ import sys
+ from datetime import datetime
+ from pathlib import Path
+
+
+ def setup_logger(level=logging.INFO, log_file=None):
+     """
+     Set up logging configuration.
+
+     Args:
+         level (int): Console logging level (default: INFO)
+         log_file (str): Optional log file path
+
+     Returns:
+         logging.Logger: Configured root logger instance
+     """
+     # Default to a timestamped file under logs/, creating the directory
+     # if it doesn't exist.
+     if log_file is None:
+         logs_dir = Path("logs")
+         logs_dir.mkdir(exist_ok=True)
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         log_file = logs_dir / f"um_downloader_{timestamp}.log"
+
+     # Create formatter.
+     formatter = logging.Formatter(
+         '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+         datefmt='%Y-%m-%d %H:%M:%S'
+     )
+
+     # Configure the root logger. It must pass DEBUG records so the file
+     # handler can log everything; the console handler filters to `level`.
+     root_logger = logging.getLogger()
+     root_logger.setLevel(logging.DEBUG)
+
+     # Clear any existing handlers.
+     root_logger.handlers.clear()
+
+     # Console handler.
+     console_handler = logging.StreamHandler(sys.stdout)
+     console_handler.setLevel(level)
+     console_handler.setFormatter(formatter)
+     root_logger.addHandler(console_handler)
+
+     # File handler: always log everything to the file.
+     if log_file:
+         file_handler = logging.FileHandler(log_file, encoding='utf-8')
+         file_handler.setLevel(logging.DEBUG)
+         file_handler.setFormatter(formatter)
+         root_logger.addHandler(file_handler)
+
+     # Suppress some noisy third-party loggers.
+     logging.getLogger('urllib3').setLevel(logging.WARNING)
+     logging.getLogger('selenium').setLevel(logging.WARNING)
+     logging.getLogger('webdriver_manager').setLevel(logging.WARNING)
+
+     logger = logging.getLogger(__name__)
+     logger.info(f"Logging initialized - Level: {logging.getLevelName(level)}")
+     if log_file:
+         logger.info(f"Log file: {log_file}")
+
+     return root_logger
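+
+
+ if __name__ == "__main__":
+     # Usage sketch: initialize logging and emit a couple of test records.
+     log = setup_logger(level=logging.INFO)
+     log.info("console and file handlers attached")
+     log.debug("this reaches only the log file")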
@@ -0,0 +1,299 @@
+ """
+ ZIP Creator Module
+
+ Creates organized ZIP archives with proper file naming and structure.
+ """
+
+ import logging
+ import re
+ import zipfile
+ from datetime import datetime
+ from pathlib import Path
+
+
+ class ZipCreator:
+     """Creates organized ZIP archives from downloaded papers."""
+
+     def __init__(self):
+         """Initialize the ZIP creator."""
+         self.logger = logging.getLogger(__name__)
+
+     def create_zip(self, file_paths, zip_path, subject_code):
+         """
+         Create a ZIP archive from downloaded files.
+
+         Args:
+             file_paths (list): List of file paths to include
+             zip_path (str or Path): Output ZIP file path
+             subject_code (str): Subject code for organization
+
+         Returns:
+             str: Path to the created ZIP file, or None on failure
+         """
+         zip_path = Path(zip_path)
+
+         if not file_paths:
+             self.logger.warning("No files to zip")
+             return None
+
+         self.logger.info(f"Creating ZIP archive: {zip_path.name}")
+
+         try:
+             with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=6) as zipf:
+                 # Organize files by year and semester.
+                 organized_files = self._organize_files(file_paths)
+
+                 # Add files to the ZIP with an organized structure.
+                 for file_path in file_paths:
+                     file_path = Path(file_path)
+
+                     if not file_path.exists():
+                         self.logger.warning(f"File not found, skipping: {file_path}")
+                         continue
+
+                     # Determine the archive path based on the organization.
+                     archive_path = self._get_archive_path(file_path, subject_code, organized_files)
+
+                     zipf.write(file_path, archive_path)
+                     self.logger.debug(f"Added to ZIP: {archive_path}")
+
+                 # Add a README with archive information.
+                 readme_content = self._generate_readme(subject_code, file_paths)
+                 zipf.writestr(f"{subject_code}_README.txt", readme_content)
+
+             # Verify the ZIP was created successfully.
+             if zip_path.exists() and zip_path.stat().st_size > 0:
+                 self.logger.info(f"ZIP archive created successfully: {zip_path}")
+                 self.logger.info(f"Archive size: {zip_path.stat().st_size / (1024*1024):.2f} MB")
+                 return str(zip_path)
+             else:
+                 self.logger.error("ZIP archive creation failed")
+                 return None
+
+         except Exception as e:
+             self.logger.error(f"Error creating ZIP archive: {e}")
+             return None
+
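+     # Usage sketch (hypothetical paths): given already-downloaded PDFs,
+     #
+     #     creator = ZipCreator()
+     #     archive = creator.create_zip(
+     #         ["downloads/Y2024_S1_Final_Exam.pdf"], "WIA1005_papers.zip", "WIA1005")
+     #
+     # returns "WIA1005_papers.zip" on success, or None on failure.
+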
+     def _organize_files(self, file_paths):
+         """
+         Organize files by extracting year and semester information.
+
+         Args:
+             file_paths (list): List of file paths
+
+         Returns:
+             dict: Dictionary mapping file paths to organization info
+         """
+         organized = {}
+
+         for file_path in file_paths:
+             file_path = Path(file_path)
+             filename = file_path.name
+
+             # Extract year, semester, and paper type from the filename.
+             year = self._extract_year_from_filename(filename)
+             semester = self._extract_semester_from_filename(filename)
+             paper_type = self._extract_paper_type_from_filename(filename)
+
+             organized[str(file_path)] = {
+                 'year': year,
+                 'semester': semester,
+                 'paper_type': paper_type,
+                 'original_name': filename
+             }
+
+         return organized
+
+     def _get_archive_path(self, file_path, subject_code, organized_files):
+         """
+         Get the archive path for a file within the ZIP.
+
+         Args:
+             file_path (Path): Original file path
+             subject_code (str): Subject code
+             organized_files (dict): Organization information
+
+         Returns:
+             str: Path within the ZIP archive
+         """
+         file_info = organized_files.get(str(file_path), {})
+
+         # Build a simple hierarchical path: SubjectCode/Year_YYYY/filename.
+         path_parts = [subject_code]
+
+         year = file_info.get('year')
+         if year:
+             path_parts.append(f"Year_{year}")
+         else:
+             # Without year info, the file goes into an "Unsorted" folder.
+             path_parts.append("Unsorted")
+
+         # Keep the original filename.
+         filename = file_info.get('original_name', file_path.name)
+         path_parts.append(filename)
+
+         return '/'.join(path_parts)
+
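+     # Worked example (hypothetical file): for subject "WIA1005" and a file
+     # named "Y2024_S1_Final_Network_Design.pdf", the year "2024" is detected
+     # and the archive path becomes
+     # "WIA1005/Year_2024/Y2024_S1_Final_Network_Design.pdf".
+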
+     def _extract_year_from_filename(self, filename):
+         """Extract the year from a filename."""
+         # Look for a 4-digit year (20xx).
+         year_match = re.search(r'(20\d{2})', filename)
+         if year_match:
+             return year_match.group(1)
+
+         # Look for "Y" followed by a 4- or 2-digit year.
+         year_match = re.search(r'Y(20\d{2}|0\d|1\d|2\d)', filename)
+         if year_match:
+             year = year_match.group(1)
+             if len(year) == 2:
+                 # Convert a 2-digit year to 4 digits.
+                 year_int = int(year)
+                 if year_int <= 30:  # Assume 00-30 means 2000-2030
+                     return f"20{year}"
+                 else:  # 31-99 means 1931-1999
+                     return f"19{year}"
+             return year
+
+         return None
+
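+     # Examples: "Exam_2019_S1.pdf" -> "2019" (direct 4-digit match);
+     # "Y23_Final.pdf" -> "2023" (2-digit year expanded).
+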
+     def _extract_semester_from_filename(self, filename):
+         """Extract the semester from a filename."""
+         filename_lower = filename.lower()
+
+         # Look for S1, S2, Sem1, Sem2, Semester 1, etc.
+         if re.search(r's1|sem1|semester\s*1', filename_lower):
+             return '1'
+         elif re.search(r's2|sem2|semester\s*2', filename_lower):
+             return '2'
+         elif re.search(r's3|sem3|semester\s*3', filename_lower):
+             return '3'
+
+         return None
+
+     def _extract_paper_type_from_filename(self, filename):
+         """Extract the paper type from a filename."""
+         filename_lower = filename.lower()
+
+         if 'final' in filename_lower:
+             return 'Final_Exam'
+         elif any(word in filename_lower for word in ['mid', 'midterm']):
+             return 'Midterm_Exam'
+         elif 'quiz' in filename_lower:
+             return 'Quiz'
+         elif 'test' in filename_lower:
+             return 'Test'
+         elif 'assignment' in filename_lower:
+             return 'Assignment'
+
+         return 'Exam'
+
+     def _generate_readme(self, subject_code, file_paths):
+         """Generate README content for the ZIP archive."""
+         content = f"""
+ {subject_code} Past Year Papers
+ ===============================
+
+ This archive contains past year examination papers for {subject_code}.
+
+ Archive Contents:
+ - Total files: {len(file_paths)}
+ - Subject: {subject_code}
+ - Downloaded: {self._get_current_timestamp()}
+
+ File Organization:
+ - Files are organized by year only
+ - Naming convention: Year_Semester_Type_Title.pdf
+
+ Usage Instructions:
+ 1. Extract the archive to your desired location
+ 2. Papers are organized in folders by year
+ 3. Each year folder contains all papers for that year
+ 4. File names include the year, semester, and exam type for identification
+
+ Notes:
+ - All papers are in PDF format
+ - Files keep their original names, with organization metadata added
+ - This archive was created using the UM Past Year Paper Downloader tool
+
+ For questions or issues, please refer to the tool documentation.
+
+ Generated by UM Past Year Paper Downloader
+ ==========================================
+ """
+         return content.strip()
+
+     def _get_current_timestamp(self):
+         """Get the current timestamp as a string."""
+         return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+     def verify_zip(self, zip_path):
+         """
+         Verify that the ZIP file is valid and contains files.
+
+         Args:
+             zip_path (str or Path): Path to ZIP file
+
+         Returns:
+             bool: True if the ZIP is valid, False otherwise
+         """
+         try:
+             zip_path = Path(zip_path)
+
+             if not zip_path.exists():
+                 return False
+
+             with zipfile.ZipFile(zip_path, 'r') as zipf:
+                 # testzip() returns the first corrupted member, if any.
+                 bad_file = zipf.testzip()
+                 if bad_file:
+                     self.logger.error(f"Corrupted file in ZIP: {bad_file}")
+                     return False
+
+                 # Check that the ZIP has content.
+                 file_list = zipf.namelist()
+                 if not file_list:
+                     self.logger.error("ZIP file is empty")
+                     return False
+
+                 self.logger.info(f"ZIP verification successful: {len(file_list)} files")
+                 return True
+
+         except Exception as e:
+             self.logger.error(f"Error verifying ZIP file: {e}")
+             return False
+
+     def extract_zip_info(self, zip_path):
+         """
+         Extract summary information about the ZIP archive.
+
+         Args:
+             zip_path (str or Path): Path to ZIP file
+
+         Returns:
+             dict: Information about the ZIP archive, or None on error
+         """
+         try:
+             zip_path = Path(zip_path)
+
+             with zipfile.ZipFile(zip_path, 'r') as zipf:
+                 file_list = zipf.namelist()
+                 total_size = sum(info.file_size for info in zipf.infolist())
+                 compressed_size = zip_path.stat().st_size
+
+                 return {
+                     'file_count': len(file_list),
+                     'total_uncompressed_size_mb': total_size / (1024 * 1024),
+                     'compressed_size_mb': compressed_size / (1024 * 1024),
+                     'compression_ratio': (1 - compressed_size / total_size) * 100 if total_size > 0 else 0,
+                     'files': file_list
+                 }
+
+         except Exception as e:
+             self.logger.error(f"Error extracting ZIP info: {e}")
+             return None
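+
+
+ if __name__ == "__main__":
+     # End-to-end sketch with hypothetical file names: archive two downloaded
+     # papers, verify the result, and print a summary.
+     creator = ZipCreator()
+     archive = creator.create_zip(
+         ["downloads/Y2024_S1_Final_Exam.pdf", "downloads/Y2023_S2_Final_Exam.pdf"],
+         "WIA1005_papers.zip",
+         "WIA1005",
+     )
+     if archive and creator.verify_zip(archive):
+         print(creator.extract_zip_info(archive))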