umpaper-fetch 1.0.0 (umpaper_fetch-1.0.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,207 @@
+ """
+ PDF Downloader Module
+
+ Handles downloading PDF files with progress tracking, retry logic,
+ and concurrent download management.
+ """
+
+ import logging
+ import os
+ import time
+ from pathlib import Path
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from tqdm import tqdm
+ import requests
+
+
+ class PDFDownloader:
+     """Downloads PDF files with progress tracking and retry logic."""
+
+     def __init__(self, session, output_dir, max_retries=3, max_workers=4):
+         """Initialize the PDF downloader."""
+         self.session = session
+         self.output_dir = Path(output_dir)
+         self.max_retries = max_retries
+         self.max_workers = max_workers
+         self.logger = logging.getLogger(__name__)
+
+         # Ensure output directory exists
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+     def download_papers(self, papers):
+         """
+         Download multiple papers with progress tracking.
+
+         Args:
+             papers (list): List of PaperInfo objects to download
+
+         Returns:
+             list: List of successfully downloaded file paths
+         """
+         if not papers:
+             return []
+
+         self.logger.info(f"Starting download of {len(papers)} papers...")
+         downloaded_files = []
+
+         # Create progress bar
+         with tqdm(total=len(papers), desc="Downloading papers", unit="file") as pbar:
+             # Use ThreadPoolExecutor for concurrent downloads
+             with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                 # Submit all download tasks
+                 future_to_paper = {
+                     executor.submit(self._download_paper, paper): paper
+                     for paper in papers
+                 }
+
+                 # Process completed downloads
+                 for future in as_completed(future_to_paper):
+                     paper = future_to_paper[future]
+                     try:
+                         file_path = future.result()
+                         if file_path:
+                             downloaded_files.append(file_path)
+                             self.logger.debug(f"Downloaded: {paper.filename}")
+                         else:
+                             self.logger.warning(f"Failed to download: {paper.title}")
+                     except Exception as e:
+                         self.logger.error(f"Error downloading {paper.title}: {e}")
+                     finally:
+                         pbar.update(1)
+
+         self.logger.info(f"Downloaded {len(downloaded_files)}/{len(papers)} papers successfully")
+         return downloaded_files
+
+     def _download_paper(self, paper):
+         """
+         Download a single paper with retry logic.
+
+         Args:
+             paper (PaperInfo): Paper information object
+
+         Returns:
+             str: Path to downloaded file, or None if failed
+         """
+         for attempt in range(self.max_retries + 1):
+             try:
+                 file_path = self._download_file(paper.download_url, paper.filename)
+                 if file_path and self._verify_download(file_path):
+                     return file_path
+
+             except Exception as e:
+                 self.logger.warning(
+                     f"Download attempt {attempt + 1} failed for {paper.title}: {e}"
+                 )
+
+             if attempt < self.max_retries:
+                 # Wait before retry with exponential backoff
+                 wait_time = 2 ** attempt
+                 time.sleep(wait_time)
+
+         self.logger.error(f"Failed to download after {self.max_retries + 1} attempts: {paper.title}")
+         return None
+
+     def _download_file(self, url, filename):
+         """
+         Download a single file from URL.
+
+         Args:
+             url (str): Download URL
+             filename (str): Target filename
+
+         Returns:
+             str: Path to downloaded file
+         """
+         file_path = self.output_dir / filename
+
+         # Avoid re-downloading if file already exists and is valid
+         if file_path.exists() and self._verify_download(file_path):
+             self.logger.debug(f"File already exists: {filename}")
+             return str(file_path)
+
+         # Start download
+         response = self.session.get(url, stream=True, timeout=60)
+         response.raise_for_status()
+
+         # Check if response is actually a PDF
+         content_type = response.headers.get('content-type', '').lower()
+         if 'pdf' not in content_type and 'application/octet-stream' not in content_type:
+             self.logger.warning(f"Unexpected content type for {filename}: {content_type}")
+
+         # Write file with progress tracking
+         total_size = int(response.headers.get('content-length', 0))
+
+         with open(file_path, 'wb') as f:
+             if total_size > 0:
+                 # Track download progress for large files
+                 downloaded = 0
+                 for chunk in response.iter_content(chunk_size=8192):
+                     if chunk:
+                         f.write(chunk)
+                         downloaded += len(chunk)
+             else:
+                 # For files without content-length header
+                 for chunk in response.iter_content(chunk_size=8192):
+                     if chunk:
+                         f.write(chunk)
+
+         self.logger.debug(f"Downloaded {filename} ({file_path.stat().st_size} bytes)")
+         return str(file_path)
+
+     def _verify_download(self, file_path):
+         """
+         Verify that the downloaded file is valid.
+
+         Args:
+             file_path (str or Path): Path to the downloaded file
+
+         Returns:
+             bool: True if file is valid, False otherwise
+         """
+         try:
+             file_path = Path(file_path)
+
+             # Check if file exists and has content
+             if not file_path.exists() or file_path.stat().st_size == 0:
+                 return False
+
+             # Basic PDF validation - check PDF header
+             with open(file_path, 'rb') as f:
+                 header = f.read(8)
+                 if not header.startswith(b'%PDF'):
+                     self.logger.warning(f"File does not appear to be a PDF: {file_path.name}")
+                     # Don't reject non-PDF files completely, might be valid documents
+                     # return False
+
+             return True
+
+         except Exception as e:
+             self.logger.warning(f"Error verifying file {file_path}: {e}")
+             return False
+
+     def cleanup_failed_downloads(self):
+         """Remove any incomplete or corrupted downloads."""
+         cleaned_count = 0
+
+         for file_path in self.output_dir.glob('*.pdf'):
+             if not self._verify_download(file_path):
+                 try:
+                     file_path.unlink()
+                     cleaned_count += 1
+                     self.logger.debug(f"Removed invalid file: {file_path.name}")
+                 except Exception as e:
+                     self.logger.warning(f"Could not remove invalid file {file_path}: {e}")
+
+         if cleaned_count > 0:
+             self.logger.info(f"Cleaned up {cleaned_count} invalid downloaded files")
+
+     def get_download_stats(self):
+         """Get statistics about downloaded files."""
+         pdf_files = list(self.output_dir.glob('*.pdf'))
+         total_size = sum(f.stat().st_size for f in pdf_files)
+
+         return {
+             'count': len(pdf_files),
+             'total_size_mb': total_size / (1024 * 1024),
+             'files': [f.name for f in pdf_files]
+         }
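
For orientation, a minimal usage sketch of PDFDownloader follows; it is not part of the packaged code, and both the import path's top-level name (`umpaper_fetch`) and the pre-authenticated `requests.Session` are assumptions based on this diff:

    import requests

    # Import path assumed from the package layout shown in this diff.
    from umpaper_fetch.downloader.pdf_downloader import PDFDownloader

    session = requests.Session()   # in the real tool, UMAuthenticator supplies a logged-in session
    downloader = PDFDownloader(session, output_dir="papers", max_retries=3, max_workers=4)

    papers = []                    # normally a list of PaperInfo objects from PaperScraper.search_papers()
    saved = downloader.download_papers(papers)

    downloader.cleanup_failed_downloads()     # removes zero-byte or otherwise invalid leftovers
    stats = downloader.get_download_stats()   # {'count': ..., 'total_size_mb': ..., 'files': [...]}
    print(f"{stats['count']} PDFs, {stats['total_size_mb']:.1f} MB")
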
scraper/__init__.py ADDED
@@ -0,0 +1 @@
+ # Scraper package
@@ -0,0 +1,316 @@
+ """
+ Paper Scraper Module
+
+ Handles searching for papers by subject code and extracting download URLs
+ and metadata from the UM exam paper repository.
+ """
+
+ import logging
+ import re
+ from urllib.parse import urljoin, parse_qs, urlparse
+ from bs4 import BeautifulSoup
+ import requests
+
+
+ class PaperInfo:
+     """Data class for storing paper information."""
+
+     def __init__(self, title, download_url, year=None, semester=None, paper_type=None):
+         self.title = title
+         self.download_url = download_url
+         self.year = year
+         self.semester = semester
+         self.paper_type = paper_type
+         self.filename = self._generate_filename()
+
+     def _generate_filename(self):
+         """Generate a clean filename for the paper."""
+         # Extract useful parts from title - remove subject code and semester/year info to avoid duplication
+         title_to_clean = self.title
+
+         # Remove subject code pattern from title (e.g., "WIA1005 (Semester 1, 2024)")
+         title_to_clean = re.sub(r'[A-Z]{2,4}\d{4}\s*\([^)]+\)\s*', '', title_to_clean)
+
+         # Clean the remaining title
+         clean_title = re.sub(r'[^\w\s-]', '', title_to_clean)
+         clean_title = re.sub(r'\s+', '_', clean_title.strip())
+
+         # Add year and semester if available
+         parts = []
+         if self.year:
+             parts.append(f"Y{self.year}")
+         if self.semester:
+             parts.append(f"S{self.semester}")
+         if self.paper_type:
+             parts.append(self.paper_type)
+
+         if parts:
+             filename = f"{'_'.join(parts)}_{clean_title}.pdf"
+         else:
+             filename = f"{clean_title}.pdf"
+
+         # Ensure filename is not too long
+         if len(filename) > 100:
+             filename = filename[:95] + ".pdf"
+
+         return filename
+
+     def __str__(self):
+         return f"PaperInfo(title='{self.title}', year={self.year}, semester={self.semester})"
+
+
+ class PaperScraper:
+     """Scrapes exam papers from UM repository."""
+
+     def __init__(self, session):
+         """Initialize scraper with authenticated session."""
+         self.session = session
+         self.base_url = "https://exampaper-um-edu-my.eu1.proxy.openathens.net"
+         self.search_url = f"{self.base_url}/cgi/search"
+         self.logger = logging.getLogger(__name__)
+
+     def search_papers(self, subject_code, max_results=100):
+         """
+         Search for papers by subject code.
+
+         Args:
+             subject_code (str): Subject code to search for
+             max_results (int): Maximum number of results to return
+
+         Returns:
+             list[PaperInfo]: List of paper information objects
+         """
+         self.logger.info(f"Searching for papers with subject code: {subject_code}")
+
+         papers = []
+
+         try:
+             # Use the correct search URL and parameters based on the actual form
+             search_params = {
+                 'q': subject_code,
+                 '_action_search': 'Search',
+                 '_order': 'bytitle',
+                 'basic_srchtype': 'ALL',
+                 '_satisfyall': 'ALL'
+             }
+
+             self.logger.info(f"Performing search with params: {search_params}")
+
+             # Perform search request using GET (like the form does)
+             response = self.session.get(
+                 "https://exampaper-um-edu-my.eu1.proxy.openathens.net/cgi/search",
+                 params=search_params,
+                 timeout=30
+             )
+
+             if response.status_code != 200:
+                 self.logger.error(f"Search request failed: {response.status_code}")
+                 return papers
+
+             # Parse search results
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Check if we got results
+             results_text = soup.find('div', class_='ep_search_controls')
+             if results_text:
+                 text = results_text.get_text()
+                 self.logger.info(f"Search results info: {text}")
+
+                 # Extract number of results
+                 import re
+                 match = re.search(r'(\d+)\s+of\s+(\d+)', text)
+                 if match:
+                     total_results = int(match.group(2))
+                     self.logger.info(f"Found {total_results} total results")
+                 else:
+                     self.logger.warning("Could not determine number of results")
+
+             papers = self._parse_search_results(soup, subject_code)
+
+             self.logger.info(f"Successfully extracted {len(papers)} papers for {subject_code}")
+
+         except Exception as e:
+             self.logger.error(f"Error searching for papers: {e}")
+
+         return papers[:max_results]
+
+     def _parse_search_results(self, soup, subject_code):
+         """Parse search results from HTML."""
+         papers = []
+
+         # Look for the results table
+         results_table = soup.find('table', class_='ep_paginate_list')
+         if not results_table:
+             self.logger.warning("No results table found with class 'ep_paginate_list'")
+             return papers
+
+         # Find all result rows
+         result_rows = results_table.find_all('tr', class_='ep_search_result')
+         self.logger.info(f"Found {len(result_rows)} result rows")
+
+         for i, row in enumerate(result_rows, 1):
+             try:
+                 self.logger.info(f"Processing result {i}...")
+                 paper_info = self._extract_paper_info_from_row(row, subject_code)
+                 if paper_info:
+                     papers.append(paper_info)
+                     self.logger.info(f"✅ Extracted: {paper_info.title}")
+                 else:
+                     self.logger.warning(f"❌ Could not extract info from result {i}")
+             except Exception as e:
+                 self.logger.warning(f"Error parsing result {i}: {e}")
+                 continue
+
+         return papers
+
+     def _extract_paper_info_from_row(self, row, subject_code):
+         """Extract paper information from a search result row."""
+         try:
+             # Get all cells in the row
+             cells = row.find_all('td')
+             if len(cells) < 2:
+                 self.logger.warning("Row doesn't have enough cells")
+                 return None
+
+             # The main content is in the second cell
+             content_cell = cells[1]
+
+             # Extract the title and basic info
+             # Pattern: "WIA1005 (Semester X, YEAR) Title"
+             text_content = content_cell.get_text(strip=True)
+             self.logger.info(f"Row content: {text_content[:100]}...")
+
+             # Extract semester and year
+             semester_year_match = re.search(r'\(Semester (\d+), (\d{4})\)', text_content)
+             if semester_year_match:
+                 semester = semester_year_match.group(1)
+                 year = semester_year_match.group(2)
+             else:
+                 semester = None
+                 year = None
+                 self.logger.warning("Could not extract semester/year info")
+
+             # Find the main paper link (usually the title link)
+             title_link = content_cell.find('a', href=True)
+             if title_link:
+                 title = title_link.get_text(strip=True)
+                 # Remove italic formatting
+                 title = re.sub(r'[/*]', '', title)
+                 paper_url = urljoin(self.base_url, title_link.get('href'))
+                 self.logger.info(f"Found title link: {title}")
+             else:
+                 self.logger.warning("No title link found")
+                 return None
+
+             # Look for direct PDF download link
+             download_url = None
+
+             # Check the third cell (if exists) for PDF links
+             if len(cells) > 2:
+                 pdf_cell = cells[2]
+                 pdf_links = pdf_cell.find_all('a', href=True)
+                 for link in pdf_links:
+                     href = link.get('href')
+                     if href and '.pdf' in href.lower():
+                         download_url = urljoin(self.base_url, href)
+                         self.logger.info(f"Found direct PDF link: {download_url}")
+                         break
+
+             # If no direct PDF link found, try to get it from the paper page
+             if not download_url:
+                 self.logger.info("No direct PDF link found, checking paper page...")
+                 download_url = self._get_download_url(paper_url)
+
+             if not download_url:
+                 self.logger.warning(f"No download URL found for: {title}")
+                 return None
+
+             # Generate a clean title without redundant info (year/semester will be in filename prefix)
+             clean_title = f"{subject_code} {title}"
+
+             paper_type = self._determine_paper_type(title)
+
+             return PaperInfo(
+                 title=clean_title,
+                 download_url=download_url,
+                 year=year,
+                 semester=semester,
+                 paper_type=paper_type
+             )
+
+         except Exception as e:
+             self.logger.warning(f"Error extracting paper info: {e}")
+             return None
+
+     def _determine_paper_type(self, title):
+         """Determine the type of paper from the title."""
+         title_lower = title.lower()
+
+         if 'final' in title_lower:
+             return 'Final'
+         elif 'mid' in title_lower or 'midterm' in title_lower:
+             return 'Midterm'
+         elif 'quiz' in title_lower:
+             return 'Quiz'
+         elif 'test' in title_lower:
+             return 'Test'
+         else:
+             return 'Exam'
+
+     def _get_download_url(self, paper_url):
+         """Get the actual PDF download URL from the paper page."""
+         try:
+             self.logger.info(f"Getting download URL from: {paper_url}")
+             response = self.session.get(paper_url, timeout=15)
+             if response.status_code != 200:
+                 self.logger.warning(f"Failed to access paper page: {response.status_code}")
+                 return None
+
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Method 1: Look for direct PDF download links
+             download_links = soup.find_all('a', href=True)
+
+             for link in download_links:
+                 href = link.get('href')
+                 link_text = link.get_text(strip=True).lower()
+
+                 # Look for PDF files or download links
+                 if href and ('.pdf' in href.lower() or
+                              'download' in href.lower() or
+                              'download' in link_text or
+                              'pdf' in link_text):
+                     download_url = urljoin(self.base_url, href)
+                     self.logger.info(f"Found download link: {download_url}")
+                     return download_url
+
+             # Method 2: Look for repository-specific patterns
+             # UM repository often uses /id/eprint/XXXXX/1/filename.pdf
+             eprint_links = soup.find_all('a', href=re.compile(r'/\d+/\d+/.*\.pdf$', re.I))
+             if eprint_links:
+                 download_url = urljoin(self.base_url, eprint_links[0].get('href'))
+                 self.logger.info(f"Found eprint PDF: {download_url}")
+                 return download_url
+
+             # Method 3: Look for any PDF links
+             pdf_links = soup.find_all('a', href=re.compile(r'\.pdf$', re.I))
+             if pdf_links:
+                 download_url = urljoin(self.base_url, pdf_links[0].get('href'))
+                 self.logger.info(f"Found PDF link: {download_url}")
+                 return download_url
+
+             # Method 4: Check for embedded objects or iframes
+             objects = soup.find_all(['object', 'embed', 'iframe'])
+             for obj in objects:
+                 src = obj.get('src') or obj.get('data')
+                 if src and '.pdf' in src.lower():
+                     download_url = urljoin(self.base_url, src)
+                     self.logger.info(f"Found embedded PDF: {download_url}")
+                     return download_url
+
+             self.logger.warning(f"No download URL found on page: {paper_url}")
+
+         except Exception as e:
+             self.logger.warning(f"Error getting download URL for {paper_url}: {e}")
+
+         return None
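
As a rough illustration of how PaperInfo and PaperScraper behave, here is a short sketch; it is not part of the package, the top-level import name and the example URL are placeholders, and a real search requires a session already signed in through OpenAthens:

    import requests

    # Module path assumed from the package layout shown in this diff.
    from umpaper_fetch.scraper.paper_scraper import PaperInfo, PaperScraper

    # PaperInfo derives a filesystem-safe filename from its metadata.
    paper = PaperInfo(
        title="WIA1005 Network Technology Foundation",
        download_url="https://example.org/1234/1/paper.pdf",   # placeholder URL
        year="2024",
        semester="1",
        paper_type="Final",
    )
    print(paper.filename)   # Y2024_S1_Final_WIA1005_Network_Technology_Foundation.pdf

    # Searching needs an authenticated session; a bare Session() only works if the
    # repository happens to be reachable without OpenAthens sign-in.
    scraper = PaperScraper(requests.Session())
    papers = scraper.search_papers("WIA1005", max_results=50)
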
@@ -0,0 +1,26 @@
+ """
+ umpaper-fetch: Automated downloader for University Malaya past year exam papers.
+
+ This package provides tools to automatically download past year exam papers
+ from University Malaya's repository through an automated browser interface.
+ """
+
+ __version__ = "1.0.0"
+ __author__ = "Marcus Mah"
+ __email__ = "marcusmah6969@gmail.com"
+ __description__ = "Automated downloader for University Malaya past year exam papers"
+
+ # Import main classes for easier access
+ from .auth.um_authenticator import UMAuthenticator
+ from .scraper.paper_scraper import PaperScraper
+ from .downloader.pdf_downloader import PDFDownloader
+ from .utils.zip_creator import ZipCreator
+ from .utils.logger import setup_logger
+
+ __all__ = [
+     'UMAuthenticator',
+     'PaperScraper',
+     'PDFDownloader',
+     'ZipCreator',
+     'setup_logger'
+ ]
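
Because of these re-exports, callers can import the main classes straight from the package root; the import name `umpaper_fetch` is inferred from the distribution name and is not confirmed by this diff:

    # Assumed top-level import surface, mirroring __all__ above.
    from umpaper_fetch import UMAuthenticator, PaperScraper, PDFDownloader, ZipCreator, setup_logger
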
@@ -0,0 +1 @@
+ # Authentication package
@@ -0,0 +1,119 @@
+ """
+ Chrome Driver Fix Module
+
+ Handles Chrome driver setup issues, particularly the Win32 application error
+ that occurs due to architecture mismatches.
+ """
+
+ import os
+ import platform
+ import logging
+ from webdriver_manager.chrome import ChromeDriverManager
+
+
+ def get_chrome_driver_path():
+     """
+     Get Chrome driver path with proper architecture handling.
+
+     Returns:
+         str: Path to the Chrome driver executable
+     """
+     logger = logging.getLogger(__name__)
+
+     try:
+         # Determine system architecture
+         is_64bit = platform.machine().endswith('64')
+         system = platform.system()
+
+         logger.info(f"System: {system}, 64-bit: {is_64bit}")
+
+         # Force specific Chrome driver version/architecture if needed
+         if system == "Windows" and is_64bit:
+             # Try to get the latest driver for Windows 64-bit
+             driver_manager = ChromeDriverManager()
+             driver_path = driver_manager.install()
+
+             # Verify the driver is executable
+             if os.path.exists(driver_path) and os.access(driver_path, os.X_OK):
+                 logger.info(f"Chrome driver ready: {driver_path}")
+                 return driver_path
+             else:
+                 logger.warning(f"Chrome driver not executable: {driver_path}")
+                 # Try to fix permissions
+                 try:
+                     os.chmod(driver_path, 0o755)
+                     if os.access(driver_path, os.X_OK):
+                         logger.info("Fixed Chrome driver permissions")
+                         return driver_path
+                 except Exception as perm_error:
+                     logger.error(f"Could not fix permissions: {perm_error}")
+         else:
+             # For other systems, use default behavior
+             driver_path = ChromeDriverManager().install()
+             return driver_path
+
+     except Exception as e:
+         logger.error(f"Chrome driver setup failed: {e}")
+         raise
+
+     raise Exception("Could not setup Chrome driver")
+
+
+ def test_chrome_driver(driver_path):
+     """
+     Test if the Chrome driver is working properly.
+
+     Args:
+         driver_path (str): Path to Chrome driver
+
+     Returns:
+         bool: True if driver works, False otherwise
+     """
+     logger = logging.getLogger(__name__)
+
+     try:
+         import subprocess
+
+         # Test if the driver can start
+         result = subprocess.run(
+             [driver_path, '--version'],
+             capture_output=True,
+             text=True,
+             timeout=10
+         )
+
+         if result.returncode == 0:
+             logger.info(f"Chrome driver test passed: {result.stdout.strip()}")
+             return True
+         else:
+             logger.error(f"Chrome driver test failed: {result.stderr}")
+             return False
+
+     except Exception as e:
+         logger.error(f"Chrome driver test error: {e}")
+         return False
+
+
+ def cleanup_chrome_cache():
+     """Clean up problematic Chrome driver cache."""
+     logger = logging.getLogger(__name__)
+
+     try:
+         import shutil
+         from pathlib import Path
+
+         # Get the cache directory
+         cache_dir = Path.home() / '.wdm' / 'drivers' / 'chromedriver'
+
+         if cache_dir.exists():
+             logger.info(f"Cleaning Chrome driver cache: {cache_dir}")
+             shutil.rmtree(cache_dir)
+             logger.info("Chrome driver cache cleaned")
+             return True
+         else:
+             logger.info("No Chrome driver cache to clean")
+             return True
+
+     except Exception as e:
+         logger.error(f"Could not clean Chrome driver cache: {e}")
+         return False
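
A hedged sketch of how these helpers might be combined with Selenium follows; Selenium 4 is assumed to be installed alongside webdriver-manager, and this diff does not show the module's final import path or the authenticator that ultimately consumes the driver:

    # Sketch only: the module path of chrome_driver_fix is not shown in this diff.
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service

    from chrome_driver_fix import (
        cleanup_chrome_cache,
        get_chrome_driver_path,
        test_chrome_driver,
    )

    driver_path = get_chrome_driver_path()
    if not test_chrome_driver(driver_path):
        # A stale cached driver is a common cause of the "not a valid Win32 application"
        # error; clear the webdriver-manager cache and resolve the driver again.
        cleanup_chrome_cache()
        driver_path = get_chrome_driver_path()

    driver = webdriver.Chrome(service=Service(driver_path))
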