umpaper_fetch-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- auth/__init__.py +1 -0
- auth/chrome_fix.py +119 -0
- auth/um_authenticator.py +521 -0
- downloader/__init__.py +1 -0
- downloader/pdf_downloader.py +207 -0
- scraper/__init__.py +1 -0
- scraper/paper_scraper.py +316 -0
- umpaper_fetch/__init__.py +26 -0
- umpaper_fetch/auth/__init__.py +1 -0
- umpaper_fetch/auth/chrome_fix.py +119 -0
- umpaper_fetch/auth/um_authenticator.py +521 -0
- umpaper_fetch/cli.py +316 -0
- umpaper_fetch/downloader/__init__.py +1 -0
- umpaper_fetch/downloader/pdf_downloader.py +207 -0
- umpaper_fetch/scraper/__init__.py +1 -0
- umpaper_fetch/scraper/paper_scraper.py +316 -0
- umpaper_fetch/utils/__init__.py +1 -0
- umpaper_fetch/utils/logger.py +67 -0
- umpaper_fetch/utils/zip_creator.py +299 -0
- umpaper_fetch-1.0.0.dist-info/METADATA +462 -0
- umpaper_fetch-1.0.0.dist-info/RECORD +28 -0
- umpaper_fetch-1.0.0.dist-info/WHEEL +5 -0
- umpaper_fetch-1.0.0.dist-info/entry_points.txt +2 -0
- umpaper_fetch-1.0.0.dist-info/licenses/LICENSE +22 -0
- umpaper_fetch-1.0.0.dist-info/top_level.txt +5 -0
- utils/__init__.py +1 -0
- utils/logger.py +67 -0
- utils/zip_creator.py +299 -0
@@ -0,0 +1,207 @@
"""
PDF Downloader Module

Handles downloading PDF files with progress tracking, retry logic,
and concurrent download management.
"""

import logging
import os
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import requests


class PDFDownloader:
    """Downloads PDF files with progress tracking and retry logic."""

    def __init__(self, session, output_dir, max_retries=3, max_workers=4):
        """Initialize the PDF downloader."""
        self.session = session
        self.output_dir = Path(output_dir)
        self.max_retries = max_retries
        self.max_workers = max_workers
        self.logger = logging.getLogger(__name__)

        # Ensure output directory exists
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def download_papers(self, papers):
        """
        Download multiple papers with progress tracking.

        Args:
            papers (list): List of PaperInfo objects to download

        Returns:
            list: List of successfully downloaded file paths
        """
        if not papers:
            return []

        self.logger.info(f"Starting download of {len(papers)} papers...")
        downloaded_files = []

        # Create progress bar
        with tqdm(total=len(papers), desc="Downloading papers", unit="file") as pbar:
            # Use ThreadPoolExecutor for concurrent downloads
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit all download tasks
                future_to_paper = {
                    executor.submit(self._download_paper, paper): paper
                    for paper in papers
                }

                # Process completed downloads
                for future in as_completed(future_to_paper):
                    paper = future_to_paper[future]
                    try:
                        file_path = future.result()
                        if file_path:
                            downloaded_files.append(file_path)
                            self.logger.debug(f"Downloaded: {paper.filename}")
                        else:
                            self.logger.warning(f"Failed to download: {paper.title}")
                    except Exception as e:
                        self.logger.error(f"Error downloading {paper.title}: {e}")
                    finally:
                        pbar.update(1)

        self.logger.info(f"Downloaded {len(downloaded_files)}/{len(papers)} papers successfully")
        return downloaded_files

    def _download_paper(self, paper):
        """
        Download a single paper with retry logic.

        Args:
            paper (PaperInfo): Paper information object

        Returns:
            str: Path to downloaded file, or None if failed
        """
        for attempt in range(self.max_retries + 1):
            try:
                file_path = self._download_file(paper.download_url, paper.filename)
                if file_path and self._verify_download(file_path):
                    return file_path

            except Exception as e:
                self.logger.warning(
                    f"Download attempt {attempt + 1} failed for {paper.title}: {e}"
                )

            if attempt < self.max_retries:
                # Wait before retry with exponential backoff
                wait_time = 2 ** attempt
                time.sleep(wait_time)

        self.logger.error(f"Failed to download after {self.max_retries + 1} attempts: {paper.title}")
        return None

    def _download_file(self, url, filename):
        """
        Download a single file from URL.

        Args:
            url (str): Download URL
            filename (str): Target filename

        Returns:
            str: Path to downloaded file
        """
        file_path = self.output_dir / filename

        # Avoid re-downloading if file already exists and is valid
        if file_path.exists() and self._verify_download(file_path):
            self.logger.debug(f"File already exists: {filename}")
            return str(file_path)

        # Start download
        response = self.session.get(url, stream=True, timeout=60)
        response.raise_for_status()

        # Check if response is actually a PDF
        content_type = response.headers.get('content-type', '').lower()
        if 'pdf' not in content_type and 'application/octet-stream' not in content_type:
            self.logger.warning(f"Unexpected content type for {filename}: {content_type}")

        # Write file with progress tracking
        total_size = int(response.headers.get('content-length', 0))

        with open(file_path, 'wb') as f:
            if total_size > 0:
                # Track download progress for large files
                downloaded = 0
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
            else:
                # For files without content-length header
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        self.logger.debug(f"Downloaded {filename} ({file_path.stat().st_size} bytes)")
        return str(file_path)

    def _verify_download(self, file_path):
        """
        Verify that the downloaded file is valid.

        Args:
            file_path (str or Path): Path to the downloaded file

        Returns:
            bool: True if file is valid, False otherwise
        """
        try:
            file_path = Path(file_path)

            # Check if file exists and has content
            if not file_path.exists() or file_path.stat().st_size == 0:
                return False

            # Basic PDF validation - check PDF header
            with open(file_path, 'rb') as f:
                header = f.read(8)
                if not header.startswith(b'%PDF'):
                    self.logger.warning(f"File does not appear to be a PDF: {file_path.name}")
                    # Don't reject non-PDF files completely, might be valid documents
                    # return False

            return True

        except Exception as e:
            self.logger.warning(f"Error verifying file {file_path}: {e}")
            return False

    def cleanup_failed_downloads(self):
        """Remove any incomplete or corrupted downloads."""
        cleaned_count = 0

        for file_path in self.output_dir.glob('*.pdf'):
            if not self._verify_download(file_path):
                try:
                    file_path.unlink()
                    cleaned_count += 1
                    self.logger.debug(f"Removed invalid file: {file_path.name}")
                except Exception as e:
                    self.logger.warning(f"Could not remove invalid file {file_path}: {e}")

        if cleaned_count > 0:
            self.logger.info(f"Cleaned up {cleaned_count} invalid downloaded files")

    def get_download_stats(self):
        """Get statistics about downloaded files."""
        pdf_files = list(self.output_dir.glob('*.pdf'))
        total_size = sum(f.stat().st_size for f in pdf_files)

        return {
            'count': len(pdf_files),
            'total_size_mb': total_size / (1024 * 1024),
            'files': [f.name for f in pdf_files]
        }
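
For orientation, a minimal usage sketch of PDFDownloader (not part of the published wheel). It assumes an already-authenticated requests.Session, which in the real flow comes from UMAuthenticator, plus the PaperInfo objects produced by PaperScraper below; the subject code and output directory are illustrative.

# Usage sketch only (not in the package). A bare requests.Session() will not be
# authenticated against the repository; in practice use the session returned by
# UMAuthenticator.
import requests
from umpaper_fetch.scraper.paper_scraper import PaperScraper
from umpaper_fetch.downloader.pdf_downloader import PDFDownloader

session = requests.Session()  # placeholder for an authenticated session
scraper = PaperScraper(session)
papers = scraper.search_papers("WIA1005", max_results=20)

downloader = PDFDownloader(session, output_dir="downloads", max_retries=3, max_workers=4)
files = downloader.download_papers(papers)
downloader.cleanup_failed_downloads()
print(downloader.get_download_stats())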
scraper/__init__.py
ADDED
@@ -0,0 +1 @@
# Scraper package
scraper/paper_scraper.py
ADDED
@@ -0,0 +1,316 @@
"""
Paper Scraper Module

Handles searching for papers by subject code and extracting download URLs
and metadata from the UM exam paper repository.
"""

import logging
import re
from urllib.parse import urljoin, parse_qs, urlparse
from bs4 import BeautifulSoup
import requests


class PaperInfo:
    """Data class for storing paper information."""

    def __init__(self, title, download_url, year=None, semester=None, paper_type=None):
        self.title = title
        self.download_url = download_url
        self.year = year
        self.semester = semester
        self.paper_type = paper_type
        self.filename = self._generate_filename()

    def _generate_filename(self):
        """Generate a clean filename for the paper."""
        # Extract useful parts from title - remove subject code and semester/year info to avoid duplication
        title_to_clean = self.title

        # Remove subject code pattern from title (e.g., "WIA1005 (Semester 1, 2024)")
        title_to_clean = re.sub(r'[A-Z]{2,4}\d{4}\s*\([^)]+\)\s*', '', title_to_clean)

        # Clean the remaining title
        clean_title = re.sub(r'[^\w\s-]', '', title_to_clean)
        clean_title = re.sub(r'\s+', '_', clean_title.strip())

        # Add year and semester if available
        parts = []
        if self.year:
            parts.append(f"Y{self.year}")
        if self.semester:
            parts.append(f"S{self.semester}")
        if self.paper_type:
            parts.append(self.paper_type)

        if parts:
            filename = f"{'_'.join(parts)}_{clean_title}.pdf"
        else:
            filename = f"{clean_title}.pdf"

        # Ensure filename is not too long
        if len(filename) > 100:
            filename = filename[:95] + ".pdf"

        return filename

    def __str__(self):
        return f"PaperInfo(title='{self.title}', year={self.year}, semester={self.semester})"


class PaperScraper:
    """Scrapes exam papers from UM repository."""

    def __init__(self, session):
        """Initialize scraper with authenticated session."""
        self.session = session
        self.base_url = "https://exampaper-um-edu-my.eu1.proxy.openathens.net"
        self.search_url = f"{self.base_url}/cgi/search"
        self.logger = logging.getLogger(__name__)

    def search_papers(self, subject_code, max_results=100):
        """
        Search for papers by subject code.

        Args:
            subject_code (str): Subject code to search for
            max_results (int): Maximum number of results to return

        Returns:
            list[PaperInfo]: List of paper information objects
        """
        self.logger.info(f"Searching for papers with subject code: {subject_code}")

        papers = []

        try:
            # Use the correct search URL and parameters based on the actual form
            search_params = {
                'q': subject_code,
                '_action_search': 'Search',
                '_order': 'bytitle',
                'basic_srchtype': 'ALL',
                '_satisfyall': 'ALL'
            }

            self.logger.info(f"Performing search with params: {search_params}")

            # Perform search request using GET (like the form does)
            response = self.session.get(
                "https://exampaper-um-edu-my.eu1.proxy.openathens.net/cgi/search",
                params=search_params,
                timeout=30
            )

            if response.status_code != 200:
                self.logger.error(f"Search request failed: {response.status_code}")
                return papers

            # Parse search results
            soup = BeautifulSoup(response.content, 'html.parser')

            # Check if we got results
            results_text = soup.find('div', class_='ep_search_controls')
            if results_text:
                text = results_text.get_text()
                self.logger.info(f"Search results info: {text}")

                # Extract number of results
                import re
                match = re.search(r'(\d+)\s+of\s+(\d+)', text)
                if match:
                    total_results = int(match.group(2))
                    self.logger.info(f"Found {total_results} total results")
                else:
                    self.logger.warning("Could not determine number of results")

            papers = self._parse_search_results(soup, subject_code)

            self.logger.info(f"Successfully extracted {len(papers)} papers for {subject_code}")

        except Exception as e:
            self.logger.error(f"Error searching for papers: {e}")

        return papers[:max_results]

    def _parse_search_results(self, soup, subject_code):
        """Parse search results from HTML."""
        papers = []

        # Look for the results table
        results_table = soup.find('table', class_='ep_paginate_list')
        if not results_table:
            self.logger.warning("No results table found with class 'ep_paginate_list'")
            return papers

        # Find all result rows
        result_rows = results_table.find_all('tr', class_='ep_search_result')
        self.logger.info(f"Found {len(result_rows)} result rows")

        for i, row in enumerate(result_rows, 1):
            try:
                self.logger.info(f"Processing result {i}...")
                paper_info = self._extract_paper_info_from_row(row, subject_code)
                if paper_info:
                    papers.append(paper_info)
                    self.logger.info(f"✅ Extracted: {paper_info.title}")
                else:
                    self.logger.warning(f"❌ Could not extract info from result {i}")
            except Exception as e:
                self.logger.warning(f"Error parsing result {i}: {e}")
                continue

        return papers

    def _extract_paper_info_from_row(self, row, subject_code):
        """Extract paper information from a search result row."""
        try:
            # Get all cells in the row
            cells = row.find_all('td')
            if len(cells) < 2:
                self.logger.warning("Row doesn't have enough cells")
                return None

            # The main content is in the second cell
            content_cell = cells[1]

            # Extract the title and basic info
            # Pattern: "WIA1005 (Semester X, YEAR) Title"
            text_content = content_cell.get_text(strip=True)
            self.logger.info(f"Row content: {text_content[:100]}...")

            # Extract semester and year
            semester_year_match = re.search(r'\(Semester (\d+), (\d{4})\)', text_content)
            if semester_year_match:
                semester = semester_year_match.group(1)
                year = semester_year_match.group(2)
            else:
                semester = None
                year = None
                self.logger.warning("Could not extract semester/year info")

            # Find the main paper link (usually the title link)
            title_link = content_cell.find('a', href=True)
            if title_link:
                title = title_link.get_text(strip=True)
                # Remove italic formatting
                title = re.sub(r'[/*]', '', title)
                paper_url = urljoin(self.base_url, title_link.get('href'))
                self.logger.info(f"Found title link: {title}")
            else:
                self.logger.warning("No title link found")
                return None

            # Look for direct PDF download link
            download_url = None

            # Check the third cell (if exists) for PDF links
            if len(cells) > 2:
                pdf_cell = cells[2]
                pdf_links = pdf_cell.find_all('a', href=True)
                for link in pdf_links:
                    href = link.get('href')
                    if href and '.pdf' in href.lower():
                        download_url = urljoin(self.base_url, href)
                        self.logger.info(f"Found direct PDF link: {download_url}")
                        break

            # If no direct PDF link found, try to get it from the paper page
            if not download_url:
                self.logger.info("No direct PDF link found, checking paper page...")
                download_url = self._get_download_url(paper_url)

            if not download_url:
                self.logger.warning(f"No download URL found for: {title}")
                return None

            # Generate a clean title without redundant info (year/semester will be in filename prefix)
            clean_title = f"{subject_code} {title}"

            paper_type = self._determine_paper_type(title)

            return PaperInfo(
                title=clean_title,
                download_url=download_url,
                year=year,
                semester=semester,
                paper_type=paper_type
            )

        except Exception as e:
            self.logger.warning(f"Error extracting paper info: {e}")
            return None

    def _determine_paper_type(self, title):
        """Determine the type of paper from the title."""
        title_lower = title.lower()

        if 'final' in title_lower:
            return 'Final'
        elif 'mid' in title_lower or 'midterm' in title_lower:
            return 'Midterm'
        elif 'quiz' in title_lower:
            return 'Quiz'
        elif 'test' in title_lower:
            return 'Test'
        else:
            return 'Exam'

    def _get_download_url(self, paper_url):
        """Get the actual PDF download URL from the paper page."""
        try:
            self.logger.info(f"Getting download URL from: {paper_url}")
            response = self.session.get(paper_url, timeout=15)
            if response.status_code != 200:
                self.logger.warning(f"Failed to access paper page: {response.status_code}")
                return None

            soup = BeautifulSoup(response.content, 'html.parser')

            # Method 1: Look for direct PDF download links
            download_links = soup.find_all('a', href=True)

            for link in download_links:
                href = link.get('href')
                link_text = link.get_text(strip=True).lower()

                # Look for PDF files or download links
                if href and ('.pdf' in href.lower() or
                             'download' in href.lower() or
                             'download' in link_text or
                             'pdf' in link_text):
                    download_url = urljoin(self.base_url, href)
                    self.logger.info(f"Found download link: {download_url}")
                    return download_url

            # Method 2: Look for repository-specific patterns
            # UM repository often uses /id/eprint/XXXXX/1/filename.pdf
            eprint_links = soup.find_all('a', href=re.compile(r'/\d+/\d+/.*\.pdf$', re.I))
            if eprint_links:
                download_url = urljoin(self.base_url, eprint_links[0].get('href'))
                self.logger.info(f"Found eprint PDF: {download_url}")
                return download_url

            # Method 3: Look for any PDF links
            pdf_links = soup.find_all('a', href=re.compile(r'\.pdf$', re.I))
            if pdf_links:
                download_url = urljoin(self.base_url, pdf_links[0].get('href'))
                self.logger.info(f"Found PDF link: {download_url}")
                return download_url

            # Method 4: Check for embedded objects or iframes
            objects = soup.find_all(['object', 'embed', 'iframe'])
            for obj in objects:
                src = obj.get('src') or obj.get('data')
                if src and '.pdf' in src.lower():
                    download_url = urljoin(self.base_url, src)
                    self.logger.info(f"Found embedded PDF: {download_url}")
                    return download_url

            self.logger.warning(f"No download URL found on page: {paper_url}")

        except Exception as e:
            self.logger.warning(f"Error getting download URL for {paper_url}: {e}")

        return None
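
For reference, a small illustration (not part of the package) of how PaperInfo._generate_filename turns a repository title into a download filename; the title and URL below are hypothetical.

# Illustration only: construct a PaperInfo with made-up values and inspect the
# generated filename.
from umpaper_fetch.scraper.paper_scraper import PaperInfo

paper = PaperInfo(
    title="WIA1005 (Semester 1, 2024) Network Technology Foundation Final Exam",
    download_url="https://example.org/paper.pdf",  # placeholder URL
    year="2024",
    semester="1",
    paper_type="Final",
)
# The subject code and "(Semester 1, 2024)" prefix are stripped, spaces become
# underscores, and the year/semester/type become a prefix:
print(paper.filename)  # Y2024_S1_Final_Network_Technology_Foundation_Final_Exam.pdf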
umpaper_fetch/__init__.py
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
"""
|
2
|
+
umpaper-fetch: Automated downloader for University Malaya past year exam papers.
|
3
|
+
|
4
|
+
This package provides tools to automatically download past year exam papers
|
5
|
+
from University Malaya's repository through an automated browser interface.
|
6
|
+
"""
|
7
|
+
|
8
|
+
__version__ = "1.0.0"
|
9
|
+
__author__ = "Marcus Mah"
|
10
|
+
__email__ = "marcusmah6969@gmail.com"
|
11
|
+
__description__ = "Automated downloader for University Malaya past year exam papers"
|
12
|
+
|
13
|
+
# Import main classes for easier access
|
14
|
+
from .auth.um_authenticator import UMAuthenticator
|
15
|
+
from .scraper.paper_scraper import PaperScraper
|
16
|
+
from .downloader.pdf_downloader import PDFDownloader
|
17
|
+
from .utils.zip_creator import ZipCreator
|
18
|
+
from .utils.logger import setup_logger
|
19
|
+
|
20
|
+
__all__ = [
|
21
|
+
'UMAuthenticator',
|
22
|
+
'PaperScraper',
|
23
|
+
'PDFDownloader',
|
24
|
+
'ZipCreator',
|
25
|
+
'setup_logger'
|
26
|
+
]
|
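
For orientation, a short sketch (not part of the wheel) showing that the re-exports above allow the main classes to be imported from the package root; the typical pipeline is authenticate, scrape, download, then zip.

# Usage sketch only: top-level imports enabled by the __init__.py above.
from umpaper_fetch import (
    UMAuthenticator,
    PaperScraper,
    PDFDownloader,
    ZipCreator,
    setup_logger,
)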
@@ -0,0 +1 @@
# Authentication package
@@ -0,0 +1,119 @@
"""
Chrome Driver Fix Module

Handles Chrome driver setup issues, particularly the Win32 application error
that occurs due to architecture mismatches.
"""

import os
import platform
import logging
from webdriver_manager.chrome import ChromeDriverManager


def get_chrome_driver_path():
    """
    Get Chrome driver path with proper architecture handling.

    Returns:
        str: Path to the Chrome driver executable
    """
    logger = logging.getLogger(__name__)

    try:
        # Determine system architecture
        is_64bit = platform.machine().endswith('64')
        system = platform.system()

        logger.info(f"System: {system}, 64-bit: {is_64bit}")

        # Force specific Chrome driver version/architecture if needed
        if system == "Windows" and is_64bit:
            # Try to get the latest driver for Windows 64-bit
            driver_manager = ChromeDriverManager()
            driver_path = driver_manager.install()

            # Verify the driver is executable
            if os.path.exists(driver_path) and os.access(driver_path, os.X_OK):
                logger.info(f"Chrome driver ready: {driver_path}")
                return driver_path
            else:
                logger.warning(f"Chrome driver not executable: {driver_path}")
                # Try to fix permissions
                try:
                    os.chmod(driver_path, 0o755)
                    if os.access(driver_path, os.X_OK):
                        logger.info("Fixed Chrome driver permissions")
                        return driver_path
                except Exception as perm_error:
                    logger.error(f"Could not fix permissions: {perm_error}")
        else:
            # For other systems, use default behavior
            driver_path = ChromeDriverManager().install()
            return driver_path

    except Exception as e:
        logger.error(f"Chrome driver setup failed: {e}")
        raise

    raise Exception("Could not setup Chrome driver")


def test_chrome_driver(driver_path):
    """
    Test if the Chrome driver is working properly.

    Args:
        driver_path (str): Path to Chrome driver

    Returns:
        bool: True if driver works, False otherwise
    """
    logger = logging.getLogger(__name__)

    try:
        import subprocess

        # Test if the driver can start
        result = subprocess.run(
            [driver_path, '--version'],
            capture_output=True,
            text=True,
            timeout=10
        )

        if result.returncode == 0:
            logger.info(f"Chrome driver test passed: {result.stdout.strip()}")
            return True
        else:
            logger.error(f"Chrome driver test failed: {result.stderr}")
            return False

    except Exception as e:
        logger.error(f"Chrome driver test error: {e}")
        return False


def cleanup_chrome_cache():
    """Clean up problematic Chrome driver cache."""
    logger = logging.getLogger(__name__)

    try:
        import shutil
        from pathlib import Path

        # Get the cache directory
        cache_dir = Path.home() / '.wdm' / 'drivers' / 'chromedriver'

        if cache_dir.exists():
            logger.info(f"Cleaning Chrome driver cache: {cache_dir}")
            shutil.rmtree(cache_dir)
            logger.info("Chrome driver cache cleaned")
            return True
        else:
            logger.info("No Chrome driver cache to clean")
            return True

    except Exception as e:
        logger.error(f"Could not clean Chrome driver cache: {e}")
        return False
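
For orientation, a sketch (not part of the wheel) of wiring these helpers into Selenium 4; webdriver.Chrome and Service are standard Selenium APIs rather than code defined in this package, and the retry-after-cache-cleanup step is an assumption about intended use.

# Usage sketch only: resolve a working chromedriver, retry once after clearing
# the webdriver-manager cache if the first driver fails its self-test.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

driver_path = get_chrome_driver_path()
if not test_chrome_driver(driver_path):
    cleanup_chrome_cache()
    driver_path = get_chrome_driver_path()

driver = webdriver.Chrome(service=Service(driver_path))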