umpaper_fetch-1.0.0-py3-none-any.whl

This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package files as they appear in the public registry.
umpaper_fetch/cli.py ADDED
@@ -0,0 +1,316 @@
+#!/usr/bin/env python3
+"""
+Command-line interface for umpaper-fetch package.
+
+This module provides the main entry point for the um-papers command.
+"""
+
+import argparse
+import getpass
+import logging
+import os
+import sys
+from pathlib import Path
+
+from .auth.um_authenticator import UMAuthenticator
+from .scraper.paper_scraper import PaperScraper
+from .downloader.pdf_downloader import PDFDownloader
+from .utils.zip_creator import ZipCreator
+from .utils.logger import setup_logger
+
+
+def parse_arguments():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Download all past year papers for a UM subject code",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  um-papers
+  um-papers --username 24012345 --subject-code WIA1005
+  um-papers --username 24056789 --subject-code WXES1116 --show-browser
+  um-papers --no-location-prompt --output-dir "C:/Downloads"
+        """
+    )
+
+    parser.add_argument(
+        '--username', '-u',
+        help='UM username (without @siswa.um.edu.my)',
+        type=str
+    )
+
+    parser.add_argument(
+        '--subject-code', '-s',
+        help='Subject code to search for (e.g., WIA1005)',
+        type=str
+    )
+
+    parser.add_argument(
+        '--output-dir', '-o',
+        help='Output directory for downloads (default: ./downloads)',
+        default='./downloads',
+        type=str
+    )
+
+    parser.add_argument(
+        '--no-location-prompt',
+        help='Skip location selection prompt and use default output directory',
+        action='store_true'
+    )
+
+    parser.add_argument(
+        '--show-browser',
+        help='Show browser window (default is headless mode)',
+        action='store_true'
+    )
+
+    parser.add_argument(
+        '--browser', '-b',
+        help='Browser to use (auto, chrome, edge). Default: edge',
+        choices=['auto', 'chrome', 'edge'],
+        default='edge',
+        type=str
+    )
+
+    parser.add_argument(
+        '--timeout',
+        help='Session timeout in seconds (default: 30)',
+        default=30,
+        type=int
+    )
+
+    parser.add_argument(
+        '--max-retries',
+        help='Maximum retry attempts (default: 3)',
+        default=3,
+        type=int
+    )
+
+    parser.add_argument(
+        '--verbose', '-v',
+        help='Enable verbose logging',
+        action='store_true'
+    )
+
+    parser.add_argument(
+        '--version',
+        action='version',
+        version='%(prog)s 1.0.0'
+    )
+
+    return parser.parse_args()
+
+
+def get_credentials(username=None):
+    """Get user credentials securely."""
+    if not username:
+        username = input("Enter your UM username (without @siswa.um.edu.my): ").strip()
+
+    if not username:
+        print("Error: Username cannot be empty")
+        sys.exit(1)
+
+    password = getpass.getpass("Enter your UM password: ")
+
+    if not password:
+        print("Error: Password cannot be empty")
+        sys.exit(1)
+
+    return username, password
+
+
+def get_subject_code(subject_code=None):
+    """Get subject code from user."""
+    if not subject_code:
+        subject_code = input("Enter subject code (e.g., WIA1005): ").strip().upper()
+
+    if not subject_code:
+        print("Error: Subject code cannot be empty")
+        sys.exit(1)
+
+    return subject_code
+
+
+def get_download_location(default_output_dir):
+    """
+    Get custom download location from user.
+
+    Args:
+        default_output_dir (Path): Default output directory
+
+    Returns:
+        Path: User-chosen download location or default
+    """
+    print("\n📂 Download Location Settings")
+    print("=" * 50)
+    print(f"Default location: {default_output_dir.absolute()}")
+    print("\nOptions:")
+    print("1. Use default location (downloads folder)")
+    print("2. Choose custom location")
+
+    while True:
+        choice = input("\nSelect option (1 or 2): ").strip()
+
+        if choice == '1':
+            print(f"✅ Using default location: {default_output_dir.absolute()}")
+            return default_output_dir
+
+        elif choice == '2':
+            while True:
+                custom_path = input("\nEnter custom download path: ").strip()
+
+                if not custom_path:
+                    print("❌ Path cannot be empty. Please try again.")
+                    continue
+
+                try:
+                    # Convert to Path object and expand user home directory (~)
+                    custom_dir = Path(custom_path).expanduser()
+
+                    # Try to create the directory if it doesn't exist
+                    custom_dir.mkdir(parents=True, exist_ok=True)
+
+                    # Test if we can write to this directory
+                    test_file = custom_dir / "test_write.tmp"
+                    try:
+                        test_file.write_text("test")
+                        test_file.unlink()  # Delete test file
+                        print(f"✅ Custom location set: {custom_dir.absolute()}")
+                        return custom_dir
+                    except Exception as write_error:
+                        print(f"❌ Cannot write to this location: {write_error}")
+                        print("Please choose a different path or check permissions.")
+
+                except Exception as path_error:
+                    print(f"❌ Invalid path: {path_error}")
+                    print("Please enter a valid directory path.")
+
+        else:
+            print("❌ Please enter '1' or '2'.")
+
+
+def main():
+    """Main execution function."""
+    args = parse_arguments()
+
+    # Setup logging
+    log_level = logging.DEBUG if args.verbose else logging.INFO
+    logger = setup_logger(log_level)
+
+    try:
+        # Create default output directory
+        default_output_dir = Path(args.output_dir)
+        default_output_dir.mkdir(parents=True, exist_ok=True)
+
+        logger.info("=== UM Past Year Paper Downloader ===")
+
+        # Get credentials
+        username, password = get_credentials(args.username)
+
+        # Get subject code
+        subject_code = get_subject_code(args.subject_code)
+
+        # Get download location
+        if args.no_location_prompt:
+            output_dir = default_output_dir
+            logger.info(f"Using default output directory: {output_dir.absolute()}")
+        else:
+            output_dir = get_download_location(default_output_dir)
+
+        # Show configuration summary
+        print("\n📋 Configuration Summary")
+        print("=" * 50)
+        print(f"Username: {username}")
+        print(f"Subject Code: {subject_code}")
+        print(f"Output Directory: {output_dir.absolute()}")
+        print(f"Browser: {args.browser}")
+        print(f"Headless Mode: {not args.show_browser}")
+        print(f"Timeout: {args.timeout}s")
+        print(f"Max Retries: {args.max_retries}")
+
+        # Confirm before proceeding
+        print(f"\n🚀 Ready to start downloading papers for {subject_code}")
+        confirm = input("Continue? (y/N): ").strip().lower()
+
+        if confirm not in ['y', 'yes']:
+            print("❌ Operation cancelled by user")
+            sys.exit(0)
+
+        print("\n" + "=" * 60)
+        print("🔄 Starting download process...")
+        print("=" * 60)
+
+        # Step 1: Authentication
+        logger.info("Step 1: Authenticating with UM portal...")
+        authenticator = UMAuthenticator(
+            headless=not args.show_browser,
+            browser=args.browser,
+            timeout=args.timeout
+        )
+
+        session = authenticator.login(username, password)
+        if not session:
+            logger.error("❌ Authentication failed")
+            sys.exit(1)
+
+        logger.info("✅ Authentication successful")
+
+        # Step 2: Search for papers
+        logger.info(f"Step 2: Searching for papers with subject code: {subject_code}")
+        scraper = PaperScraper(session)
+        papers = scraper.search_papers(subject_code)
+
+        if not papers:
+            logger.warning(f"❌ No papers found for subject code: {subject_code}")
+            print(f"\n❌ No papers found for subject code: {subject_code}")
+            print("Please check the subject code and try again.")
+            sys.exit(1)
+
+        logger.info(f"✅ Found {len(papers)} papers")
+        print(f"✅ Found {len(papers)} papers")
+
+        # Step 3: Download papers
+        logger.info("Step 3: Downloading papers...")
+        downloader = PDFDownloader(session, output_dir, max_retries=args.max_retries)
+        downloaded_files = downloader.download_papers(papers)
+
+        if not downloaded_files:
+            logger.error("❌ No papers were downloaded successfully")
+            sys.exit(1)
+
+        logger.info(f"✅ Downloaded {len(downloaded_files)} papers")
+
+        # Step 4: Create ZIP archive
+        logger.info("Step 4: Creating ZIP archive...")
+        zip_creator = ZipCreator()
+        zip_path = zip_creator.create_zip(downloaded_files, subject_code, output_dir)
+
+        if zip_path:
+            logger.info(f"✅ ZIP archive created: {zip_path}")
+            print("\n🎉 Success! All papers downloaded and zipped:")
+            print(f"📦 ZIP file: {zip_path}")
+            print(f"📁 Individual files: {output_dir / subject_code}")
+        else:
+            logger.warning("⚠️ ZIP creation failed, but individual files are available")
+            print("\n⚠️ Papers downloaded but ZIP creation failed")
+            print(f"📁 Individual files: {output_dir / subject_code}")
+
+        # Cleanup
+        authenticator.cleanup()
+
+        print("\n✅ Download completed successfully!")
+        print(f"Total papers downloaded: {len(downloaded_files)}")
+
+    except KeyboardInterrupt:
+        logger.info("❌ Operation cancelled by user (Ctrl+C)")
+        print("\n❌ Operation cancelled by user")
+        sys.exit(1)
+    except Exception as e:
+        logger.error(f"❌ Unexpected error: {e}")
+        print(f"\n❌ An error occurred: {e}")
+        print("Check the logs for more details.")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
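
For reference, the pipeline that main() wires together can also be driven from Python directly. The sketch below is illustrative and not part of the package: it assumes umpaper-fetch 1.0.0 is installed, uses only the classes and signatures shown in the diff above, and the credentials and subject code are placeholders.

    from pathlib import Path

    from umpaper_fetch.auth.um_authenticator import UMAuthenticator
    from umpaper_fetch.downloader.pdf_downloader import PDFDownloader
    from umpaper_fetch.scraper.paper_scraper import PaperScraper
    from umpaper_fetch.utils.zip_creator import ZipCreator

    output_dir = Path("./downloads")
    output_dir.mkdir(parents=True, exist_ok=True)

    authenticator = UMAuthenticator(headless=True, browser="edge", timeout=30)
    try:
        # login() returns a requests-style session on success, None on failure
        session = authenticator.login("24012345", "secret")  # placeholder credentials
        if session:
            papers = PaperScraper(session).search_papers("WIA1005")
            downloader = PDFDownloader(session, output_dir, max_retries=3)
            files = downloader.download_papers(papers)
            if files:
                ZipCreator().create_zip(files, "WIA1005", output_dir)
    finally:
        # unlike main(), release the browser even if a step fails
        authenticator.cleanup()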
umpaper_fetch/downloader/__init__.py ADDED
@@ -0,0 +1 @@
+# Downloader package
umpaper_fetch/downloader/pdf_downloader.py ADDED
@@ -0,0 +1,207 @@
+"""
+PDF Downloader Module
+
+Handles downloading PDF files with progress tracking, retry logic,
+and concurrent download management.
+"""
+
+import logging
+import os
+import time
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+import requests
+
+
+class PDFDownloader:
+    """Downloads PDF files with progress tracking and retry logic."""
+
+    def __init__(self, session, output_dir, max_retries=3, max_workers=4):
+        """Initialize the PDF downloader."""
+        self.session = session
+        self.output_dir = Path(output_dir)
+        self.max_retries = max_retries
+        self.max_workers = max_workers
+        self.logger = logging.getLogger(__name__)
+
+        # Ensure output directory exists
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+    def download_papers(self, papers):
+        """
+        Download multiple papers with progress tracking.
+
+        Args:
+            papers (list): List of PaperInfo objects to download
+
+        Returns:
+            list: List of successfully downloaded file paths
+        """
+        if not papers:
+            return []
+
+        self.logger.info(f"Starting download of {len(papers)} papers...")
+        downloaded_files = []
+
+        # Create progress bar
+        with tqdm(total=len(papers), desc="Downloading papers", unit="file") as pbar:
+            # Use ThreadPoolExecutor for concurrent downloads
+            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                # Submit all download tasks
+                future_to_paper = {
+                    executor.submit(self._download_paper, paper): paper
+                    for paper in papers
+                }
+
+                # Process completed downloads
+                for future in as_completed(future_to_paper):
+                    paper = future_to_paper[future]
+                    try:
+                        file_path = future.result()
+                        if file_path:
+                            downloaded_files.append(file_path)
+                            self.logger.debug(f"Downloaded: {paper.filename}")
+                        else:
+                            self.logger.warning(f"Failed to download: {paper.title}")
+                    except Exception as e:
+                        self.logger.error(f"Error downloading {paper.title}: {e}")
+                    finally:
+                        pbar.update(1)
+
+        self.logger.info(f"Downloaded {len(downloaded_files)}/{len(papers)} papers successfully")
+        return downloaded_files
+
+    def _download_paper(self, paper):
+        """
+        Download a single paper with retry logic.
+
+        Args:
+            paper (PaperInfo): Paper information object
+
+        Returns:
+            str: Path to downloaded file, or None if failed
+        """
+        for attempt in range(self.max_retries + 1):
+            try:
+                file_path = self._download_file(paper.download_url, paper.filename)
+                if file_path and self._verify_download(file_path):
+                    return file_path
+
+            except Exception as e:
+                self.logger.warning(
+                    f"Download attempt {attempt + 1} failed for {paper.title}: {e}"
+                )
+
+            if attempt < self.max_retries:
+                # Wait before retry with exponential backoff
+                wait_time = 2 ** attempt
+                time.sleep(wait_time)
+
+        self.logger.error(f"Failed to download after {self.max_retries + 1} attempts: {paper.title}")
+        return None
+
+    def _download_file(self, url, filename):
+        """
+        Download a single file from URL.
+
+        Args:
+            url (str): Download URL
+            filename (str): Target filename
+
+        Returns:
+            str: Path to downloaded file
+        """
+        file_path = self.output_dir / filename
+
+        # Avoid re-downloading if file already exists and is valid
+        if file_path.exists() and self._verify_download(file_path):
+            self.logger.debug(f"File already exists: {filename}")
+            return str(file_path)
+
+        # Start download
+        response = self.session.get(url, stream=True, timeout=60)
+        response.raise_for_status()
+
+        # Check if response is actually a PDF
+        content_type = response.headers.get('content-type', '').lower()
+        if 'pdf' not in content_type and 'application/octet-stream' not in content_type:
+            self.logger.warning(f"Unexpected content type for {filename}: {content_type}")
+
+        # Write the file to disk in chunks
+        total_size = int(response.headers.get('content-length', 0))
+
+        with open(file_path, 'wb') as f:
+            if total_size > 0:
+                # Content length is known: tally bytes as chunks arrive
+                downloaded = 0
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:
+                        f.write(chunk)
+                        downloaded += len(chunk)
+            else:
+                # For files without content-length header
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:
+                        f.write(chunk)
+
+        self.logger.debug(f"Downloaded {filename} ({file_path.stat().st_size} bytes)")
+        return str(file_path)
+
+    def _verify_download(self, file_path):
+        """
+        Verify that the downloaded file is valid.
+
+        Args:
+            file_path (str or Path): Path to the downloaded file
+
+        Returns:
+            bool: True if file is valid, False otherwise
+        """
+        try:
+            file_path = Path(file_path)
+
+            # Check if file exists and has content
+            if not file_path.exists() or file_path.stat().st_size == 0:
+                return False
+
+            # Basic PDF validation - check PDF header
+            with open(file_path, 'rb') as f:
+                header = f.read(8)
+                if not header.startswith(b'%PDF'):
+                    self.logger.warning(f"File does not appear to be a PDF: {file_path.name}")
+                    # Don't reject non-PDF files completely, might be valid documents
+                    # return False
+
+            return True
+
+        except Exception as e:
+            self.logger.warning(f"Error verifying file {file_path}: {e}")
+            return False
+
+    def cleanup_failed_downloads(self):
+        """Remove any incomplete or corrupted downloads."""
+        cleaned_count = 0
+
+        for file_path in self.output_dir.glob('*.pdf'):
+            if not self._verify_download(file_path):
+                try:
+                    file_path.unlink()
+                    cleaned_count += 1
+                    self.logger.debug(f"Removed invalid file: {file_path.name}")
+                except Exception as e:
+                    self.logger.warning(f"Could not remove invalid file {file_path}: {e}")
+
+        if cleaned_count > 0:
+            self.logger.info(f"Cleaned up {cleaned_count} invalid downloaded files")
+
+    def get_download_stats(self):
+        """Get statistics about downloaded files."""
+        pdf_files = list(self.output_dir.glob('*.pdf'))
+        total_size = sum(f.stat().st_size for f in pdf_files)
+
+        return {
+            'count': len(pdf_files),
+            'total_size_mb': total_size / (1024 * 1024),
+            'files': [f.name for f in pdf_files]
+        }
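
A note on the retry policy above: _download_paper makes max_retries + 1 attempts and sleeps 2 ** attempt seconds between them, so with the default max_retries=3 the waits are 1 s, 2 s, and 4 s before the final attempt. The class can also be used on its own; a minimal sketch, assuming a plain requests.Session (which matches how the code calls session.get) and an existing downloads directory:

    import requests

    from umpaper_fetch.downloader.pdf_downloader import PDFDownloader

    # Any requests-compatible session works; in normal use the authenticated
    # session would come from UMAuthenticator.login().
    session = requests.Session()
    downloader = PDFDownloader(session, "./downloads", max_retries=3, max_workers=4)

    downloader.cleanup_failed_downloads()  # removes zero-byte or corrupt PDFs
    stats = downloader.get_download_stats()
    print(f"{stats['count']} files, {stats['total_size_mb']:.1f} MB")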
umpaper_fetch/scraper/__init__.py ADDED
@@ -0,0 +1 @@
+# Scraper package