umpaper_fetch-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- auth/__init__.py +1 -0
- auth/chrome_fix.py +119 -0
- auth/um_authenticator.py +521 -0
- downloader/__init__.py +1 -0
- downloader/pdf_downloader.py +207 -0
- scraper/__init__.py +1 -0
- scraper/paper_scraper.py +316 -0
- umpaper_fetch/__init__.py +26 -0
- umpaper_fetch/auth/__init__.py +1 -0
- umpaper_fetch/auth/chrome_fix.py +119 -0
- umpaper_fetch/auth/um_authenticator.py +521 -0
- umpaper_fetch/cli.py +316 -0
- umpaper_fetch/downloader/__init__.py +1 -0
- umpaper_fetch/downloader/pdf_downloader.py +207 -0
- umpaper_fetch/scraper/__init__.py +1 -0
- umpaper_fetch/scraper/paper_scraper.py +316 -0
- umpaper_fetch/utils/__init__.py +1 -0
- umpaper_fetch/utils/logger.py +67 -0
- umpaper_fetch/utils/zip_creator.py +299 -0
- umpaper_fetch-1.0.0.dist-info/METADATA +462 -0
- umpaper_fetch-1.0.0.dist-info/RECORD +28 -0
- umpaper_fetch-1.0.0.dist-info/WHEEL +5 -0
- umpaper_fetch-1.0.0.dist-info/entry_points.txt +2 -0
- umpaper_fetch-1.0.0.dist-info/licenses/LICENSE +22 -0
- umpaper_fetch-1.0.0.dist-info/top_level.txt +5 -0
- utils/__init__.py +1 -0
- utils/logger.py +67 -0
- utils/zip_creator.py +299 -0
umpaper_fetch/cli.py
ADDED
@@ -0,0 +1,316 @@
#!/usr/bin/env python3
"""
Command-line interface for umpaper-fetch package.

This module provides the main entry point for the um-papers command.
"""

import argparse
import getpass
import logging
import os
import sys
from pathlib import Path

from .auth.um_authenticator import UMAuthenticator
from .scraper.paper_scraper import PaperScraper
from .downloader.pdf_downloader import PDFDownloader
from .utils.zip_creator import ZipCreator
from .utils.logger import setup_logger


def parse_arguments():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description="Download all past year papers for a UM subject code",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  um-papers
  um-papers --username 24012345 --subject-code WIA1005
  um-papers --username 24056789 --subject-code WXES1116 --show-browser
  um-papers --no-location-prompt --output-dir "C:/Downloads"
        """
    )

    parser.add_argument(
        '--username', '-u',
        help='UM username (without @siswa.um.edu.my)',
        type=str
    )

    parser.add_argument(
        '--subject-code', '-s',
        help='Subject code to search for (e.g., WIA1005)',
        type=str
    )

    parser.add_argument(
        '--output-dir', '-o',
        help='Output directory for downloads (default: ./downloads)',
        default='./downloads',
        type=str
    )

    parser.add_argument(
        '--no-location-prompt',
        help='Skip location selection prompt and use default output directory',
        action='store_true'
    )

    parser.add_argument(
        '--show-browser',
        help='Show browser window (default is headless mode)',
        action='store_true'
    )

    parser.add_argument(
        '--browser', '-b',
        help='Browser to use (auto, chrome, edge). Default: edge',
        choices=['auto', 'chrome', 'edge'],
        default='edge',
        type=str
    )

    parser.add_argument(
        '--timeout',
        help='Session timeout in seconds (default: 30)',
        default=30,
        type=int
    )

    parser.add_argument(
        '--max-retries',
        help='Maximum retry attempts (default: 3)',
        default=3,
        type=int
    )

    parser.add_argument(
        '--verbose', '-v',
        help='Enable verbose logging',
        action='store_true'
    )

    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s 1.0.0'
    )

    return parser.parse_args()


def get_credentials(username=None):
    """Get user credentials securely."""
    if not username:
        username = input("Enter your UM username (without @siswa.um.edu.my): ").strip()

    if not username:
        print("Error: Username cannot be empty")
        sys.exit(1)

    password = getpass.getpass("Enter your UM password: ")

    if not password:
        print("Error: Password cannot be empty")
        sys.exit(1)

    return username, password


def get_subject_code(subject_code=None):
    """Get subject code from user."""
    if not subject_code:
        subject_code = input("Enter subject code (e.g., WIA1005): ").strip().upper()

    if not subject_code:
        print("Error: Subject code cannot be empty")
        sys.exit(1)

    return subject_code


def get_download_location(default_output_dir):
    """
    Get custom download location from user.

    Args:
        default_output_dir (Path): Default output directory

    Returns:
        Path: User-chosen download location or default
    """
    print(f"\n📂 Download Location Settings")
    print("="*50)
    print(f"Default location: {default_output_dir.absolute()}")
    print("\nOptions:")
    print("1. Use default location (downloads folder)")
    print("2. Choose custom location")

    while True:
        choice = input("\nSelect option (1 or 2): ").strip()

        if choice == '1':
            print(f"✅ Using default location: {default_output_dir.absolute()}")
            return default_output_dir

        elif choice == '2':
            while True:
                custom_path = input("\nEnter custom download path: ").strip()

                if not custom_path:
                    print("❌ Path cannot be empty. Please try again.")
                    continue

                try:
                    # Convert to Path object and expand user home directory (~)
                    custom_dir = Path(custom_path).expanduser()

                    # Try to create the directory if it doesn't exist
                    custom_dir.mkdir(parents=True, exist_ok=True)

                    # Test if we can write to this directory
                    test_file = custom_dir / "test_write.tmp"
                    try:
                        test_file.write_text("test")
                        test_file.unlink()  # Delete test file
                        print(f"✅ Custom location set: {custom_dir.absolute()}")
                        return custom_dir
                    except Exception as write_error:
                        print(f"❌ Cannot write to this location: {write_error}")
                        print("Please choose a different path or check permissions.")

                except Exception as path_error:
                    print(f"❌ Invalid path: {path_error}")
                    print("Please enter a valid directory path.")

        else:
            print("❌ Please enter '1' or '2'.")


def main():
    """Main execution function."""
    args = parse_arguments()

    # Setup logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logger = setup_logger(log_level)

    try:
        # Create default output directory
        default_output_dir = Path(args.output_dir)
        default_output_dir.mkdir(parents=True, exist_ok=True)

        logger.info("=== UM Past Year Paper Downloader ===")

        # Get credentials
        username, password = get_credentials(args.username)

        # Get subject code
        subject_code = get_subject_code(args.subject_code)

        # Get download location
        if args.no_location_prompt:
            output_dir = default_output_dir
            logger.info(f"Using default output directory: {output_dir.absolute()}")
        else:
            output_dir = get_download_location(default_output_dir)

        # Show configuration summary
        print(f"\n📋 Configuration Summary")
        print("="*50)
        print(f"Username: {username}")
        print(f"Subject Code: {subject_code}")
        print(f"Output Directory: {output_dir.absolute()}")
        print(f"Browser: {args.browser}")
        print(f"Headless Mode: {not args.show_browser}")
        print(f"Timeout: {args.timeout}s")
        print(f"Max Retries: {args.max_retries}")

        # Confirm before proceeding
        print(f"\n🚀 Ready to start downloading papers for {subject_code}")
        confirm = input("Continue? (y/N): ").strip().lower()

        if confirm not in ['y', 'yes']:
            print("❌ Operation cancelled by user")
            sys.exit(0)

        print("\n" + "="*60)
        print("🔄 Starting download process...")
        print("="*60)

        # Step 1: Authentication
        logger.info("Step 1: Authenticating with UM portal...")
        authenticator = UMAuthenticator(
            headless=not args.show_browser,
            browser=args.browser,
            timeout=args.timeout
        )

        session = authenticator.login(username, password)
        if not session:
            logger.error("❌ Authentication failed")
            sys.exit(1)

        logger.info("✅ Authentication successful")

        # Step 2: Search for papers
        logger.info(f"Step 2: Searching for papers with subject code: {subject_code}")
        scraper = PaperScraper(session)
        papers = scraper.search_papers(subject_code)

        if not papers:
            logger.warning(f"❌ No papers found for subject code: {subject_code}")
            print(f"\n❌ No papers found for subject code: {subject_code}")
            print("Please check the subject code and try again.")
            sys.exit(1)

        logger.info(f"✅ Found {len(papers)} papers")
        print(f"✅ Found {len(papers)} papers")

        # Step 3: Download papers
        logger.info("Step 3: Downloading papers...")
        downloader = PDFDownloader(session, output_dir, max_retries=args.max_retries)
        downloaded_files = downloader.download_papers(papers)

        if not downloaded_files:
            logger.error("❌ No papers were downloaded successfully")
            sys.exit(1)

        logger.info(f"✅ Downloaded {len(downloaded_files)} papers")

        # Step 4: Create ZIP archive
        logger.info("Step 4: Creating ZIP archive...")
        zip_creator = ZipCreator()
        zip_path = zip_creator.create_zip(downloaded_files, subject_code, output_dir)

        if zip_path:
            logger.info(f"✅ ZIP archive created: {zip_path}")
            print(f"\n🎉 Success! All papers downloaded and zipped:")
            print(f"📦 ZIP file: {zip_path}")
            print(f"📁 Individual files: {output_dir / subject_code}")
        else:
            logger.warning("⚠️ ZIP creation failed, but individual files are available")
            print(f"\n⚠️ Papers downloaded but ZIP creation failed")
            print(f"📁 Individual files: {output_dir / subject_code}")

        # Cleanup
        authenticator.cleanup()

        print(f"\n✅ Download completed successfully!")
        print(f"Total papers downloaded: {len(downloaded_files)}")

    except KeyboardInterrupt:
        logger.info("❌ Operation cancelled by user (Ctrl+C)")
        print("\n❌ Operation cancelled by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"❌ Unexpected error: {e}")
        print(f"\n❌ An error occurred: {e}")
        print("Check the logs for more details.")
        sys.exit(1)


if __name__ == "__main__":
    main()
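The um-papers console script (presumably declared in entry_points.txt) maps to the main() above, so the same flow can also be driven programmatically. A minimal sketch, assuming the wheel is installed; the flag names come from parse_arguments() above, and main() still prompts interactively for the password and the final confirmation:

import sys

from umpaper_fetch.cli import main

# Simulate a command line; any flags omitted here fall back to the parser
# defaults defined above (browser=edge, timeout=30, max-retries=3).
sys.argv = [
    'um-papers',
    '--username', '24012345',
    '--subject-code', 'WIA1005',
    '--no-location-prompt',
    '--output-dir', './downloads',
]
main()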
@@ -0,0 +1 @@
# Downloader package
@@ -0,0 +1,207 @@
"""
PDF Downloader Module

Handles downloading PDF files with progress tracking, retry logic,
and concurrent download management.
"""

import logging
import os
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import requests


class PDFDownloader:
    """Downloads PDF files with progress tracking and retry logic."""

    def __init__(self, session, output_dir, max_retries=3, max_workers=4):
        """Initialize the PDF downloader."""
        self.session = session
        self.output_dir = Path(output_dir)
        self.max_retries = max_retries
        self.max_workers = max_workers
        self.logger = logging.getLogger(__name__)

        # Ensure output directory exists
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def download_papers(self, papers):
        """
        Download multiple papers with progress tracking.

        Args:
            papers (list): List of PaperInfo objects to download

        Returns:
            list: List of successfully downloaded file paths
        """
        if not papers:
            return []

        self.logger.info(f"Starting download of {len(papers)} papers...")
        downloaded_files = []

        # Create progress bar
        with tqdm(total=len(papers), desc="Downloading papers", unit="file") as pbar:
            # Use ThreadPoolExecutor for concurrent downloads
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit all download tasks
                future_to_paper = {
                    executor.submit(self._download_paper, paper): paper
                    for paper in papers
                }

                # Process completed downloads
                for future in as_completed(future_to_paper):
                    paper = future_to_paper[future]
                    try:
                        file_path = future.result()
                        if file_path:
                            downloaded_files.append(file_path)
                            self.logger.debug(f"Downloaded: {paper.filename}")
                        else:
                            self.logger.warning(f"Failed to download: {paper.title}")
                    except Exception as e:
                        self.logger.error(f"Error downloading {paper.title}: {e}")
                    finally:
                        pbar.update(1)

        self.logger.info(f"Downloaded {len(downloaded_files)}/{len(papers)} papers successfully")
        return downloaded_files

    def _download_paper(self, paper):
        """
        Download a single paper with retry logic.

        Args:
            paper (PaperInfo): Paper information object

        Returns:
            str: Path to downloaded file, or None if failed
        """
        for attempt in range(self.max_retries + 1):
            try:
                file_path = self._download_file(paper.download_url, paper.filename)
                if file_path and self._verify_download(file_path):
                    return file_path

            except Exception as e:
                self.logger.warning(
                    f"Download attempt {attempt + 1} failed for {paper.title}: {e}"
                )

            if attempt < self.max_retries:
                # Wait before retry with exponential backoff
                wait_time = 2 ** attempt
                time.sleep(wait_time)

        self.logger.error(f"Failed to download after {self.max_retries + 1} attempts: {paper.title}")
        return None

    def _download_file(self, url, filename):
        """
        Download a single file from URL.

        Args:
            url (str): Download URL
            filename (str): Target filename

        Returns:
            str: Path to downloaded file
        """
        file_path = self.output_dir / filename

        # Avoid re-downloading if file already exists and is valid
        if file_path.exists() and self._verify_download(file_path):
            self.logger.debug(f"File already exists: {filename}")
            return str(file_path)

        # Start download
        response = self.session.get(url, stream=True, timeout=60)
        response.raise_for_status()

        # Check if response is actually a PDF
        content_type = response.headers.get('content-type', '').lower()
        if 'pdf' not in content_type and 'application/octet-stream' not in content_type:
            self.logger.warning(f"Unexpected content type for {filename}: {content_type}")

        # Write file with progress tracking
        total_size = int(response.headers.get('content-length', 0))

        with open(file_path, 'wb') as f:
            if total_size > 0:
                # Track download progress for large files
                downloaded = 0
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
            else:
                # For files without content-length header
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        self.logger.debug(f"Downloaded {filename} ({file_path.stat().st_size} bytes)")
        return str(file_path)

    def _verify_download(self, file_path):
        """
        Verify that the downloaded file is valid.

        Args:
            file_path (str or Path): Path to the downloaded file

        Returns:
            bool: True if file is valid, False otherwise
        """
        try:
            file_path = Path(file_path)

            # Check if file exists and has content
            if not file_path.exists() or file_path.stat().st_size == 0:
                return False

            # Basic PDF validation - check PDF header
            with open(file_path, 'rb') as f:
                header = f.read(8)
                if not header.startswith(b'%PDF'):
                    self.logger.warning(f"File does not appear to be a PDF: {file_path.name}")
                    # Don't reject non-PDF files completely, might be valid documents
                    # return False

            return True

        except Exception as e:
            self.logger.warning(f"Error verifying file {file_path}: {e}")
            return False

    def cleanup_failed_downloads(self):
        """Remove any incomplete or corrupted downloads."""
        cleaned_count = 0

        for file_path in self.output_dir.glob('*.pdf'):
            if not self._verify_download(file_path):
                try:
                    file_path.unlink()
                    cleaned_count += 1
                    self.logger.debug(f"Removed invalid file: {file_path.name}")
                except Exception as e:
                    self.logger.warning(f"Could not remove invalid file {file_path}: {e}")

        if cleaned_count > 0:
            self.logger.info(f"Cleaned up {cleaned_count} invalid downloaded files")

    def get_download_stats(self):
        """Get statistics about downloaded files."""
        pdf_files = list(self.output_dir.glob('*.pdf'))
        total_size = sum(f.stat().st_size for f in pdf_files)

        return {
            'count': len(pdf_files),
            'total_size_mb': total_size / (1024 * 1024),
            'files': [f.name for f in pdf_files]
        }
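PDFDownloader only relies on the session's get() method and three attributes of each paper object (title, filename, download_url), so it can be exercised outside the CLI. A minimal sketch, assuming the wheel is installed; the Paper namedtuple is a hypothetical stand-in for PaperInfo (defined in paper_scraper.py, not shown here) and the URL is a placeholder:

from collections import namedtuple

import requests

from umpaper_fetch.downloader.pdf_downloader import PDFDownloader

# Hypothetical stand-in with just the attributes the downloader reads.
Paper = namedtuple('Paper', ['title', 'filename', 'download_url'])

# Normally the session comes from UMAuthenticator.login(); a plain
# requests.Session works for publicly reachable URLs.
session = requests.Session()
papers = [Paper('Sample Paper', 'sample.pdf', 'https://example.com/sample.pdf')]

downloader = PDFDownloader(session, './downloads', max_retries=2, max_workers=2)
files = downloader.download_papers(papers)
print(downloader.get_download_stats())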
@@ -0,0 +1 @@
# Scraper package