umpaper-fetch 1.0.0 (umpaper_fetch-1.0.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- auth/__init__.py +1 -0
- auth/chrome_fix.py +119 -0
- auth/um_authenticator.py +521 -0
- downloader/__init__.py +1 -0
- downloader/pdf_downloader.py +207 -0
- scraper/__init__.py +1 -0
- scraper/paper_scraper.py +316 -0
- umpaper_fetch/__init__.py +26 -0
- umpaper_fetch/auth/__init__.py +1 -0
- umpaper_fetch/auth/chrome_fix.py +119 -0
- umpaper_fetch/auth/um_authenticator.py +521 -0
- umpaper_fetch/cli.py +316 -0
- umpaper_fetch/downloader/__init__.py +1 -0
- umpaper_fetch/downloader/pdf_downloader.py +207 -0
- umpaper_fetch/scraper/__init__.py +1 -0
- umpaper_fetch/scraper/paper_scraper.py +316 -0
- umpaper_fetch/utils/__init__.py +1 -0
- umpaper_fetch/utils/logger.py +67 -0
- umpaper_fetch/utils/zip_creator.py +299 -0
- umpaper_fetch-1.0.0.dist-info/METADATA +462 -0
- umpaper_fetch-1.0.0.dist-info/RECORD +28 -0
- umpaper_fetch-1.0.0.dist-info/WHEEL +5 -0
- umpaper_fetch-1.0.0.dist-info/entry_points.txt +2 -0
- umpaper_fetch-1.0.0.dist-info/licenses/LICENSE +22 -0
- umpaper_fetch-1.0.0.dist-info/top_level.txt +5 -0
- utils/__init__.py +1 -0
- utils/logger.py +67 -0
- utils/zip_creator.py +299 -0
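The listing shows the same modules shipped twice: once as the top-level packages auth, downloader, scraper, and utils, and once under the umpaper_fetch package, with top_level.txt declaring five top-level names. A minimal import sketch under that assumption (the packaged CLI may rely only on the umpaper_fetch copies):

# Import sketch based on the file listing above; both copies appear identical
# by line count, so either path should expose the same classes.
from scraper.paper_scraper import PaperScraper
from umpaper_fetch.scraper.paper_scraper import PaperScraper as NamespacedPaperScraper
from umpaper_fetch.utils.zip_creator import ZipCreator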
@@ -0,0 +1,316 @@
"""
Paper Scraper Module

Handles searching for papers by subject code and extracting download URLs
and metadata from the UM exam paper repository.
"""

import logging
import re
from urllib.parse import urljoin, parse_qs, urlparse
from bs4 import BeautifulSoup
import requests


class PaperInfo:
    """Data class for storing paper information."""

    def __init__(self, title, download_url, year=None, semester=None, paper_type=None):
        self.title = title
        self.download_url = download_url
        self.year = year
        self.semester = semester
        self.paper_type = paper_type
        self.filename = self._generate_filename()

    def _generate_filename(self):
        """Generate a clean filename for the paper."""
        # Extract useful parts from title - remove subject code and semester/year info to avoid duplication
        title_to_clean = self.title

        # Remove subject code pattern from title (e.g., "WIA1005 (Semester 1, 2024)")
        title_to_clean = re.sub(r'[A-Z]{2,4}\d{4}\s*\([^)]+\)\s*', '', title_to_clean)

        # Clean the remaining title
        clean_title = re.sub(r'[^\w\s-]', '', title_to_clean)
        clean_title = re.sub(r'\s+', '_', clean_title.strip())

        # Add year and semester if available
        parts = []
        if self.year:
            parts.append(f"Y{self.year}")
        if self.semester:
            parts.append(f"S{self.semester}")
        if self.paper_type:
            parts.append(self.paper_type)

        if parts:
            filename = f"{'_'.join(parts)}_{clean_title}.pdf"
        else:
            filename = f"{clean_title}.pdf"

        # Ensure filename is not too long
        if len(filename) > 100:
            filename = filename[:95] + ".pdf"

        return filename

    def __str__(self):
        return f"PaperInfo(title='{self.title}', year={self.year}, semester={self.semester})"


class PaperScraper:
    """Scrapes exam papers from UM repository."""

    def __init__(self, session):
        """Initialize scraper with authenticated session."""
        self.session = session
        self.base_url = "https://exampaper-um-edu-my.eu1.proxy.openathens.net"
        self.search_url = f"{self.base_url}/cgi/search"
        self.logger = logging.getLogger(__name__)

    def search_papers(self, subject_code, max_results=100):
        """
        Search for papers by subject code.

        Args:
            subject_code (str): Subject code to search for
            max_results (int): Maximum number of results to return

        Returns:
            list[PaperInfo]: List of paper information objects
        """
        self.logger.info(f"Searching for papers with subject code: {subject_code}")

        papers = []

        try:
            # Use the correct search URL and parameters based on the actual form
            search_params = {
                'q': subject_code,
                '_action_search': 'Search',
                '_order': 'bytitle',
                'basic_srchtype': 'ALL',
                '_satisfyall': 'ALL'
            }

            self.logger.info(f"Performing search with params: {search_params}")

            # Perform search request using GET (like the form does)
            response = self.session.get(
                "https://exampaper-um-edu-my.eu1.proxy.openathens.net/cgi/search",
                params=search_params,
                timeout=30
            )

            if response.status_code != 200:
                self.logger.error(f"Search request failed: {response.status_code}")
                return papers

            # Parse search results
            soup = BeautifulSoup(response.content, 'html.parser')

            # Check if we got results
            results_text = soup.find('div', class_='ep_search_controls')
            if results_text:
                text = results_text.get_text()
                self.logger.info(f"Search results info: {text}")

                # Extract number of results
                import re
                match = re.search(r'(\d+)\s+of\s+(\d+)', text)
                if match:
                    total_results = int(match.group(2))
                    self.logger.info(f"Found {total_results} total results")
                else:
                    self.logger.warning("Could not determine number of results")

            papers = self._parse_search_results(soup, subject_code)

            self.logger.info(f"Successfully extracted {len(papers)} papers for {subject_code}")

        except Exception as e:
            self.logger.error(f"Error searching for papers: {e}")

        return papers[:max_results]

    def _parse_search_results(self, soup, subject_code):
        """Parse search results from HTML."""
        papers = []

        # Look for the results table
        results_table = soup.find('table', class_='ep_paginate_list')
        if not results_table:
            self.logger.warning("No results table found with class 'ep_paginate_list'")
            return papers

        # Find all result rows
        result_rows = results_table.find_all('tr', class_='ep_search_result')
        self.logger.info(f"Found {len(result_rows)} result rows")

        for i, row in enumerate(result_rows, 1):
            try:
                self.logger.info(f"Processing result {i}...")
                paper_info = self._extract_paper_info_from_row(row, subject_code)
                if paper_info:
                    papers.append(paper_info)
                    self.logger.info(f"✅ Extracted: {paper_info.title}")
                else:
                    self.logger.warning(f"❌ Could not extract info from result {i}")
            except Exception as e:
                self.logger.warning(f"Error parsing result {i}: {e}")
                continue

        return papers

    def _extract_paper_info_from_row(self, row, subject_code):
        """Extract paper information from a search result row."""
        try:
            # Get all cells in the row
            cells = row.find_all('td')
            if len(cells) < 2:
                self.logger.warning("Row doesn't have enough cells")
                return None

            # The main content is in the second cell
            content_cell = cells[1]

            # Extract the title and basic info
            # Pattern: "WIA1005 (Semester X, YEAR) Title"
            text_content = content_cell.get_text(strip=True)
            self.logger.info(f"Row content: {text_content[:100]}...")

            # Extract semester and year
            semester_year_match = re.search(r'\(Semester (\d+), (\d{4})\)', text_content)
            if semester_year_match:
                semester = semester_year_match.group(1)
                year = semester_year_match.group(2)
            else:
                semester = None
                year = None
                self.logger.warning("Could not extract semester/year info")

            # Find the main paper link (usually the title link)
            title_link = content_cell.find('a', href=True)
            if title_link:
                title = title_link.get_text(strip=True)
                # Remove italic formatting
                title = re.sub(r'[/*]', '', title)
                paper_url = urljoin(self.base_url, title_link.get('href'))
                self.logger.info(f"Found title link: {title}")
            else:
                self.logger.warning("No title link found")
                return None

            # Look for direct PDF download link
            download_url = None

            # Check the third cell (if exists) for PDF links
            if len(cells) > 2:
                pdf_cell = cells[2]
                pdf_links = pdf_cell.find_all('a', href=True)
                for link in pdf_links:
                    href = link.get('href')
                    if href and '.pdf' in href.lower():
                        download_url = urljoin(self.base_url, href)
                        self.logger.info(f"Found direct PDF link: {download_url}")
                        break

            # If no direct PDF link found, try to get it from the paper page
            if not download_url:
                self.logger.info("No direct PDF link found, checking paper page...")
                download_url = self._get_download_url(paper_url)

            if not download_url:
                self.logger.warning(f"No download URL found for: {title}")
                return None

            # Generate a clean title without redundant info (year/semester will be in filename prefix)
            clean_title = f"{subject_code} {title}"

            paper_type = self._determine_paper_type(title)

            return PaperInfo(
                title=clean_title,
                download_url=download_url,
                year=year,
                semester=semester,
                paper_type=paper_type
            )

        except Exception as e:
            self.logger.warning(f"Error extracting paper info: {e}")
            return None

    def _determine_paper_type(self, title):
        """Determine the type of paper from the title."""
        title_lower = title.lower()

        if 'final' in title_lower:
            return 'Final'
        elif 'mid' in title_lower or 'midterm' in title_lower:
            return 'Midterm'
        elif 'quiz' in title_lower:
            return 'Quiz'
        elif 'test' in title_lower:
            return 'Test'
        else:
            return 'Exam'

    def _get_download_url(self, paper_url):
        """Get the actual PDF download URL from the paper page."""
        try:
            self.logger.info(f"Getting download URL from: {paper_url}")
            response = self.session.get(paper_url, timeout=15)
            if response.status_code != 200:
                self.logger.warning(f"Failed to access paper page: {response.status_code}")
                return None

            soup = BeautifulSoup(response.content, 'html.parser')

            # Method 1: Look for direct PDF download links
            download_links = soup.find_all('a', href=True)

            for link in download_links:
                href = link.get('href')
                link_text = link.get_text(strip=True).lower()

                # Look for PDF files or download links
                if href and ('.pdf' in href.lower() or
                             'download' in href.lower() or
                             'download' in link_text or
                             'pdf' in link_text):
                    download_url = urljoin(self.base_url, href)
                    self.logger.info(f"Found download link: {download_url}")
                    return download_url

            # Method 2: Look for repository-specific patterns
            # UM repository often uses /id/eprint/XXXXX/1/filename.pdf
            eprint_links = soup.find_all('a', href=re.compile(r'/\d+/\d+/.*\.pdf$', re.I))
            if eprint_links:
                download_url = urljoin(self.base_url, eprint_links[0].get('href'))
                self.logger.info(f"Found eprint PDF: {download_url}")
                return download_url

            # Method 3: Look for any PDF links
            pdf_links = soup.find_all('a', href=re.compile(r'\.pdf$', re.I))
            if pdf_links:
                download_url = urljoin(self.base_url, pdf_links[0].get('href'))
                self.logger.info(f"Found PDF link: {download_url}")
                return download_url

            # Method 4: Check for embedded objects or iframes
            objects = soup.find_all(['object', 'embed', 'iframe'])
            for obj in objects:
                src = obj.get('src') or obj.get('data')
                if src and '.pdf' in src.lower():
                    download_url = urljoin(self.base_url, src)
                    self.logger.info(f"Found embedded PDF: {download_url}")
                    return download_url

            self.logger.warning(f"No download URL found on page: {paper_url}")

        except Exception as e:
            self.logger.warning(f"Error getting download URL for {paper_url}: {e}")

        return None
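A minimal usage sketch for the scraper above. It assumes `session` is an already-authenticated requests.Session (in this package that would come from the authenticator module, which is not part of this hunk); the subject code is the WIA1005 example used in the code comments.

# Usage sketch (assumption: `session` carries valid OpenAthens login cookies).
import requests
from scraper.paper_scraper import PaperScraper

session = requests.Session()  # placeholder; real use needs the authenticated session
scraper = PaperScraper(session)
papers = scraper.search_papers("WIA1005", max_results=20)
for paper in papers:
    print(paper.filename, "->", paper.download_url)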
@@ -0,0 +1 @@
# Utils package
@@ -0,0 +1,67 @@
"""
Logging Configuration Module

Sets up logging configuration for the UM Past Year Paper Downloader.
"""

import logging
import sys
from pathlib import Path
from datetime import datetime


def setup_logger(level=logging.INFO, log_file=None):
    """
    Set up logging configuration.

    Args:
        level (int): Logging level (default: INFO)
        log_file (str): Optional log file path

    Returns:
        logging.Logger: Configured logger instance
    """
    # Create logs directory if it doesn't exist
    if log_file is None:
        logs_dir = Path("logs")
        logs_dir.mkdir(exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_file = logs_dir / f"um_downloader_{timestamp}.log"

    # Create formatter
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    # Configure root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(level)

    # Clear any existing handlers
    root_logger.handlers.clear()

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(level)
    console_handler.setFormatter(formatter)
    root_logger.addHandler(console_handler)

    # File handler
    if log_file:
        file_handler = logging.FileHandler(log_file, encoding='utf-8')
        file_handler.setLevel(logging.DEBUG)  # Always log everything to file
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)

    # Suppress some noisy third-party loggers
    logging.getLogger('urllib3').setLevel(logging.WARNING)
    logging.getLogger('selenium').setLevel(logging.WARNING)
    logging.getLogger('webdriver_manager').setLevel(logging.WARNING)

    logger = logging.getLogger(__name__)
    logger.info(f"Logging initialized - Level: {logging.getLevelName(level)}")
    if log_file:
        logger.info(f"Log file: {log_file}")

    return root_logger
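A short sketch of how setup_logger above can be called; the default path logs/um_downloader_<timestamp>.log follows from the function's own defaults.

# Usage sketch for setup_logger.
import logging
from utils.logger import setup_logger

# Console output plus a timestamped file under ./logs (the function's default)
logger = setup_logger(level=logging.INFO)

# Or direct the file handler to an explicit path instead
# logger = setup_logger(level=logging.DEBUG, log_file="run.log")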
@@ -0,0 +1,299 @@
"""
ZIP Creator Module

Creates organized ZIP archives with proper file naming and structure.
"""

import logging
import zipfile
from pathlib import Path
import os


class ZipCreator:
    """Creates organized ZIP archives from downloaded papers."""

    def __init__(self):
        """Initialize the ZIP creator."""
        self.logger = logging.getLogger(__name__)

    def create_zip(self, file_paths, zip_path, subject_code):
        """
        Create a ZIP archive from downloaded files.

        Args:
            file_paths (list): List of file paths to include
            zip_path (str or Path): Output ZIP file path
            subject_code (str): Subject code for organization

        Returns:
            str: Path to created ZIP file
        """
        zip_path = Path(zip_path)

        if not file_paths:
            self.logger.warning("No files to zip")
            return None

        self.logger.info(f"Creating ZIP archive: {zip_path.name}")

        try:
            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=6) as zipf:
                # Organize files by year and semester
                organized_files = self._organize_files(file_paths)

                # Add files to ZIP with organized structure
                for file_path in file_paths:
                    file_path = Path(file_path)

                    if not file_path.exists():
                        self.logger.warning(f"File not found, skipping: {file_path}")
                        continue

                    # Determine archive path based on organization
                    archive_path = self._get_archive_path(file_path, subject_code, organized_files)

                    # Add file to ZIP
                    zipf.write(file_path, archive_path)
                    self.logger.debug(f"Added to ZIP: {archive_path}")

                # Add a README file with information
                readme_content = self._generate_readme(subject_code, file_paths)
                zipf.writestr(f"{subject_code}_README.txt", readme_content)

            # Verify ZIP was created successfully
            if zip_path.exists() and zip_path.stat().st_size > 0:
                self.logger.info(f"ZIP archive created successfully: {zip_path}")
                self.logger.info(f"Archive size: {zip_path.stat().st_size / (1024*1024):.2f} MB")
                return str(zip_path)
            else:
                self.logger.error("ZIP archive creation failed")
                return None

        except Exception as e:
            self.logger.error(f"Error creating ZIP archive: {e}")
            return None

    def _organize_files(self, file_paths):
        """
        Organize files by extracting year and semester information.

        Args:
            file_paths (list): List of file paths

        Returns:
            dict: Dictionary mapping file paths to organization info
        """
        organized = {}

        for file_path in file_paths:
            file_path = Path(file_path)
            filename = file_path.name

            # Extract year and semester from filename
            year = self._extract_year_from_filename(filename)
            semester = self._extract_semester_from_filename(filename)
            paper_type = self._extract_paper_type_from_filename(filename)

            organized[str(file_path)] = {
                'year': year,
                'semester': semester,
                'paper_type': paper_type,
                'original_name': filename
            }

        return organized

    def _get_archive_path(self, file_path, subject_code, organized_files):
        """
        Get the archive path for a file within the ZIP.

        Args:
            file_path (Path): Original file path
            subject_code (str): Subject code
            organized_files (dict): Organization information

        Returns:
            str: Path within the ZIP archive
        """
        file_info = organized_files.get(str(file_path), {})

        # Build hierarchical path: SubjectCode/Year/filename (simplified structure)
        path_parts = [subject_code]

        year = file_info.get('year')

        if year:
            path_parts.append(f"Year_{year}")
        else:
            # If no year info, put in "Unsorted" folder
            path_parts.append("Unsorted")

        # Use original filename or clean it up
        filename = file_info.get('original_name', file_path.name)
        path_parts.append(filename)

        return '/'.join(path_parts)

    def _extract_year_from_filename(self, filename):
        """Extract year from filename."""
        import re

        # Look for 4-digit year (20xx)
        year_match = re.search(r'(20\d{2})', filename)
        if year_match:
            return year_match.group(1)

        # Look for Y followed by year
        year_match = re.search(r'Y(20\d{2}|0\d|1\d|2\d)', filename)
        if year_match:
            year = year_match.group(1)
            if len(year) == 2:
                # Convert 2-digit to 4-digit year
                year_int = int(year)
                if year_int <= 30:  # Assume 00-30 means 2000-2030
                    return f"20{year}"
                else:  # 31-99 means 1931-1999
                    return f"19{year}"
            return year

        return None

    def _extract_semester_from_filename(self, filename):
        """Extract semester from filename."""
        import re

        filename_lower = filename.lower()

        # Look for S1, S2, Sem1, Sem2, Semester 1, etc.
        if re.search(r's1|sem1|semester\s*1', filename_lower):
            return '1'
        elif re.search(r's2|sem2|semester\s*2', filename_lower):
            return '2'
        elif re.search(r's3|sem3|semester\s*3', filename_lower):
            return '3'

        return None

    def _extract_paper_type_from_filename(self, filename):
        """Extract paper type from filename."""
        filename_lower = filename.lower()

        if 'final' in filename_lower:
            return 'Final_Exam'
        elif any(word in filename_lower for word in ['mid', 'midterm']):
            return 'Midterm_Exam'
        elif 'quiz' in filename_lower:
            return 'Quiz'
        elif 'test' in filename_lower:
            return 'Test'
        elif 'assignment' in filename_lower:
            return 'Assignment'

        return 'Exam'

    def _generate_readme(self, subject_code, file_paths):
        """Generate README content for the ZIP archive."""
        content = f"""
{subject_code} Past Year Papers
===============================

This archive contains past year examination papers for {subject_code}.

Archive Contents:
- Total files: {len(file_paths)}
- Subject: {subject_code}
- Downloaded: {self._get_current_timestamp()}

File Organization:
- Files are organized by Year only
- Naming convention: Year_Semester_Type_Title.pdf

Usage Instructions:
1. Extract the archive to your desired location
2. Papers are organized in folders by year
3. Each year folder contains all papers for that year
4. File names include the year, semester, and exam type for identification

Notes:
- All papers are in PDF format
- Files maintain their original names with additional organization metadata
- This archive was created using the UM Past Year Paper Downloader tool

For questions or issues, please refer to the tool documentation.

Generated by UM Past Year Paper Downloader
==========================================
"""
        return content.strip()

    def _get_current_timestamp(self):
        """Get current timestamp as string."""
        from datetime import datetime
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def verify_zip(self, zip_path):
        """
        Verify that the ZIP file is valid and contains expected files.

        Args:
            zip_path (str or Path): Path to ZIP file

        Returns:
            bool: True if ZIP is valid, False otherwise
        """
        try:
            zip_path = Path(zip_path)

            if not zip_path.exists():
                return False

            with zipfile.ZipFile(zip_path, 'r') as zipf:
                # Test the ZIP file
                bad_file = zipf.testzip()
                if bad_file:
                    self.logger.error(f"Corrupted file in ZIP: {bad_file}")
                    return False

                # Check if ZIP has content
                file_list = zipf.namelist()
                if not file_list:
                    self.logger.error("ZIP file is empty")
                    return False

                self.logger.info(f"ZIP verification successful: {len(file_list)} files")
                return True

        except Exception as e:
            self.logger.error(f"Error verifying ZIP file: {e}")
            return False

    def extract_zip_info(self, zip_path):
        """
        Extract information about the ZIP archive.

        Args:
            zip_path (str or Path): Path to ZIP file

        Returns:
            dict: Information about the ZIP archive
        """
        try:
            zip_path = Path(zip_path)

            with zipfile.ZipFile(zip_path, 'r') as zipf:
                file_list = zipf.namelist()
                total_size = sum(info.file_size for info in zipf.infolist())
                compressed_size = zip_path.stat().st_size

                return {
                    'file_count': len(file_list),
                    'total_uncompressed_size_mb': total_size / (1024 * 1024),
                    'compressed_size_mb': compressed_size / (1024 * 1024),
                    'compression_ratio': (1 - compressed_size / total_size) * 100 if total_size > 0 else 0,
                    'files': file_list
                }

        except Exception as e:
            self.logger.error(f"Error extracting ZIP info: {e}")
            return None
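A minimal sketch tying the ZipCreator methods above together; the PDF paths are hypothetical placeholders standing in for files produced by the downloader module.

# Usage sketch for ZipCreator (paths below are hypothetical).
from utils.zip_creator import ZipCreator

creator = ZipCreator()
downloaded = [
    "downloads/Y2023_S1_Final_Exam_Paper.pdf",
    "downloads/Y2022_S2_Midterm_Exam_Paper.pdf",
]
zip_file = creator.create_zip(downloaded, "WIA1005_papers.zip", "WIA1005")
if zip_file and creator.verify_zip(zip_file):
    info = creator.extract_zip_info(zip_file)
    print(f"{info['file_count']} files, {info['compressed_size_mb']:.2f} MB compressed")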