PyPI - github-ai-scraper - Versions diffs - 0.1.2__py3-none-any.whl - Mend

github-ai-scraper 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

ai_scraper/__init__.py +3 -0
ai_scraper/api/__init__.py +6 -0
ai_scraper/api/github.py +340 -0
ai_scraper/api/gitlab.py +418 -0
ai_scraper/api/rate_limiter.py +120 -0
ai_scraper/api_server.py +196 -0
ai_scraper/auth.py +68 -0
ai_scraper/backup.py +112 -0
ai_scraper/cache.py +95 -0
ai_scraper/classifier.py +135 -0
ai_scraper/cli.py +747 -0
ai_scraper/config.py +237 -0
ai_scraper/config_watcher.py +82 -0
ai_scraper/dedup.py +148 -0
ai_scraper/filters/__init__.py +5 -0
ai_scraper/filters/ai_filter.py +93 -0
ai_scraper/health.py +155 -0
ai_scraper/i18n.py +141 -0
ai_scraper/interactive.py +96 -0
ai_scraper/keywords/__init__.py +5 -0
ai_scraper/keywords/extractor.py +274 -0
ai_scraper/logging_config.py +74 -0
ai_scraper/models/__init__.py +5 -0
ai_scraper/models/repository.py +72 -0
ai_scraper/output/__init__.py +6 -0
ai_scraper/output/excel.py +79 -0
ai_scraper/output/html.py +152 -0
ai_scraper/output/markdown.py +338 -0
ai_scraper/output/rss.py +82 -0
ai_scraper/output/translator.py +303 -0
ai_scraper/plugin_system.py +146 -0
ai_scraper/plugins/__init__.py +5 -0
ai_scraper/retry.py +134 -0
ai_scraper/scheduler.py +84 -0
ai_scraper/scrape_progress.py +99 -0
ai_scraper/secure_storage.py +127 -0
ai_scraper/storage/__init__.py +5 -0
ai_scraper/storage/async_database.py +237 -0
ai_scraper/storage/database.py +456 -0
ai_scraper/webhooks.py +95 -0
github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0

ai_scraper/scrape_progress.py ADDED Viewed

@@ -0,0 +1,99 @@
+"""Scrape progress tracking for resume support."""
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+import hashlib
class ScrapeProgress:
    """Track and persist scrape progress for resume support.

    Progress for each query is kept as a small JSON document inside
    ``storage_dir``. The file name is derived from a short hash of the query
    text, so the query itself is also stored in the document and verified on
    load.
    """

    def __init__(self, storage_dir: Path):
        """Initialize progress tracker.

        Args:
            storage_dir: Directory for storing progress files. It is created
                (including parents) if it does not exist yet.
        """
        self.storage_dir = Path(storage_dir)
        self.storage_dir.mkdir(parents=True, exist_ok=True)

    def _query_to_filename(self, query: str) -> str:
        """Convert query to a safe filename."""
        # MD5 only provides a short, filesystem-safe name here; it is not a
        # security boundary, so tell hashlib (keeps FIPS-restricted builds
        # working). The resulting digest is unchanged.
        query_hash = hashlib.md5(
            query.encode(), usedforsecurity=False
        ).hexdigest()[:8]
        return f"progress_{query_hash}.json"

    def _path_for(self, query: str) -> Path:
        """Return the progress file path used for ``query``."""
        return self.storage_dir / self._query_to_filename(query)

    def save(
        self,
        query: str,
        last_page: int,
        total_found: int,
        timestamp: datetime,
    ) -> None:
        """Save scrape progress.

        The document is written to a temporary sibling file and then moved
        over the real one, so an interrupted write never leaves a truncated
        progress file behind.

        Args:
            query: Search query.
            last_page: Last successfully fetched page.
            total_found: Total repositories found so far.
            timestamp: Timestamp of the progress.
        """
        filepath = self._path_for(query)
        data = {
            "query": query,
            "last_page": last_page,
            "total_found": total_found,
            "timestamp": timestamp.isoformat(),
        }
        tmp_path = filepath.with_name(filepath.name + ".tmp")
        tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
        tmp_path.replace(filepath)

    def load(self, query: str) -> Optional[dict]:
        """Load scrape progress.

        Args:
            query: Search query.

        Returns:
            Progress data (with ``timestamp`` parsed back into a ``datetime``)
            or None if no usable progress is stored for this exact query.
            Missing, unreadable, malformed or foreign (hash-collision) files
            all yield None.
        """
        filepath = self._path_for(query)
        try:
            data = json.loads(filepath.read_text(encoding="utf-8"))
            # The file name is only a 32-bit hash prefix; make sure the file
            # really belongs to this query before resuming from it.
            if data["query"] != query:
                return None
            data["timestamp"] = datetime.fromisoformat(data["timestamp"])
        except (OSError, json.JSONDecodeError, KeyError, TypeError, ValueError):
            # OSError: file missing/unreadable. TypeError: the JSON is valid
            # but not an object, or the timestamp is not a string.
            return None
        return data

    def clear(self, query: str) -> None:
        """Clear progress for a query.

        Args:
            query: Search query.
        """
        # missing_ok avoids the check-then-delete race of exists() + unlink().
        self._path_for(query).unlink(missing_ok=True)

    def has_progress(self, query: str) -> bool:
        """Check if progress exists for a query.

        Args:
            query: Search query.

        Returns:
            True if ``load(query)`` would return progress data.
        """
        return self.load(query) is not None

ai_scraper/secure_storage.py ADDED Viewed

@@ -0,0 +1,127 @@
+"""Secure token storage using encryption."""
+import base64
+import json
+import os
+from pathlib import Path
+from typing import Optional
class SecureStorage:
    """Storage for sensitive tokens, encrypted when ``cryptography`` is present.

    All tokens live in a single file (``tokens.enc``) inside ``storage_dir``
    as one JSON object that is Fernet-encrypted. If the ``cryptography``
    package cannot be imported the JSON is only base64-encoded, which offers
    no confidentiality; a warning is logged in that case.

    NOTE(review): the encryption key is derived from the ``USERNAME`` and
    ``COMPUTERNAME`` environment variables with a fixed salt. Those variables
    are typically only set on Windows; elsewhere the defaults ``user`` /
    ``host`` are used, so the key is effectively constant. The derivation is
    left untouched because changing it would make existing token files
    unreadable, but it should not be treated as strong protection.
    """

    # Class-level default so instances created without __init__ still work.
    _cipher_resolved = False

    def __init__(self, storage_dir: Path):
        """Initialize secure storage.

        Args:
            storage_dir: Directory for storing encrypted tokens. It is created
                (including parents) if it does not exist yet.
        """
        self.storage_dir = Path(storage_dir)
        self.storage_dir.mkdir(parents=True, exist_ok=True)
        self.token_file = self.storage_dir / "tokens.enc"
        self._cipher = None
        self._cipher_resolved = False

    @staticmethod
    def _logger():
        """Return this module's logger (imported lazily)."""
        import logging

        return logging.getLogger(__name__)

    def _get_cipher(self):
        """Get or create cipher for encryption.

        Returns:
            A ``Fernet`` instance, or None when ``cryptography`` is not
            installed (callers then fall back to base64 encoding).
        """
        if self._cipher is not None or self._cipher_resolved:
            return self._cipher
        try:
            from cryptography.fernet import Fernet
            from cryptography.hazmat.primitives import hashes
            from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
        except ImportError:
            # Remember the outcome so the failing import is not retried on
            # every call, and make the downgrade visible.
            self._logger().warning(
                "cryptography is not installed; tokens are only "
                "base64-encoded, not encrypted"
            )
            self._cipher = None
        else:
            # Use a key derived from machine-specific info
            machine_id = (
                f"{os.environ.get('USERNAME', 'user')}"
                f"{os.environ.get('COMPUTERNAME', 'host')}"
            )
            # Use a fixed salt for simplicity
            salt = b'ai_scraper_salt_v1'
            kdf = PBKDF2HMAC(
                algorithm=hashes.SHA256(),
                length=32,
                salt=salt,
                iterations=100000,
            )
            key = base64.urlsafe_b64encode(kdf.derive(machine_id.encode()))
            self._cipher = Fernet(key)
        self._cipher_resolved = True
        return self._cipher

    def store_token(self, name: str, token: str) -> None:
        """Store a token securely.

        Args:
            name: Token name/identifier.
            token: Token value to store.
        """
        tokens = self._load_tokens()
        tokens[name] = token
        self._save_tokens(tokens)

    def get_token(self, name: str) -> Optional[str]:
        """Retrieve a stored token.

        Args:
            name: Token name/identifier.

        Returns:
            Token value or None if not found.
        """
        return self._load_tokens().get(name)

    def delete_token(self, name: str) -> None:
        """Delete a stored token.

        The token file is only rewritten when ``name`` was actually stored.

        Args:
            name: Token name/identifier.
        """
        tokens = self._load_tokens()
        if name in tokens:
            del tokens[name]
            self._save_tokens(tokens)

    def _save_tokens(self, tokens: dict) -> None:
        """Serialize, encrypt (or base64-encode) and persist ``tokens``."""
        payload = json.dumps(tokens).encode()
        cipher = self._get_cipher()
        if cipher:
            blob = cipher.encrypt(payload)
        else:
            # Fallback: base64 encode
            blob = base64.b64encode(payload)
        # Create the file readable by the owner only, and replace the real
        # file in one step so a crash cannot leave it half-written.
        tmp_path = self.token_file.with_name(self.token_file.name + ".tmp")
        flags = (
            os.O_WRONLY | os.O_CREAT | os.O_TRUNC | getattr(os, "O_BINARY", 0)
        )
        fd = os.open(tmp_path, flags, 0o600)
        with os.fdopen(fd, "wb") as handle:
            handle.write(blob)
        os.replace(tmp_path, self.token_file)

    def _load_tokens(self) -> dict:
        """Load tokens from encrypted storage.

        Returns:
            The stored name-to-token mapping, or an empty dict when the file
            is missing, unreadable, cannot be decrypted/decoded, or does not
            contain a JSON object. Failures other than a missing file are
            logged (without logging any file content).
        """
        if not self.token_file.exists():
            return {}
        try:
            cipher = self._get_cipher()
            data = self.token_file.read_bytes()
            if cipher:
                plain = cipher.decrypt(data)
            else:
                # Fallback: base64 decode
                plain = base64.b64decode(data)
            tokens = json.loads(plain.decode())
        except Exception as exc:  # best effort: never crash the caller
            self._logger().warning(
                "Could not read token store %s (%s); treating it as empty",
                self.token_file,
                type(exc).__name__,
            )
            return {}
        if not isinstance(tokens, dict):
            self._logger().warning(
                "Token store %s does not contain a JSON object; "
                "treating it as empty",
                self.token_file,
            )
            return {}
        return tokens

ai_scraper/storage/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Storage module for ai_scraper."""
+from ai_scraper.storage.database import Database
+__all__ = ["Database"]

ai_scraper/storage/async_database.py ADDED Viewed

@@ -0,0 +1,237 @@
+"""Async SQLite database storage."""
+import json
+import aiosqlite
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+from ai_scraper.models.repository import Repository
@dataclass
class TrendResult:
    """Star-growth summary for a single repository.

    Attributes:
        repo_id: Identifier of the repository row.
        repo_name: Name of the repository.
        initial_stars: Star count taken from a stored snapshot.
        current_stars: Star count currently stored for the repository.
        growth_rate: Relative growth between the two counts, expressed as a
            fraction (0.5 means +50%).
    """

    repo_id: int
    repo_name: str
    initial_stars: int
    current_stars: int
    growth_rate: float
class AsyncDatabase:
    """Async SQLite database for storing repository data.

    ``init_db()`` must be awaited before any other coroutine is used; it
    opens the connection and creates the ``repositories`` and ``snapshots``
    tables. Call ``close()`` when done.
    """

    # Columns callers may sort by; anything else falls back to "stars".
    _SORT_FIELDS = ("stars", "updated_at", "relevance_score")

    def __init__(self, db_path: Path):
        """Initialize database.

        Args:
            db_path: Path to SQLite database file. The parent directory is
                created if needed; the file itself is opened by ``init_db()``.
        """
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.conn: Optional[aiosqlite.Connection] = None

    def _require_conn(self) -> aiosqlite.Connection:
        """Return the open connection or fail with a clear message."""
        if self.conn is None:
            raise RuntimeError(
                "Database is not initialized; call init_db() first."
            )
        return self.conn

    @staticmethod
    def _iso(value: Optional[datetime]) -> Optional[str]:
        """Format a datetime for storage, passing None through as NULL."""
        return value.isoformat() if value is not None else None

    async def _fetch_value(self, sql: str):
        """Run a single-row, single-column query and return that value."""
        async with self._require_conn().execute(sql) as cursor:
            row = await cursor.fetchone()
        return row[0]

    async def init_db(self) -> None:
        """Initialize database tables.

        Safe to call more than once: an already open connection is reused
        instead of being leaked, and every statement is ``IF NOT EXISTS``.
        """
        if self.conn is None:
            self.conn = await aiosqlite.connect(self.db_path)
            self.conn.row_factory = aiosqlite.Row
        await self.conn.executescript("""
            CREATE TABLE IF NOT EXISTS repositories (
                id INTEGER PRIMARY KEY,
                name TEXT UNIQUE NOT NULL,
                full_name TEXT,
                description TEXT,
                stars INTEGER,
                language TEXT,
                topics TEXT,
                created_at TIMESTAMP,
                updated_at TIMESTAMP,
                pushed_at TIMESTAMP,
                url TEXT,
                open_issues INTEGER,
                forks INTEGER,
                contributors INTEGER,
                relevance_score REAL,
                first_seen_at TIMESTAMP,
                last_updated_at TIMESTAMP
            );
            CREATE TABLE IF NOT EXISTS snapshots (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                repo_id INTEGER,
                stars INTEGER,
                snapshot_at TIMESTAMP,
                FOREIGN KEY (repo_id) REFERENCES repositories(id)
            );
            CREATE INDEX IF NOT EXISTS idx_stars ON repositories(stars DESC);
            CREATE INDEX IF NOT EXISTS idx_updated ON repositories(last_updated_at DESC);
            CREATE INDEX IF NOT EXISTS idx_repo_id ON snapshots(repo_id);
            CREATE INDEX IF NOT EXISTS idx_language ON repositories(language);
            CREATE INDEX IF NOT EXISTS idx_created_at ON repositories(created_at DESC);
            CREATE INDEX IF NOT EXISTS idx_relevance ON repositories(relevance_score DESC);
            CREATE INDEX IF NOT EXISTS idx_snapshot_at ON snapshots(snapshot_at DESC);
        """)
        await self.conn.commit()

    async def save_repository(self, repo: Repository, relevance_score: float = 0.0) -> None:
        """Save or update a repository.

        A new row is inserted for an unknown ``repo.id``. For a known id the
        mutable fields are refreshed, while ``name``, ``url``, ``created_at``
        and ``first_seen_at`` keep their stored values.

        Args:
            repo: Repository to persist. Timestamps that are None are stored
                as NULL, mirroring what ``_row_to_repo`` reads back.
            relevance_score: Score stored alongside the repository.

        Raises:
            RuntimeError: If ``init_db()`` has not been awaited.
        """
        conn = self._require_conn()
        # Local, naive timestamp; used for both first_seen_at and
        # last_updated_at on insert.
        now = datetime.now().isoformat()
        await conn.execute("""
            INSERT INTO repositories (
                id, name, full_name, description, stars, language, topics,
                created_at, updated_at, pushed_at, url, open_issues, forks,
                contributors, relevance_score, first_seen_at, last_updated_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                full_name = excluded.full_name,
                description = excluded.description,
                stars = excluded.stars,
                language = excluded.language,
                topics = excluded.topics,
                updated_at = excluded.updated_at,
                pushed_at = excluded.pushed_at,
                open_issues = excluded.open_issues,
                forks = excluded.forks,
                contributors = excluded.contributors,
                relevance_score = excluded.relevance_score,
                last_updated_at = excluded.last_updated_at
        """, (
            repo.id, repo.name, repo.full_name, repo.description, repo.stars,
            repo.language, json.dumps(repo.topics), self._iso(repo.created_at),
            self._iso(repo.updated_at), self._iso(repo.pushed_at), repo.url,
            repo.open_issues, repo.forks, repo.contributors, relevance_score,
            now, now
        ))
        await conn.commit()

    async def get_repository_by_id(self, repo_id: int) -> Optional[Repository]:
        """Get a specific repository by ID.

        Returns:
            The matching repository, or None if the id is unknown.
        """
        async with self._require_conn().execute(
            "SELECT * FROM repositories WHERE id = ?", (repo_id,)
        ) as cursor:
            row = await cursor.fetchone()
        if row is None:
            return None
        return self._row_to_repo(row)

    async def get_all_repositories(self, limit: int = 100, sort_by: str = "stars") -> list[Repository]:
        """Get all repositories.

        Args:
            limit: Maximum number of repositories to return.
            sort_by: One of ``stars``, ``updated_at`` or ``relevance_score``;
                any other value silently falls back to ``stars``. Results are
                always in descending order.
        """
        # The column name is interpolated into the SQL text, so it must come
        # from the fixed whitelist and never straight from the caller.
        sort_field = sort_by if sort_by in self._SORT_FIELDS else "stars"
        async with self._require_conn().execute(f"""
            SELECT * FROM repositories
            ORDER BY {sort_field} DESC
            LIMIT ?
        """, (limit,)) as cursor:
            rows = await cursor.fetchall()
        return [self._row_to_repo(row) for row in rows]

    async def get_stats(self) -> dict:
        """Get database statistics.

        Returns:
            Dict with ``repository_count``, ``snapshot_count`` and
            ``total_stars`` (0 when there are no repositories).
        """
        repo_count = await self._fetch_value(
            "SELECT COUNT(*) as count FROM repositories"
        )
        snapshot_count = await self._fetch_value(
            "SELECT COUNT(*) as count FROM snapshots"
        )
        # SUM() over zero rows yields NULL, hence the fallback to 0.
        total_stars = await self._fetch_value(
            "SELECT SUM(stars) as total FROM repositories"
        ) or 0
        return {
            "repository_count": repo_count,
            "snapshot_count": snapshot_count,
            "total_stars": total_stars,
        }

    async def get_last_scrape_time(self) -> Optional[datetime]:
        """Get the timestamp of the most recent repository update.

        Returns:
            The latest ``last_updated_at`` value, or None for an empty table.
        """
        max_time = await self._fetch_value(
            "SELECT MAX(last_updated_at) as max_time FROM repositories"
        )
        if max_time is None:
            return None
        return datetime.fromisoformat(max_time)

    async def search_local(self, query: str, limit: int = 20) -> list[Repository]:
        """Search repositories locally.

        Matches ``query`` as a substring of the name or description using
        SQL ``LIKE``; ``%`` and ``_`` inside ``query`` therefore act as
        wildcards. Results are ordered by stars, descending.
        """
        pattern = f"%{query}%"
        async with self._require_conn().execute("""
            SELECT * FROM repositories
            WHERE name LIKE ? OR description LIKE ?
            ORDER BY stars DESC
            LIMIT ?
        """, (pattern, pattern, limit)) as cursor:
            rows = await cursor.fetchall()
        return [self._row_to_repo(row) for row in rows]

    async def get_trending(self, days: int = 7, limit: int = 10) -> list[TrendResult]:
        """Get trending repositories by star growth.

        For each repository the earliest snapshot taken within the last
        ``days`` days is the baseline; repositories whose current star count
        exceeds that baseline are returned, fastest relative growth first.

        NOTE(review): this class never writes snapshot rows itself. The
        window filter compares ``snapshot_at`` against SQLite's
        ``datetime('now', ...)`` output, so it presumably expects snapshots
        stored in a compatible text format -- confirm against the writer.

        Args:
            days: Size of the look-back window in days.
            limit: Maximum number of results.
        """
        # The correlated subquery pins the baseline to the earliest snapshot
        # in the window; a bare column under GROUP BY would come from an
        # arbitrary snapshot of the group.
        results = []
        async with self._require_conn().execute("""
            SELECT
                r.id as repo_id,
                r.name as repo_name,
                s1.stars as initial_stars,
                r.stars as current_stars
            FROM repositories r
            JOIN snapshots s1 ON s1.id = (
                SELECT s.id
                FROM snapshots s
                WHERE s.repo_id = r.id
                  AND s.snapshot_at >= datetime('now', ?)
                ORDER BY s.snapshot_at ASC, s.id ASC
                LIMIT 1
            )
            WHERE r.stars > s1.stars
            ORDER BY (CAST(r.stars AS REAL) / s1.stars - 1) DESC
            LIMIT ?
        """, (f'-{days} days', limit)) as cursor:
            async for row in cursor:
                initial = row["initial_stars"]
                current = row["current_stars"]
                # A zero baseline has no meaningful ratio; report 0.0.
                growth = (current - initial) / initial if initial > 0 else 0.0
                results.append(TrendResult(
                    repo_id=row["repo_id"],
                    repo_name=row["repo_name"],
                    initial_stars=initial,
                    current_stars=current,
                    growth_rate=growth,
                ))
        return results

    async def close(self) -> None:
        """Close database connection (no-op if it is not open)."""
        if self.conn:
            await self.conn.close()
            self.conn = None

    def _row_to_repo(self, row: aiosqlite.Row) -> Repository:
        """Convert database row to Repository object.

        ``topics`` is decoded from its JSON text (empty list when unset) and
        the three timestamp columns are parsed from ISO strings, with NULL
        mapped to None.
        """
        return Repository(
            id=row["id"],
            name=row["name"],
            full_name=row["full_name"],
            description=row["description"],
            stars=row["stars"],
            language=row["language"],
            topics=json.loads(row["topics"]) if row["topics"] else [],
            created_at=datetime.fromisoformat(row["created_at"]) if row["created_at"] else None,
            updated_at=datetime.fromisoformat(row["updated_at"]) if row["updated_at"] else None,
            pushed_at=datetime.fromisoformat(row["pushed_at"]) if row["pushed_at"] else None,
            url=row["url"],
            open_issues=row["open_issues"],
            forks=row["forks"],
            contributors=row["contributors"],
        )