PyPI - github-ai-scraper - Versions diffs - 0.1.2__py3-none-any.whl - Mend

github-ai-scraper 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

ai_scraper/__init__.py +3 -0
ai_scraper/api/__init__.py +6 -0
ai_scraper/api/github.py +340 -0
ai_scraper/api/gitlab.py +418 -0
ai_scraper/api/rate_limiter.py +120 -0
ai_scraper/api_server.py +196 -0
ai_scraper/auth.py +68 -0
ai_scraper/backup.py +112 -0
ai_scraper/cache.py +95 -0
ai_scraper/classifier.py +135 -0
ai_scraper/cli.py +747 -0
ai_scraper/config.py +237 -0
ai_scraper/config_watcher.py +82 -0
ai_scraper/dedup.py +148 -0
ai_scraper/filters/__init__.py +5 -0
ai_scraper/filters/ai_filter.py +93 -0
ai_scraper/health.py +155 -0
ai_scraper/i18n.py +141 -0
ai_scraper/interactive.py +96 -0
ai_scraper/keywords/__init__.py +5 -0
ai_scraper/keywords/extractor.py +274 -0
ai_scraper/logging_config.py +74 -0
ai_scraper/models/__init__.py +5 -0
ai_scraper/models/repository.py +72 -0
ai_scraper/output/__init__.py +6 -0
ai_scraper/output/excel.py +79 -0
ai_scraper/output/html.py +152 -0
ai_scraper/output/markdown.py +338 -0
ai_scraper/output/rss.py +82 -0
ai_scraper/output/translator.py +303 -0
ai_scraper/plugin_system.py +146 -0
ai_scraper/plugins/__init__.py +5 -0
ai_scraper/retry.py +134 -0
ai_scraper/scheduler.py +84 -0
ai_scraper/scrape_progress.py +99 -0
ai_scraper/secure_storage.py +127 -0
ai_scraper/storage/__init__.py +5 -0
ai_scraper/storage/async_database.py +237 -0
ai_scraper/storage/database.py +456 -0
ai_scraper/webhooks.py +95 -0
github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0

ai_scraper/config.py ADDED Viewed

@@ -0,0 +1,237 @@
+"""Configuration management."""
+import os
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+import yaml
+@dataclass
+class GitHubConfig:
+    """GitHub API configuration."""
+    token: Optional[str] = None
+    cache_ttl: int = 3600
+@dataclass
+class GitLabConfig:
+    """GitLab API configuration."""
+    token: Optional[str] = None
+    base_url: str = "https://gitlab.com/api/v4"
+    cache_ttl: int = 3600
+@dataclass
+class FilterConfigYaml:
+    """Filter configuration from YAML."""
+    min_stars: int = 100
+    keywords: list[str] = field(default_factory=lambda: [
+        "ai", "artificial intelligence", "machine learning", "deep learning",
+        "neural network", "llm", "gpt", "transformer", "nlp", "computer vision",
+        "reinforcement learning", "pytorch", "tensorflow", "huggingface"
+    ])
+    topics: list[str] = field(default_factory=lambda: [
+        "ai", "machine-learning", "deep-learning", "neural-network",
+        "natural-language-processing", "computer-vision", "llm", "gpt",
+        "pytorch", "tensorflow", "huggingface", "openai", "langchain"
+    ])
+    languages: list[str] = field(default_factory=list)
+    exclude_keywords: list[str] = field(default_factory=list)
+@dataclass
+class ScrapeConfigYaml:
+    """Scrape configuration from YAML."""
+    data_fields: list[str] = field(default_factory=lambda: [
+        "stars", "language", "topics", "contributors"
+    ])
+    max_results: int = 500
+    concurrency: int = 5
+    cache_ttl: int = 3600
+@dataclass
+class DatabaseConfig:
+    """Database configuration."""
+    path: str = "./data/ai_scraper.db"
+@dataclass
+class SchedulerConfig:
+    """Go scheduler configuration."""
+    enabled: bool = True
+    workers: int = 4
+@dataclass
+class KeywordsConfig:
+    """Keywords configuration."""
+    file: str = "./keywords.txt"
+    max_keywords: int = 100
+@dataclass
+class OutputConfig:
+    """Output configuration."""
+    dir: str = "./output"
+    filename: str = "repositories.md"
+@dataclass
+class WebhookEndpointConfig:
+    """Webhook endpoint configuration."""
+    url: str = ""
+    events: list[str] = field(default_factory=list)
+@dataclass
+class WebhooksConfig:
+    """Webhooks configuration."""
+    enabled: bool = False
+    endpoints: list[WebhookEndpointConfig] = field(default_factory=list)
+@dataclass
+class Config:
+    """Main configuration."""
+    github: GitHubConfig = field(default_factory=GitHubConfig)
+    gitlab: GitLabConfig = field(default_factory=GitLabConfig)
+    filter: FilterConfigYaml = field(default_factory=FilterConfigYaml)
+    scrape: ScrapeConfigYaml = field(default_factory=ScrapeConfigYaml)
+    database: DatabaseConfig = field(default_factory=DatabaseConfig)
+    scheduler: SchedulerConfig = field(default_factory=SchedulerConfig)
+    keywords: KeywordsConfig = field(default_factory=KeywordsConfig)
+    output: OutputConfig = field(default_factory=OutputConfig)
+    webhooks: WebhooksConfig = field(default_factory=WebhooksConfig)
+def _substitute_env_vars(value: str) -> str:
+    """Substitute environment variables in string value."""
+    pattern = r'\$\{([^}]+)\}'
+    def replace(match):
+        var_name = match.group(1)
+        return os.environ.get(var_name, "")
+    return re.sub(pattern, replace, value)
+def _process_config_values(config_dict: dict) -> dict:
+    """Recursively process config values for env var substitution."""
+    result = {}
+    for key, value in config_dict.items():
+        if isinstance(value, dict):
+            result[key] = _process_config_values(value)
+        elif isinstance(value, str):
+            result[key] = _substitute_env_vars(value)
+        elif isinstance(value, list):
+            result[key] = [
+                _substitute_env_vars(item) if isinstance(item, str) else item
+                for item in value
+            ]
+        else:
+            result[key] = value
+    return result
+def load_config(config_path: Optional[Path] = None) -> Config:
+    """Load configuration from YAML file."""
+    if config_path is None or not config_path.exists():
+        return Config()
+    with open(config_path, "r", encoding="utf-8") as f:
+        raw_config = yaml.safe_load(f) or {}
+    processed_config = _process_config_values(raw_config)
+    github = GitHubConfig(
+        token=processed_config.get("github", {}).get("token"),
+        cache_ttl=processed_config.get("github", {}).get("cache_ttl", 3600),
+    )
+    gitlab_dict = processed_config.get("gitlab", {})
+    gitlab_config = GitLabConfig(
+        token=gitlab_dict.get("token"),
+        base_url=gitlab_dict.get("base_url", "https://gitlab.com/api/v4"),
+        cache_ttl=gitlab_dict.get("cache_ttl", 3600),
+    )
+    filter_dict = processed_config.get("filter", {})
+    filter_config = FilterConfigYaml(
+        min_stars=filter_dict.get("min_stars", 100),
+        keywords=filter_dict.get("keywords", FilterConfigYaml().keywords),
+        topics=filter_dict.get("topics", FilterConfigYaml().topics),
+        languages=filter_dict.get("languages", []),
+        exclude_keywords=filter_dict.get("exclude_keywords", []),
+    )
+    scrape_dict = processed_config.get("scrape", {})
+    scrape_config = ScrapeConfigYaml(
+        data_fields=scrape_dict.get("data_fields", ScrapeConfigYaml().data_fields),
+        max_results=scrape_dict.get("max_results", 500),
+        concurrency=scrape_dict.get("concurrency", 5),
+        cache_ttl=scrape_dict.get("cache_ttl", 3600),
+    )
+    database_dict = processed_config.get("database", {})
+    database_config = DatabaseConfig(
+        path=database_dict.get("path", "./data/ai_scraper.db"),
+    )
+    scheduler_dict = processed_config.get("scheduler", {})
+    scheduler_config = SchedulerConfig(
+        enabled=scheduler_dict.get("enabled", True),
+        workers=scheduler_dict.get("workers", 4),
+    )
+    keywords_dict = processed_config.get("keywords", {})
+    keywords_config = KeywordsConfig(
+        file=keywords_dict.get("file", "./keywords.txt"),
+        max_keywords=keywords_dict.get("max_keywords", 100),
+    )
+    output_dict = processed_config.get("output", {})
+    output_config = OutputConfig(
+        dir=output_dict.get("dir", "./output"),
+        filename=output_dict.get("filename", "repositories.md"),
+    )
+    webhooks_dict = processed_config.get("webhooks", {})
+    endpoints_list = webhooks_dict.get("endpoints", [])
+    endpoints = [
+        WebhookEndpointConfig(
+            url=endpoint.get("url", ""),
+            events=endpoint.get("events", []),
+        )
+        for endpoint in endpoints_list
+    ]
+    webhooks_config = WebhooksConfig(
+        enabled=webhooks_dict.get("enabled", False),
+        endpoints=endpoints,
+    )
+    return Config(
+        github=github,
+        gitlab=gitlab_config,
+        filter=filter_config,
+        scrape=scrape_config,
+        database=database_config,
+        scheduler=scheduler_config,
+        keywords=keywords_config,
+        output=output_config,
+        webhooks=webhooks_config,
+    )

ai_scraper/config_watcher.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""Configuration file watcher for hot reload."""
+import logging
+import threading
+import time
+from pathlib import Path
+from typing import Callable, Optional
+logger = logging.getLogger(__name__)
+class ConfigWatcher:
+    """Watch configuration file for changes."""
+    def __init__(
+        self,
+        config_path: Path,
+        on_change: Callable[[Path], None],
+        poll_interval: float = 1.0,
+    ):
+        """Initialize config watcher.
+        Args:
+            config_path: Path to configuration file.
+            on_change: Callback when file changes.
+            poll_interval: Polling interval in seconds.
+        """
+        self.config_path = Path(config_path)
+        self.on_change = on_change
+        self.poll_interval = poll_interval
+        self._running = False
+        self._thread: Optional[threading.Thread] = None
+        self._last_mtime: Optional[float] = None
+    def start(self) -> None:
+        """Start watching for changes."""
+        if self._running:
+            return
+        self._running = True
+        self._last_mtime = self._get_mtime()
+        self._thread = threading.Thread(target=self._watch_loop, daemon=True)
+        self._thread.start()
+        logger.info(f"Started watching {self.config_path}")
+    def stop(self) -> None:
+        """Stop watching for changes."""
+        self._running = False
+        if self._thread:
+            self._thread.join(timeout=2)
+            self._thread = None
+        logger.info("Stopped config watcher")
+    def _get_mtime(self) -> Optional[float]:
+        """Get file modification time."""
+        try:
+            return self.config_path.stat().st_mtime
+        except FileNotFoundError:
+            return None
+    def _watch_loop(self) -> None:
+        """Main watch loop."""
+        while self._running:
+            try:
+                current_mtime = self._get_mtime()
+                if current_mtime is not None and current_mtime != self._last_mtime:
+                    logger.info(f"Config file changed: {self.config_path}")
+                    self._last_mtime = current_mtime
+                    try:
+                        self.on_change(self.config_path)
+                    except Exception as e:
+                        logger.error(f"Error in on_change callback: {e}")
+                time.sleep(self.poll_interval)
+            except Exception as e:
+                logger.error(f"Error in watch loop: {e}")
+                time.sleep(self.poll_interval)

ai_scraper/dedup.py ADDED Viewed

@@ -0,0 +1,148 @@
+"""Repository deduplication utilities."""
+from dataclasses import dataclass
+from typing import Optional
+from ai_scraper.models.repository import Repository
+@dataclass
+class DuplicationInfo:
+    """Information about repository duplication."""
+    is_fork: bool
+    is_mirror: bool
+    is_similar: bool
+    original_repo: Optional[str]
+    duplicate_type: str  # "fork", "mirror", "similar", "none"
+    similarity_score: float = 0.0
+class DeduplicationChecker:
+    """Check for repository duplicates."""
+    # Common mirror patterns
+    MIRROR_PATTERNS = [
+        "-mirror",
+        "-mirror.git",
+        ".mirror",
+        "mirror-",
+    ]
+    def check(self, repo: Repository, is_fork: bool = False) -> DuplicationInfo:
+        """Check if repository is a duplicate.
+        Args:
+            repo: Repository to check.
+            is_fork: Whether the repo is a fork (from API data).
+        Returns:
+            Duplication information.
+        """
+        # Check mirror patterns in name
+        name_lower = repo.name.lower()
+        is_mirror = any(pattern in name_lower for pattern in self.MIRROR_PATTERNS)
+        # Extract original repo name if mirror
+        original = None
+        duplicate_type = "none"
+        if is_fork:
+            duplicate_type = "fork"
+            original = self._extract_original_from_fork(repo.name)
+        elif is_mirror:
+            duplicate_type = "mirror"
+            original = self._extract_original_name(repo.name)
+        return DuplicationInfo(
+            is_fork=is_fork,
+            is_mirror=is_mirror,
+            is_similar=False,
+            original_repo=original,
+            duplicate_type=duplicate_type,
+        )
+    def _extract_original_name(self, mirror_name: str) -> str:
+        """Extract original repository name from mirror name."""
+        name = mirror_name
+        for pattern in self.MIRROR_PATTERNS:
+            name = name.replace(pattern, "")
+        return name.strip("-_")
+    def _extract_original_from_fork(self, fork_name: str) -> str:
+        """Extract original repo name from fork."""
+        # Fork name is usually user/original-repo
+        # We'd need API data to know the actual original
+        return fork_name
+    def find_similar_content(
+        self,
+        repos: list[Repository],
+        threshold: float = 0.8,
+    ) -> list[tuple[Repository, Repository, float]]:
+        """Find repositories with similar content.
+        Args:
+            repos: List of repositories.
+            threshold: Similarity threshold (0-1).
+        Returns:
+            List of (repo1, repo2, similarity) tuples.
+        """
+        similar_pairs = []
+        for i, repo1 in enumerate(repos):
+            for repo2 in repos[i + 1:]:
+                similarity = self._calculate_similarity(repo1, repo2)
+                if similarity >= threshold:
+                    similar_pairs.append((repo1, repo2, similarity))
+        return similar_pairs
+    def _calculate_similarity(self, repo1: Repository, repo2: Repository) -> float:
+        """Calculate similarity between two repositories."""
+        # Compare descriptions
+        desc1 = (repo1.description or "").lower()
+        desc2 = (repo2.description or "").lower()
+        # Simple Jaccard similarity on words
+        words1 = set(desc1.split())
+        words2 = set(desc2.split())
+        if not words1 or not words2:
+            return 0.0
+        intersection = words1 & words2
+        union = words1 | words2
+        return len(intersection) / len(union)
+    def find_duplicates(self, repos: list[Repository]) -> dict[str, list[Repository]]:
+        """Find groups of duplicate repositories.
+        Args:
+            repos: List of repositories.
+        Returns:
+            Dictionary mapping normalized names to duplicate groups.
+        """
+        groups: dict[str, list[Repository]] = {}
+        for repo in repos:
+            normalized = self._normalize_name(repo.name)
+            if normalized not in groups:
+                groups[normalized] = []
+            groups[normalized].append(repo)
+        # Return only groups with duplicates
+        return {k: v for k, v in groups.items() if len(v) > 1}
+    def _normalize_name(self, name: str) -> str:
+        """Normalize repository name for comparison."""
+        name = name.lower()
+        # Remove common suffixes
+        for suffix in ["-mirror", "-mirror.git", ".mirror", "-fork"]:
+            name = name.replace(suffix, "")
+        # Remove organization prefix
+        if "/" in name:
+            name = name.split("/")[-1]
+        return name.strip("-_")

ai_scraper/filters/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Filters for ai_scraper."""
+from ai_scraper.filters.ai_filter import AIFilter
+__all__ = ["AIFilter"]

ai_scraper/filters/ai_filter.py ADDED Viewed

@@ -0,0 +1,93 @@
+"""AI-related content filter."""
+from ai_scraper.models.repository import Repository, FilterConfig
+from ai_scraper.classifier import RepositoryClassifier, Classification
+class AIFilter:
+    """Filter for detecting AI-related repositories."""
+    def __init__(self):
+        """Initialize the AI filter with a classifier."""
+        self._classifier = RepositoryClassifier()
+    def is_ai_related(self, repo: Repository, config: FilterConfig) -> bool:
+        """Check if repository is AI-related.
+        Args:
+            repo: Repository to check.
+            config: Filter configuration.
+        Returns:
+            True if repository is AI-related.
+        """
+        # Check exclude keywords first
+        text_to_check = f"{repo.name} {repo.description or ''}".lower()
+        for exclude in config.exclude_keywords:
+            # Normalize: replace hyphens with spaces for matching
+            exclude_normalized = exclude.lower().replace("-", " ")
+            if exclude_normalized in text_to_check or exclude.lower() in text_to_check:
+                return False
+        # Check topics
+        repo_topics_lower = [t.lower() for t in repo.topics]
+        for topic in config.topics:
+            if topic.lower() in repo_topics_lower:
+                return True
+        # Check keywords in name and description
+        for keyword in config.keywords:
+            # Normalize: replace hyphens with spaces for matching
+            keyword_normalized = keyword.lower().replace("-", " ")
+            if keyword_normalized in text_to_check or keyword.lower() in text_to_check:
+                return True
+        return False
+    def score_relevance(self, repo: Repository) -> float:
+        """Calculate AI relevance score for a repository.
+        Args:
+            repo: Repository to score.
+        Returns:
+            Relevance score between 0.0 and 1.0.
+        """
+        score = 0.0
+        text_to_check = f"{repo.name} {repo.description or ''}".lower()
+        # Default AI indicators
+        ai_keywords = [
+            "ai", "artificial intelligence", "machine learning", "deep learning",
+            "neural network", "llm", "gpt", "transformer", "nlp", "computer vision",
+            "pytorch", "tensorflow", "huggingface", "openai", "langchain"
+        ]
+        ai_topics = [
+            "ai", "machine-learning", "deep-learning", "neural-network",
+            "natural-language-processing", "computer-vision", "llm", "gpt",
+            "pytorch", "tensorflow", "huggingface", "openai", "langchain"
+        ]
+        # Count keyword matches
+        keyword_matches = sum(1 for kw in ai_keywords if kw in text_to_check)
+        score += min(keyword_matches * 0.2, 0.6)
+        # Count topic matches
+        repo_topics_lower = [t.lower() for t in repo.topics]
+        topic_matches = sum(1 for topic in ai_topics if topic in repo_topics_lower)
+        score += min(topic_matches * 0.15, 0.4)
+        return min(score, 1.0)
+    def classify(self, repo: Repository) -> Classification:
+        """Classify a repository into an AI category.
+        Args:
+            repo: Repository to classify.
+        Returns:
+            Classification result with primary category, secondary categories,
+            confidence, tech stack, and maturity assessment.
+        """
+        return self._classifier.classify(repo)