github-ai-scraper 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. ai_scraper/__init__.py +3 -0
  2. ai_scraper/api/__init__.py +6 -0
  3. ai_scraper/api/github.py +340 -0
  4. ai_scraper/api/gitlab.py +418 -0
  5. ai_scraper/api/rate_limiter.py +120 -0
  6. ai_scraper/api_server.py +196 -0
  7. ai_scraper/auth.py +68 -0
  8. ai_scraper/backup.py +112 -0
  9. ai_scraper/cache.py +95 -0
  10. ai_scraper/classifier.py +135 -0
  11. ai_scraper/cli.py +747 -0
  12. ai_scraper/config.py +237 -0
  13. ai_scraper/config_watcher.py +82 -0
  14. ai_scraper/dedup.py +148 -0
  15. ai_scraper/filters/__init__.py +5 -0
  16. ai_scraper/filters/ai_filter.py +93 -0
  17. ai_scraper/health.py +155 -0
  18. ai_scraper/i18n.py +141 -0
  19. ai_scraper/interactive.py +96 -0
  20. ai_scraper/keywords/__init__.py +5 -0
  21. ai_scraper/keywords/extractor.py +274 -0
  22. ai_scraper/logging_config.py +74 -0
  23. ai_scraper/models/__init__.py +5 -0
  24. ai_scraper/models/repository.py +72 -0
  25. ai_scraper/output/__init__.py +6 -0
  26. ai_scraper/output/excel.py +79 -0
  27. ai_scraper/output/html.py +152 -0
  28. ai_scraper/output/markdown.py +338 -0
  29. ai_scraper/output/rss.py +82 -0
  30. ai_scraper/output/translator.py +303 -0
  31. ai_scraper/plugin_system.py +146 -0
  32. ai_scraper/plugins/__init__.py +5 -0
  33. ai_scraper/retry.py +134 -0
  34. ai_scraper/scheduler.py +84 -0
  35. ai_scraper/scrape_progress.py +99 -0
  36. ai_scraper/secure_storage.py +127 -0
  37. ai_scraper/storage/__init__.py +5 -0
  38. ai_scraper/storage/async_database.py +237 -0
  39. ai_scraper/storage/database.py +456 -0
  40. ai_scraper/webhooks.py +95 -0
  41. github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
  42. github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
  43. github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
  44. github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,74 @@
1
+ """Logging configuration for ai-scraper."""
2
+
3
+ import json
4
+ import logging
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ from rich.console import Console
10
+ from rich.logging import RichHandler
11
+
12
+ # Shared console for consistent output
13
+ console = Console()
14
+
15
+
16
def setup_logging(
    level: str = "INFO",
    log_file: Optional[Path] = None,
    json_format: bool = False,
) -> None:
    """Configure root logging with a Rich console handler and optional file output.

    Args:
        level: Log level name (DEBUG, INFO, WARNING, ERROR). Matched
            case-insensitively, so "debug" and "DEBUG" are equivalent.
        log_file: Optional file path for logs. Missing parent directories
            are created before the file is opened.
        json_format: Write the log file as one JSON object per line instead
            of the plain text format. Only affects the file handler.

    Raises:
        ValueError: If ``level`` is not a known logging level name.
    """
    handlers: list[logging.Handler] = [
        RichHandler(rich_tracebacks=True, console=console)
    ]

    if log_file:
        log_file.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8 so non-ASCII log messages do not depend on the
        # platform's locale encoding.
        file_handler = logging.FileHandler(log_file, encoding="utf-8")

        if json_format:

            class JSONFormatter(logging.Formatter):
                """Render each record as a single-line JSON object."""

                def format(self, record: logging.LogRecord) -> str:
                    log_entry = {
                        "timestamp": self.formatTime(record),
                        "level": record.levelname,
                        "message": record.getMessage(),
                        "module": record.module,
                        "function": record.funcName,
                    }
                    # Keep tracebacks from logger.exception() calls instead
                    # of silently dropping them.
                    if record.exc_info:
                        log_entry["exception"] = self.formatException(
                            record.exc_info
                        )
                    return json.dumps(log_entry)

            file_handler.setFormatter(JSONFormatter())
        else:
            file_handler.setFormatter(
                logging.Formatter(
                    "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
                )
            )
        handlers.append(file_handler)

    # logging resolves level names itself and raises ValueError for unknown
    # ones; upper-casing first makes names such as "info" work too.
    resolved_level = level.upper() if isinstance(level, str) else level

    # NOTE: logging.basicConfig does nothing when the root logger already has
    # handlers, so only the first call in a process takes effect.
    logging.basicConfig(
        level=resolved_level,
        format="%(message)s",
        handlers=handlers,
    )
63
+
64
+
65
def get_logger(name: str) -> logging.Logger:
    """Return the standard-library logger registered under ``name``.

    Args:
        name: Logger name, typically the calling module's ``__name__``.

    Returns:
        The ``logging.Logger`` instance for that name.
    """
    named_logger = logging.getLogger(name)
    return named_logger
@@ -0,0 +1,5 @@
1
+ """Data models for ai_scraper."""
2
+
3
+ from ai_scraper.models.repository import Repository, RepoSnapshot, FilterConfig, ScrapeConfig
4
+
5
+ __all__ = ["Repository", "RepoSnapshot", "FilterConfig", "ScrapeConfig"]
@@ -0,0 +1,72 @@
1
+ """Repository data models."""
2
+
3
+ from datetime import datetime
4
+ from typing import Optional
5
+
6
+ from pydantic import BaseModel, Field, field_validator
7
+
8
+
9
class Repository(BaseModel):
    """Metadata for a single repository hosted on GitHub or GitLab."""

    # Identity
    id: int = Field(gt=0, description="Repository ID")
    name: str = Field(min_length=1, max_length=200)
    full_name: str = Field(min_length=1, max_length=200)
    description: Optional[str] = Field(None, max_length=1000)

    # Popularity and classification
    stars: int = Field(ge=0, description="Star count")
    language: Optional[str] = Field(None, max_length=50)
    topics: list[str] = Field(default_factory=list)

    # Timestamps
    created_at: datetime
    updated_at: datetime
    pushed_at: datetime

    # Must be an http(s) URL of the form scheme://host/owner/repo...
    url: str = Field(pattern=r"^https?://[\w\.-]+/[\w\-\.]+/[\w\-\.]+")

    # Optional counters; None when the value was not collected.
    open_issues: Optional[int] = Field(None, ge=0)
    forks: Optional[int] = Field(None, ge=0)
    contributors: Optional[int] = Field(None, ge=0)

    @field_validator("name", "full_name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Strip surrounding whitespace and reject whitespace-only names."""
        stripped = v.strip()
        if stripped:
            return stripped
        raise ValueError("Name cannot be empty")

    @field_validator("topics")
    @classmethod
    def validate_topics(cls, v: list[str]) -> list[str]:
        """Trim and lowercase every topic, discarding blank entries."""
        cleaned: list[str] = []
        for topic in v:
            trimmed = topic.strip()
            if trimmed:
                cleaned.append(trimmed.lower())
        return cleaned
40
+
41
+
42
class RepoSnapshot(BaseModel):
    """Point-in-time star count for a repository, used for trend analysis."""

    # Positive ID of the repository this snapshot belongs to.
    repo_id: int = Field(gt=0)
    # Star count recorded at snapshot time (never negative).
    stars: int = Field(ge=0)
    # When the snapshot was taken.
    snapshot_at: datetime
48
+
49
+
50
class FilterConfig(BaseModel):
    """Criteria used to filter repositories while scraping."""

    # All list fields default to empty lists.
    keywords: list[str] = Field(default_factory=list)
    topics: list[str] = Field(default_factory=list)
    languages: list[str] = Field(default_factory=list)
    exclude_keywords: list[str] = Field(default_factory=list)
    # Minimum star threshold; defaults to 100 and may not be negative.
    min_stars: int = Field(default=100, ge=0)

    @field_validator("keywords", "topics", "languages", "exclude_keywords")
    @classmethod
    def clean_string_list(cls, v: list[str]) -> list[str]:
        """Trim whitespace from each entry and drop entries that end up empty."""
        trimmed_items = (item.strip() for item in v)
        return [item for item in trimmed_items if item]
64
+
65
+
66
class ScrapeConfig(BaseModel):
    """Settings that control a scrape run."""

    # Names of data fields to collect; empty by default.
    data_fields: list[str] = Field(default_factory=list)
    # Upper bound on results; must be positive.
    max_results: int = Field(default=100, gt=0)
    # Allowed range is 1..100 inclusive.
    concurrency: int = Field(default=5, gt=0, le=100)
    # Cache time-to-live; non-negative, defaults to 3600.
    cache_ttl: int = Field(default=3600, ge=0)
@@ -0,0 +1,6 @@
1
+ """Output module for generating reports."""
2
+
3
+ from ai_scraper.output.html import HTMLExporter
4
+ from ai_scraper.output.markdown import MarkdownExporter
5
+
6
+ __all__ = ["HTMLExporter", "MarkdownExporter"]
@@ -0,0 +1,79 @@
1
+ """Excel exporter for generating spreadsheet reports."""
2
+
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from ai_scraper.models import Repository
8
+
9
+
10
class ExcelExporter:
    """Export repositories to an Excel (.xlsx) workbook via openpyxl."""

    # Excel forbids these characters in worksheet names; openpyxl raises
    # ValueError when a title contains any of them, so map each to "_".
    _INVALID_TITLE_CHARS = str.maketrans({ch: "_" for ch in "\\/*?:[]"})
    # Excel sheet name limit.
    _MAX_TITLE_LENGTH = 31
    # Used when the requested title is empty.
    _DEFAULT_TITLE = "Sheet"

    _HEADERS = ("Name", "Stars", "Language", "Topics", "Description", "URL", "Updated")
    _COLUMN_WIDTHS = {"A": 30, "B": 10, "C": 12, "D": 25, "E": 50, "F": 40, "G": 12}

    def __init__(self, output_dir: Path, filename: str = "repositories.xlsx"):
        """Initialize the exporter.

        Args:
            output_dir: Directory for output files.
            filename: Name of the output file.
        """
        self.output_dir = Path(output_dir)
        self.filename = filename

    @classmethod
    def _sanitize_sheet_title(cls, title: str) -> str:
        """Return ``title`` made safe for use as a worksheet name.

        Forbidden characters are replaced with "_", the result is cut to the
        31-character limit, and an empty result falls back to a default name.
        """
        cleaned = title.translate(cls._INVALID_TITLE_CHARS)[: cls._MAX_TITLE_LENGTH]
        return cleaned or cls._DEFAULT_TITLE

    @staticmethod
    def _write_text(ws, row: int, column: int, text: str) -> None:
        """Write ``text`` into a cell, forcing it to be stored as plain text.

        openpyxl treats a string starting with "=" as a formula. The text here
        is scraped from repository metadata, so it must never be evaluated.
        """
        cell = ws.cell(row=row, column=column, value=text)
        if cell.data_type == "f":
            cell.data_type = "s"

    def export_repositories(self, repos: list[Repository], title: str = "AI Repositories") -> Path:
        """Export repositories to an Excel file.

        Args:
            repos: List of repositories to export.
            title: Sheet title. Characters Excel does not allow are replaced
                and the title is truncated to 31 characters.

        Returns:
            Path to the created file.

        Raises:
            ImportError: If openpyxl is not installed.
        """
        try:
            import openpyxl
            from openpyxl.styles import Font, PatternFill, Alignment
        except ImportError as err:
            raise ImportError(
                "openpyxl is required for Excel export. Install with: pip install openpyxl"
            ) from err

        self.output_dir.mkdir(parents=True, exist_ok=True)

        wb = openpyxl.Workbook()
        ws = wb.active
        ws.title = self._sanitize_sheet_title(title)

        # Header style
        header_font = Font(bold=True, color="FFFFFF")
        header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")

        # Header row
        for col, header in enumerate(self._HEADERS, 1):
            cell = ws.cell(row=1, column=col, value=header)
            cell.font = header_font
            cell.fill = header_fill
            cell.alignment = Alignment(horizontal="center")

        # Data rows start directly below the header.
        for row, repo in enumerate(repos, 2):
            self._write_text(ws, row, 1, repo.full_name)
            ws.cell(row=row, column=2, value=repo.stars)
            self._write_text(ws, row, 3, repo.language or "-")
            # Only the first five topics are listed to keep the cell short.
            self._write_text(ws, row, 4, ", ".join(repo.topics[:5]))
            self._write_text(ws, row, 5, repo.description or "")
            self._write_text(ws, row, 6, repo.url)
            updated = repo.updated_at.strftime("%Y-%m-%d") if repo.updated_at else ""
            ws.cell(row=row, column=7, value=updated)

        # Adjust column widths
        for column_letter, width in self._COLUMN_WIDTHS.items():
            ws.column_dimensions[column_letter].width = width

        output_path = self.output_dir / self.filename
        wb.save(output_path)

        return output_path
@@ -0,0 +1,152 @@
1
+ """HTML exporter for generating web reports."""
2
+
3
+ from datetime import datetime
4
+ from html import escape as html_escape
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from ai_scraper.models import Repository
9
+
10
+
11
class HTMLExporter:
    """Export repositories to a single self-contained HTML page."""

    # Maximum number of description characters shown per row (before escaping).
    _MAX_DESCRIPTION_LENGTH = 100

    def __init__(self, output_dir: Path, filename: str = "index.html"):
        """Initialize the exporter.

        Args:
            output_dir: Directory for output files.
            filename: Name of the output file.
        """
        self.output_dir = Path(output_dir)
        self.filename = filename

    def export_repositories(self, repos: list[Repository], title: str = "AI Repositories") -> Path:
        """Export repositories to an HTML file.

        Args:
            repos: List of repositories to export.
            title: Page title.

        Returns:
            Path to the created file.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        content = self._generate_html(repos, title)

        output_path = self.output_dir / self.filename
        output_path.write_text(content, encoding="utf-8")

        return output_path

    def _generate_html(self, repos: list[Repository], title: str) -> str:
        """Generate the full HTML document for ``repos``.

        The title is HTML-escaped before being inserted into the page.
        """
        # Escape title to prevent XSS
        safe_title = html_escape(title)
        generated_at = datetime.now().strftime("%Y-%m-%d %H:%M")
        repo_count = len(repos)
        total_stars = sum(r.stars for r in repos)
        table_rows = self._generate_rows(repos)

        return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{safe_title}</title>
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; background: #f6f8fa; }}
.container {{ max-width: 1200px; margin: 0 auto; padding: 20px; }}
h1 {{ color: #24292f; margin-bottom: 10px; }}
.meta {{ color: #57606a; margin-bottom: 20px; }}
.stats {{ display: flex; gap: 20px; margin-bottom: 30px; flex-wrap: wrap; }}
.stat-box {{ background: white; padding: 15px 20px; border-radius: 8px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }}
.stat-number {{ font-size: 24px; font-weight: bold; color: #0969da; }}
.stat-label {{ color: #57606a; font-size: 14px; }}
table {{ width: 100%; background: white; border-radius: 8px; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }}
th {{ background: #f6f8fa; padding: 12px 16px; text-align: left; font-weight: 600; color: #24292f; }}
td {{ padding: 12px 16px; border-top: 1px solid #d0d7de; }}
tr:hover {{ background: #f6f8fa; }}
a {{ color: #0969da; text-decoration: none; }}
a:hover {{ text-decoration: underline; }}
.stars {{ color: #e3b341; }}
.language {{ background: #ddf4ff; color: #0969da; padding: 2px 8px; border-radius: 12px; font-size: 12px; }}
.description {{ color: #57606a; }}
@media (max-width: 768px) {{
.container {{ padding: 10px; }}
table {{ font-size: 14px; }}
th, td {{ padding: 8px; }}
}}
</style>
</head>
<body>
<div class="container">
<h1>{safe_title}</h1>
<p class="meta">Updated: {generated_at} | Total: {repo_count} repositories</p>

<div class="stats">
<div class="stat-box">
<div class="stat-number">{repo_count}</div>
<div class="stat-label">Repositories</div>
</div>
<div class="stat-box">
<div class="stat-number">{total_stars:,}</div>
<div class="stat-label">Total Stars</div>
</div>
</div>

<table>
<thead>
<tr>
<th>Name</th>
<th>Stars</th>
<th>Language</th>
<th>Description</th>
</tr>
</thead>
<tbody>
{table_rows}
</tbody>
</table>
</div>
</body>
</html>"""

    def _generate_rows(self, repos: list[Repository]) -> str:
        """Generate one ``<tr>`` per repository, joined with newlines."""
        rows = []
        for repo in repos:
            # Escape all user-controlled data to prevent XSS
            safe_url = self._safe_url(repo.url)
            safe_name = html_escape(repo.full_name, quote=True)
            safe_language = html_escape(repo.language or "-", quote=True)
            safe_description = self._clean_description(repo.description)

            rows.append(f""" <tr>
<td><a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_name}</a></td>
<td><span class="stars">★ {repo.stars:,}</span></td>
<td><span class="language">{safe_language}</span></td>
<td class="description">{safe_description}</td>
</tr>""")
        return "\n".join(rows)

    def _safe_url(self, url: str) -> str:
        """Validate and escape URL to prevent XSS.

        Only allows http:// and https:// URLs (lowercase scheme).
        Returns '#' for invalid URLs.
        """
        safe_url = html_escape(url, quote=True)
        if safe_url.startswith(('http://', 'https://')):
            return safe_url
        return '#'

    def _clean_description(self, description: Optional[str]) -> str:
        """Clean description for HTML.

        Truncates the raw text to 100 characters (97 plus an ellipsis) and
        then escapes HTML characters. Truncating before escaping guarantees
        that an entity such as ``&amp;`` is never cut in half; as a result the
        escaped string may be longer than 100 characters.

        Returns an empty string for ``None`` or empty descriptions.
        """
        if not description:
            return ""
        limit = self._MAX_DESCRIPTION_LENGTH
        if len(description) > limit:
            description = description[: limit - 3] + "..."
        return html_escape(description, quote=True)