github-ai-scraper 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_scraper/__init__.py +3 -0
- ai_scraper/api/__init__.py +6 -0
- ai_scraper/api/github.py +340 -0
- ai_scraper/api/gitlab.py +418 -0
- ai_scraper/api/rate_limiter.py +120 -0
- ai_scraper/api_server.py +196 -0
- ai_scraper/auth.py +68 -0
- ai_scraper/backup.py +112 -0
- ai_scraper/cache.py +95 -0
- ai_scraper/classifier.py +135 -0
- ai_scraper/cli.py +747 -0
- ai_scraper/config.py +237 -0
- ai_scraper/config_watcher.py +82 -0
- ai_scraper/dedup.py +148 -0
- ai_scraper/filters/__init__.py +5 -0
- ai_scraper/filters/ai_filter.py +93 -0
- ai_scraper/health.py +155 -0
- ai_scraper/i18n.py +141 -0
- ai_scraper/interactive.py +96 -0
- ai_scraper/keywords/__init__.py +5 -0
- ai_scraper/keywords/extractor.py +274 -0
- ai_scraper/logging_config.py +74 -0
- ai_scraper/models/__init__.py +5 -0
- ai_scraper/models/repository.py +72 -0
- ai_scraper/output/__init__.py +6 -0
- ai_scraper/output/excel.py +79 -0
- ai_scraper/output/html.py +152 -0
- ai_scraper/output/markdown.py +338 -0
- ai_scraper/output/rss.py +82 -0
- ai_scraper/output/translator.py +303 -0
- ai_scraper/plugin_system.py +146 -0
- ai_scraper/plugins/__init__.py +5 -0
- ai_scraper/retry.py +134 -0
- ai_scraper/scheduler.py +84 -0
- ai_scraper/scrape_progress.py +99 -0
- ai_scraper/secure_storage.py +127 -0
- ai_scraper/storage/__init__.py +5 -0
- ai_scraper/storage/async_database.py +237 -0
- ai_scraper/storage/database.py +456 -0
- ai_scraper/webhooks.py +95 -0
- github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
- github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
- github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
- github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Logging configuration for ai-scraper."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.logging import RichHandler
|
|
11
|
+
|
|
12
|
+
# Shared Rich console for consistent output. setup_logging() hands this same
# object to RichHandler, so log records are rendered through it.
console = Console()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def setup_logging(
    level: str = "INFO",
    log_file: Optional[Path] = None,
    json_format: bool = False,
) -> None:
    """Configure root logging with a Rich console handler and optional file output.

    Args:
        level: Log level name (DEBUG, INFO, WARNING, ERROR, CRITICAL).
            Names are matched case-insensitively; a numeric level is also
            accepted.
        log_file: Optional file path for logs. Missing parent directories
            are created and the file is written as UTF-8.
        json_format: Write file records as one JSON object per line instead
            of the plain-text format. Only the file handler is affected.

    Raises:
        ValueError: If ``level`` is not a known logging level.
    """
    # Resolve the level before touching the filesystem: an invalid name must
    # fail here rather than inside basicConfig(), after the log file has
    # already been opened.
    if isinstance(level, str):
        resolved_level = logging.getLevelName(level.strip().upper())
    else:
        resolved_level = level
    if not isinstance(resolved_level, int):
        # getLevelName() answers "Level <name>" for names it does not know.
        raise ValueError(f"Unknown log level: {level!r}")

    handlers: list[logging.Handler] = [
        RichHandler(rich_tracebacks=True, console=console)
    ]

    if log_file:
        log_file.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding so non-ASCII log messages do not depend on the
        # platform's default locale.
        file_handler = logging.FileHandler(log_file, encoding="utf-8")

        if json_format:

            class JSONFormatter(logging.Formatter):
                """Render each record as a single-line JSON object."""

                def format(self, record: logging.LogRecord) -> str:
                    log_entry = {
                        "timestamp": self.formatTime(record),
                        "level": record.levelname,
                        "message": record.getMessage(),
                        "module": record.module,
                        "function": record.funcName,
                    }
                    # Keep the traceback; without it logger.exception()
                    # records lose the only useful part.
                    if record.exc_info:
                        log_entry["exception"] = self.formatException(record.exc_info)
                    return json.dumps(log_entry)

            file_handler.setFormatter(JSONFormatter())
        else:
            file_handler.setFormatter(
                logging.Formatter(
                    "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
                )
            )
        handlers.append(file_handler)

    # NOTE: basicConfig() does nothing when the root logger already has
    # handlers, so only the first call in a process takes effect.
    logging.basicConfig(
        level=resolved_level,
        format="%(message)s",
        handlers=handlers,
    )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_logger(name: str) -> logging.Logger:
    """Return the standard-library logger registered under ``name``.

    Args:
        name: Logger name.

    Returns:
        The ``logging.Logger`` instance for that name.
    """
    logger = logging.getLogger(name)
    return logger
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Repository data models."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field, field_validator
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Repository(BaseModel):
    """Validated record describing one repository (GitHub or GitLab).

    Name fields are stored stripped of surrounding whitespace and topics are
    stored stripped and lowercased, with blank entries dropped.
    """

    id: int = Field(gt=0, description="Repository ID")
    name: str = Field(min_length=1, max_length=200)
    full_name: str = Field(min_length=1, max_length=200)
    description: Optional[str] = Field(None, max_length=1000)
    stars: int = Field(ge=0, description="Star count")
    language: Optional[str] = Field(None, max_length=50)
    topics: list[str] = Field(default_factory=list)
    created_at: datetime
    updated_at: datetime
    pushed_at: datetime
    url: str = Field(pattern=r"^https?://[\w\.-]+/[\w\-\.]+/[\w\-\.]+")
    open_issues: Optional[int] = Field(None, ge=0)
    forks: Optional[int] = Field(None, ge=0)
    contributors: Optional[int] = Field(None, ge=0)

    @field_validator("name", "full_name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Strip surrounding whitespace and reject names that end up empty."""
        stripped = v.strip()
        if stripped:
            return stripped
        raise ValueError("Name cannot be empty")

    @field_validator("topics")
    @classmethod
    def validate_topics(cls, v: list[str]) -> list[str]:
        """Strip and lowercase every topic, discarding blank entries."""
        cleaned: list[str] = []
        for topic in v:
            trimmed = topic.strip()
            if trimmed:
                cleaned.append(trimmed.lower())
        return cleaned
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class RepoSnapshot(BaseModel):
    """Repository snapshot for trend analysis.

    Pairs a repository identifier with a star count and the time the count
    was recorded.
    """

    # Repository identifier; must be positive. Presumably matches
    # Repository.id -- confirm against the storage layer.
    repo_id: int = Field(gt=0)
    # Star count recorded in this snapshot; must be non-negative.
    stars: int = Field(ge=0)
    # Timestamp of the snapshot.
    snapshot_at: datetime
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class FilterConfig(BaseModel):
    """Filter settings applied when scraping.

    Every string-list field is normalized on validation: entries are stripped
    of surrounding whitespace and blank entries are removed.
    """

    keywords: list[str] = Field(default_factory=list)
    topics: list[str] = Field(default_factory=list)
    languages: list[str] = Field(default_factory=list)
    exclude_keywords: list[str] = Field(default_factory=list)
    min_stars: int = Field(default=100, ge=0)

    @field_validator("keywords", "topics", "languages", "exclude_keywords")
    @classmethod
    def clean_string_list(cls, v: list[str]) -> list[str]:
        """Return the list with each entry stripped and blank entries dropped."""
        cleaned: list[str] = []
        for entry in v:
            trimmed = entry.strip()
            if trimmed:
                cleaned.append(trimmed)
        return cleaned
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class ScrapeConfig(BaseModel):
    """Scrape configuration.

    All fields have defaults, so an empty configuration is valid.
    """

    # Names of data fields to collect; empty by default. The accepted values
    # are not defined in this module -- check the consumer.
    data_fields: list[str] = Field(default_factory=list)
    # Upper bound on results; must be positive (default 100).
    max_results: int = Field(default=100, gt=0)
    # Concurrency setting; must be between 1 and 100 inclusive (default 5).
    concurrency: int = Field(default=5, gt=0, le=100)
    # Cache time-to-live; must be non-negative (default 3600).
    # NOTE(review): the unit is presumably seconds -- confirm against the cache module.
    cache_ttl: int = Field(default=3600, ge=0)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Excel exporter for generating spreadsheet reports."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from ai_scraper.models import Repository
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ExcelExporter:
    """Export repositories to an Excel (.xlsx) workbook via openpyxl."""

    # Characters Excel forbids in worksheet names; each is replaced by "_".
    _INVALID_TITLE_CHARS = str.maketrans({ch: "_" for ch in "\\/?*[]:"})
    # Excel's limit on worksheet name length.
    _MAX_TITLE_LENGTH = 31
    # (column letter, header label, column width) in left-to-right order.
    _COLUMNS = (
        ("A", "Name", 30),
        ("B", "Stars", 10),
        ("C", "Language", 12),
        ("D", "Topics", 25),
        ("E", "Description", 50),
        ("F", "URL", 40),
        ("G", "Updated", 12),
    )

    def __init__(self, output_dir: Path, filename: str = "repositories.xlsx"):
        """Initialize the exporter.

        Args:
            output_dir: Directory for output files.
            filename: Name of the output file.
        """
        self.output_dir = Path(output_dir)
        self.filename = filename

    def export_repositories(self, repos: list[Repository], title: str = "AI Repositories") -> Path:
        """Export repositories to an Excel file.

        Writes one styled header row followed by one row per repository.
        Only the first five topics of each repository are written.

        Args:
            repos: List of repositories to export.
            title: Sheet title. Characters Excel does not allow in sheet
                names are replaced with "_" and the result is limited to
                31 characters.

        Returns:
            Path to the created file.

        Raises:
            ImportError: If openpyxl is not installed.
        """
        try:
            import openpyxl
            from openpyxl.styles import Alignment, Font, PatternFill
        except ImportError as err:
            raise ImportError(
                "openpyxl is required for Excel export. Install with: pip install openpyxl"
            ) from err

        self.output_dir.mkdir(parents=True, exist_ok=True)

        wb = openpyxl.Workbook()
        ws = wb.active
        ws.title = self._safe_sheet_title(title)

        # Header row styling.
        header_font = Font(bold=True, color="FFFFFF")
        header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
        header_alignment = Alignment(horizontal="center")

        for col, (letter, header, width) in enumerate(self._COLUMNS, 1):
            cell = ws.cell(row=1, column=col, value=header)
            cell.font = header_font
            cell.fill = header_fill
            cell.alignment = header_alignment
            ws.column_dimensions[letter].width = width

        # Data rows start at row 2, directly under the header.
        for row, repo in enumerate(repos, 2):
            self._write_text(ws, row, 1, repo.full_name)
            ws.cell(row=row, column=2, value=repo.stars)
            self._write_text(ws, row, 3, repo.language or "-")
            self._write_text(ws, row, 4, ", ".join(repo.topics[:5]))
            self._write_text(ws, row, 5, repo.description or "")
            self._write_text(ws, row, 6, repo.url)
            updated = repo.updated_at.strftime("%Y-%m-%d") if repo.updated_at else ""
            ws.cell(row=row, column=7, value=updated)

        output_path = self.output_dir / self.filename
        wb.save(output_path)

        return output_path

    @classmethod
    def _safe_sheet_title(cls, title: str) -> str:
        """Return ``title`` reduced to a name Excel accepts for a worksheet."""
        cleaned = title.translate(cls._INVALID_TITLE_CHARS)[: cls._MAX_TITLE_LENGTH]
        # Excel also rejects names that begin or end with an apostrophe.
        cleaned = cleaned.strip("'")
        return cleaned or "Sheet"

    @staticmethod
    def _write_text(ws, row: int, column: int, text: str) -> None:
        """Write scraped text into a cell, never letting it become a formula."""
        cell = ws.cell(row=row, column=column, value=text)
        if text.startswith("="):
            # openpyxl types a string with a leading "=" as a formula. The
            # text comes from third-party repositories, so force plain string.
            cell.data_type = "s"
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""HTML exporter for generating web reports."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from html import escape as html_escape
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from ai_scraper.models import Repository
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class HTMLExporter:
    """Export repositories to a standalone HTML report."""

    # Maximum number of visible characters kept from a description.
    _MAX_DESCRIPTION_LENGTH = 100

    def __init__(self, output_dir: Path, filename: str = "index.html"):
        """Initialize the exporter.

        Args:
            output_dir: Directory for output files.
            filename: Name of the output file.
        """
        self.output_dir = Path(output_dir)
        self.filename = filename

    def export_repositories(self, repos: list[Repository], title: str = "AI Repositories") -> Path:
        """Export repositories to an HTML file.

        Creates the output directory if needed and writes the page as UTF-8.

        Args:
            repos: List of repositories to export.
            title: Page title.

        Returns:
            Path to the created file.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        content = self._generate_html(repos, title)

        output_path = self.output_dir / self.filename
        output_path.write_text(content, encoding="utf-8")

        return output_path

    def _generate_html(self, repos: list[Repository], title: str) -> str:
        """Generate the full HTML document: styles, summary stats and table.

        Args:
            repos: Repositories to list.
            title: Page title; HTML-escaped before it is inserted.

        Returns:
            The complete HTML page as a string.
        """
        # Escape title to prevent XSS
        safe_title = html_escape(title)
        generated_at = datetime.now().strftime("%Y-%m-%d %H:%M")
        repo_count = len(repos)
        total_stars = sum(r.stars for r in repos)
        table_rows = self._generate_rows(repos)

        # Literal CSS braces are doubled because this is an f-string.
        return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{safe_title}</title>
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; background: #f6f8fa; }}
.container {{ max-width: 1200px; margin: 0 auto; padding: 20px; }}
h1 {{ color: #24292f; margin-bottom: 10px; }}
.meta {{ color: #57606a; margin-bottom: 20px; }}
.stats {{ display: flex; gap: 20px; margin-bottom: 30px; flex-wrap: wrap; }}
.stat-box {{ background: white; padding: 15px 20px; border-radius: 8px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }}
.stat-number {{ font-size: 24px; font-weight: bold; color: #0969da; }}
.stat-label {{ color: #57606a; font-size: 14px; }}
table {{ width: 100%; background: white; border-radius: 8px; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }}
th {{ background: #f6f8fa; padding: 12px 16px; text-align: left; font-weight: 600; color: #24292f; }}
td {{ padding: 12px 16px; border-top: 1px solid #d0d7de; }}
tr:hover {{ background: #f6f8fa; }}
a {{ color: #0969da; text-decoration: none; }}
a:hover {{ text-decoration: underline; }}
.stars {{ color: #e3b341; }}
.language {{ background: #ddf4ff; color: #0969da; padding: 2px 8px; border-radius: 12px; font-size: 12px; }}
.description {{ color: #57606a; }}
@media (max-width: 768px) {{
.container {{ padding: 10px; }}
table {{ font-size: 14px; }}
th, td {{ padding: 8px; }}
}}
</style>
</head>
<body>
<div class="container">
<h1>{safe_title}</h1>
<p class="meta">Updated: {generated_at} | Total: {repo_count} repositories</p>

<div class="stats">
<div class="stat-box">
<div class="stat-number">{repo_count}</div>
<div class="stat-label">Repositories</div>
</div>
<div class="stat-box">
<div class="stat-number">{total_stars:,}</div>
<div class="stat-label">Total Stars</div>
</div>
</div>

<table>
<thead>
<tr>
<th>Name</th>
<th>Stars</th>
<th>Language</th>
<th>Description</th>
</tr>
</thead>
<tbody>
{table_rows}
</tbody>
</table>
</div>
</body>
</html>"""

    def _generate_rows(self, repos: list[Repository]) -> str:
        """Generate one ``<tr>`` per repository, joined by newlines."""
        rows = []
        for repo in repos:
            # Escape all user-controlled data to prevent XSS
            safe_url = self._safe_url(repo.url)
            safe_name = html_escape(repo.full_name, quote=True)
            safe_language = html_escape(repo.language or "-", quote=True)
            safe_description = self._clean_description(repo.description)

            rows.append(f""" <tr>
<td><a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_name}</a></td>
<td><span class="stars">★ {repo.stars:,}</span></td>
<td><span class="language">{safe_language}</span></td>
<td class="description">{safe_description}</td>
</tr>""")
        return "\n".join(rows)

    def _safe_url(self, url: str) -> str:
        """Validate and escape URL to prevent XSS.

        Only allows http:// and https:// URLs.
        Returns '#' for invalid URLs.
        """
        # Escaping never alters the scheme prefix, so checking the raw value
        # is equivalent to checking the escaped one.
        if not url.startswith(('http://', 'https://')):
            return '#'
        return html_escape(url, quote=True)

    def _clean_description(self, description: Optional[str]) -> str:
        """Clean description for HTML.

        Truncates the text to 100 characters (97 plus an ellipsis) and then
        escapes HTML characters.

        Args:
            description: Raw description text, or None.

        Returns:
            The escaped, possibly truncated text; "" when there is none.
        """
        if not description:
            return ""
        # Truncate BEFORE escaping: cutting the escaped string can split an
        # entity such as "&amp;" and emit broken markup. It would also count
        # entity characters against the visible length limit.
        if len(description) > self._MAX_DESCRIPTION_LENGTH:
            description = description[: self._MAX_DESCRIPTION_LENGTH - 3] + "..."
        return html_escape(description, quote=True)
|