github-ai-scraper 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_scraper/__init__.py +3 -0
- ai_scraper/api/__init__.py +6 -0
- ai_scraper/api/github.py +340 -0
- ai_scraper/api/gitlab.py +418 -0
- ai_scraper/api/rate_limiter.py +120 -0
- ai_scraper/api_server.py +196 -0
- ai_scraper/auth.py +68 -0
- ai_scraper/backup.py +112 -0
- ai_scraper/cache.py +95 -0
- ai_scraper/classifier.py +135 -0
- ai_scraper/cli.py +747 -0
- ai_scraper/config.py +237 -0
- ai_scraper/config_watcher.py +82 -0
- ai_scraper/dedup.py +148 -0
- ai_scraper/filters/__init__.py +5 -0
- ai_scraper/filters/ai_filter.py +93 -0
- ai_scraper/health.py +155 -0
- ai_scraper/i18n.py +141 -0
- ai_scraper/interactive.py +96 -0
- ai_scraper/keywords/__init__.py +5 -0
- ai_scraper/keywords/extractor.py +274 -0
- ai_scraper/logging_config.py +74 -0
- ai_scraper/models/__init__.py +5 -0
- ai_scraper/models/repository.py +72 -0
- ai_scraper/output/__init__.py +6 -0
- ai_scraper/output/excel.py +79 -0
- ai_scraper/output/html.py +152 -0
- ai_scraper/output/markdown.py +338 -0
- ai_scraper/output/rss.py +82 -0
- ai_scraper/output/translator.py +303 -0
- ai_scraper/plugin_system.py +146 -0
- ai_scraper/plugins/__init__.py +5 -0
- ai_scraper/retry.py +134 -0
- ai_scraper/scheduler.py +84 -0
- ai_scraper/scrape_progress.py +99 -0
- ai_scraper/secure_storage.py +127 -0
- ai_scraper/storage/__init__.py +5 -0
- ai_scraper/storage/async_database.py +237 -0
- ai_scraper/storage/database.py +456 -0
- ai_scraper/webhooks.py +95 -0
- github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
- github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
- github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
- github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
ai_scraper/cli.py
ADDED
|
@@ -0,0 +1,747 @@
|
|
|
1
|
+
"""CLI entry point for ai-scraper."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import io
|
|
5
|
+
import re
|
|
6
|
+
import sys
|
|
7
|
+
from datetime import datetime, timedelta
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
import click
|
|
12
|
+
import rich.table as table
|
|
13
|
+
from rich import print as rprint
|
|
14
|
+
from rich.console import Console
|
|
15
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
|
|
16
|
+
|
|
17
|
+
from ai_scraper import __version__
|
|
18
|
+
from ai_scraper.api.github import GitHubClient
|
|
19
|
+
from ai_scraper.api.gitlab import GitLabClient
|
|
20
|
+
from ai_scraper.config import Config, load_config
|
|
21
|
+
from ai_scraper.filters.ai_filter import AIFilter
|
|
22
|
+
from ai_scraper.keywords.extractor import KeywordExtractor
|
|
23
|
+
from ai_scraper.models.repository import FilterConfig as FilterConfigModel
|
|
24
|
+
from ai_scraper.output.markdown import MarkdownExporter
|
|
25
|
+
from ai_scraper.storage.database import Database
|
|
26
|
+
|
|
27
|
+
# Shared Rich console for all CLI output.
# On Windows, first try to switch stdout to UTF-8 when the stream offers
# reconfigure(); failure is tolerated and the existing encoding is kept.
if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
    try:
        sys.stdout.reconfigure(encoding='utf-8')
    except (OSError, ValueError):
        pass
console = Console(force_terminal=True, legacy_windows=False)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Everything outside the displayable set (printable ASCII, CJK unified
# ideographs, Cyrillic, tab) is dropped. CR/LF are excluded from the match
# here because they are turned into spaces afterwards instead of removed.
_UNDISPLAYABLE_CHARS = re.compile(r'[^\x20-\x7e\u4e00-\u9fff\u0400-\u04ff\t\n\r]')
_LINE_BREAKS_TO_SPACES = {ord('\n'): ' ', ord('\r'): ' '}


def clean_text(text: str) -> str:
    """Remove emoji and special characters that can't be displayed in Windows terminal.

    Args:
        text: Text to sanitize. None or an empty string is accepted.

    Returns:
        The text restricted to printable ASCII, tab, CJK unified ideographs
        (U+4E00-U+9FFF) and Cyrillic (U+0400-U+04FF). Each newline or
        carriage return becomes a single space; every other character is
        removed. Returns "" for falsy input.
    """
    if not text:
        return ""
    # One regex pass plus one translate pass replaces the per-character loop.
    return _UNDISPLAYABLE_CHARS.sub('', text).translate(_LINE_BREAKS_TO_SPACES)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def parse_since_param(since: Optional[str]) -> Optional[datetime]:
|
|
59
|
+
"""Parse the --since parameter into a datetime.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
since: Either YYYY-MM-DD format or relative like '1d', '1w', '1m'.
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
datetime representing the cutoff time, or None if since is None.
|
|
66
|
+
|
|
67
|
+
Raises:
|
|
68
|
+
ValueError: If the format is invalid.
|
|
69
|
+
"""
|
|
70
|
+
if since is None:
|
|
71
|
+
return None
|
|
72
|
+
|
|
73
|
+
# Try YYYY-MM-DD format first
|
|
74
|
+
if re.match(r'^\d{4}-\d{2}-\d{2}$', since):
|
|
75
|
+
return datetime.strptime(since, '%Y-%m-%d')
|
|
76
|
+
|
|
77
|
+
# Try relative format: number + unit (d, w, m)
|
|
78
|
+
match = re.match(r'^(\d+)([dwmy])$', since.lower())
|
|
79
|
+
if match:
|
|
80
|
+
amount = int(match.group(1))
|
|
81
|
+
unit = match.group(2)
|
|
82
|
+
|
|
83
|
+
if unit == 'd':
|
|
84
|
+
return datetime.now() - timedelta(days=amount)
|
|
85
|
+
elif unit == 'w':
|
|
86
|
+
return datetime.now() - timedelta(weeks=amount)
|
|
87
|
+
elif unit == 'm':
|
|
88
|
+
return datetime.now() - timedelta(days=amount * 30)
|
|
89
|
+
elif unit == 'y':
|
|
90
|
+
return datetime.now() - timedelta(days=amount * 365)
|
|
91
|
+
|
|
92
|
+
raise ValueError(
|
|
93
|
+
f"Invalid --since format: '{since}'. "
|
|
94
|
+
"Use YYYY-MM-DD or relative format like '1d', '1w', '1m', '1y'."
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@click.group()
@click.version_option(version=__version__)
@click.option("--config", "-c", type=click.Path(exists=True), help="Config file path")
@click.pass_context
def cli(ctx: click.Context, config: Optional[str]):
    """GitHub AI high-star repositories scraper."""
    # Without --config, fall back to ai-scraper.yaml relative to the
    # current working directory.
    if config:
        config_path = Path(config)
    else:
        config_path = Path("ai-scraper.yaml")

    # ctx.obj is the dict shared with every subcommand.
    shared = ctx.ensure_object(dict)
    shared["config"] = load_config(config_path)
    shared["config_path"] = config_path
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
async def _fetch_page(client, platform: str, query: str, page: int, per_page: int, min_stars: int):
    """Fetch one page of search results from the client of the selected platform."""
    if platform == "gitlab":
        return await client.search_projects(
            query=query,
            sort="star_count",
            order="desc",
            page=page,
            per_page=per_page,
            min_stars=min_stars,
        )
    return await client.search_repositories(
        query=query,
        sort="stars",
        order="desc",
        page=page,
        per_page=per_page,
    )


async def _collect_ai_repos(client, db, config, platform: str, query: str, *,
                            on_saved=None, on_page=None):
    """Page through search results, saving AI-related repos until the limit is hit.

    Args:
        client: Platform client providing search_projects / search_repositories.
        db: Database receiving each accepted repository and a star snapshot.
        config: Loaded Config (filter and scrape sections are read).
        platform: "github" or "gitlab".
        query: Search query passed to the client.
        on_saved: Optional callback(total_saved) invoked after every saved repo.
        on_page: Optional callback(page, found_on_page, total_saved) invoked
            after a page was processed without reaching the result limit.

    Returns:
        List of accepted repositories, at most config.scrape.max_results long.
    """
    max_results = config.scrape.max_results
    per_page = min(100, max_results)
    ai_filter = AIFilter()
    # The filter settings do not change between pages, so build them once.
    filter_config = FilterConfigModel(
        keywords=config.filter.keywords,
        topics=config.filter.topics,
        languages=config.filter.languages,
        exclude_keywords=config.filter.exclude_keywords,
        min_stars=config.filter.min_stars,
    )

    collected = []
    page = 1
    while len(collected) < max_results:
        repos = await _fetch_page(client, platform, query, page, per_page, config.filter.min_stars)
        if not repos:
            break

        for repo in repos:
            if len(collected) >= max_results:
                break  # Limit reached, stop adding.
            if ai_filter.is_ai_related(repo, filter_config):
                score = ai_filter.score_relevance(repo)
                db.save_repository(repo, relevance_score=score)
                # Save snapshot for trend analysis
                db.save_snapshot(repo.id, repo.stars, datetime.now())
                collected.append(repo)
                if on_saved is not None:
                    on_saved(len(collected))

        # Check whether the limit has been reached.
        if len(collected) >= max_results:
            break

        if on_page is not None:
            on_page(page, len(repos), len(collected))
        page += 1

        # A short page means the platform has no further results.
        if len(repos) < per_page:
            break

    return collected


@cli.command()
@click.option("--min-stars", type=int, help="Minimum stars filter")
@click.option("--max-results", type=int, help="Maximum results to fetch")
@click.option("--incremental", is_flag=True, help="Only fetch repos updated since last scrape")
@click.option("--since", type=str, help="Fetch repos updated since date (YYYY-MM-DD or 1d/1w/1m)")
@click.option("--progress/--no-progress", default=True, help="Show progress bar (default: on)")
@click.option("--platform", type=click.Choice(["github", "gitlab"]), default="github", help="Platform to scrape (github or gitlab)")
@click.option("--gitlab-url", type=str, help="GitLab instance URL (for self-hosted GitLab)")
@click.pass_context
def scrape(ctx: click.Context, min_stars: Optional[int], max_results: Optional[int],
           incremental: bool, since: Optional[str], progress: bool, platform: str, gitlab_url: Optional[str]):
    """Scrape AI repositories from GitHub or GitLab."""
    config: Config = ctx.obj["config"]

    # Override config with CLI options
    if min_stars is not None:
        config.filter.min_stars = min_stars
    if max_results is not None:
        config.scrape.max_results = max_results

    # Parse --since parameter; bad input ends the command with status 1.
    since_date: Optional[datetime] = None
    if since:
        try:
            since_date = parse_since_param(since)
        except (ValueError, OverflowError) as e:
            console.print(f"[red]Error: {e}[/red]")
            sys.exit(1)
        console.print(f"[dim]Fetching repos updated since: {since_date}[/dim]")

    console.print(f"[bold blue]Starting scrape from {platform}...[/bold blue]")

    async def run_scrape(since_date_inner: Optional[datetime]):
        # Create appropriate client based on platform
        if platform == "gitlab":
            client = GitLabClient(
                token=config.gitlab.token or config.github.token,
                base_url=gitlab_url or config.gitlab.base_url
            )
        else:
            client = GitHubClient(token=config.github.token)

        # Nested try/finally: the client is closed even if database setup
        # fails, and the database is closed even if a later step raises.
        try:
            db = Database(Path(config.database.path))
            try:
                db.init_db()
                keyword_extractor = KeywordExtractor(
                    Path(config.keywords.file),
                    max_keywords=config.keywords.max_keywords
                )
                markdown_exporter = MarkdownExporter(
                    Path(config.output.dir),
                    filename=config.output.filename
                )

                # Incremental mode: resume from the last recorded scrape
                # unless an explicit --since was given.
                if incremental and since_date_inner is None:
                    last_scrape = db.get_last_scrape_time()
                    if last_scrape:
                        since_date_inner = last_scrape
                        if not progress:
                            console.print(f"[dim]Incremental mode: fetching repos since last scrape ({last_scrape})[/dim]")
                    elif not progress:
                        console.print("[dim]Incremental mode: no previous scrape found, fetching all repos[/dim]")

                # Build search query based on platform
                if platform == "gitlab":
                    # GitLab uses different search syntax
                    query = "ai"  # GitLab search is simpler
                    if not progress:
                        console.print(f"[dim]Searching GitLab for: {query}[/dim]")
                else:
                    primary_topic = config.filter.topics[0] if config.filter.topics else "ai"
                    query = f"stars:>{config.filter.min_stars} topic:{primary_topic}"
                    if since_date_inner:
                        date_str = since_date_inner.strftime('%Y-%m-%d')
                        query += f" pushed:>{date_str}"
                    if not progress:
                        console.print(f"[dim]Query: {query}[/dim]")

                # Search repositories
                if progress:
                    with Progress(
                        SpinnerColumn(),
                        TextColumn("[progress.description]{task.description}"),
                        BarColumn(),
                        TaskProgressColumn(),
                        console=console,
                    ) as progress_bar:
                        task = progress_bar.add_task(
                            f"[cyan]Scraping {platform} repositories...",
                            total=config.scrape.max_results
                        )
                        all_repos = await _collect_ai_repos(
                            client, db, config, platform, query,
                            on_saved=lambda count: progress_bar.update(task, completed=count),
                        )
                        # Ensure progress shows final count
                        progress_bar.update(task, completed=len(all_repos))
                else:
                    all_repos = await _collect_ai_repos(
                        client, db, config, platform, query,
                        on_page=lambda page, found, total: console.print(
                            f"[dim]Page {page}: found {found} repos, {total} total AI-related[/dim]"
                        ),
                    )

                if all_repos:
                    console.print("[dim]Extracting keywords...[/dim]")
                    existing_keywords = keyword_extractor.load_keywords()
                    new_keywords = keyword_extractor.extract_from_repos(all_repos)
                    merged = keyword_extractor.merge_keywords(existing_keywords, new_keywords)
                    keyword_extractor.save_keywords(merged)
                    console.print(f"[dim]Keywords updated: {len(merged)} total[/dim]")

                    console.print("[dim]Generating Markdown report...[/dim]")
                    output_path = markdown_exporter.export_repositories(all_repos)
                    console.print(f"[dim]Report saved to: {output_path}[/dim]")

                console.print(f"[bold green]Scraped {len(all_repos)} AI repositories[/bold green]")
            finally:
                db.close()
        finally:
            await client.close()

    asyncio.run(run_scrape(since_date))
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
@cli.command("list")
@click.option("--sort", type=click.Choice(["stars", "updated", "relevance"]), default="stars")
@click.option("--lang", type=str, help="Filter by language")
@click.option("--limit", type=int, default=20, help="Number of results")
@click.pass_context
def list_repos(ctx: click.Context, sort: str, lang: Optional[str], limit: int):
    """List scraped repositories."""
    config: Config = ctx.obj["config"]
    db_path = Path(config.database.path)

    # Check for the file before constructing Database, matching
    # keywords_extract.
    if not db_path.exists():
        console.print("[yellow]No database found. Run 'ai-scraper scrape' first.[/yellow]")
        return

    db = Database(db_path)
    try:
        db.init_db()
        if lang:
            # Filter by language BEFORE applying --limit; otherwise only the
            # matches that happen to be among the first `limit` rows are
            # shown. 10000 is the bulk-read size the export command uses.
            wanted = lang.lower()
            matching = [
                r for r in db.get_all_repositories(limit=10000, sort_by=sort)
                if r.language and r.language.lower() == wanted
            ]
            repos = matching[:limit] if limit >= 0 else matching
        else:
            repos = db.get_all_repositories(limit=limit, sort_by=sort)
    finally:
        db.close()

    # Create table
    tbl = table.Table(title=f"AI Repositories (sorted by {sort})")
    tbl.add_column("Name", style="cyan")
    tbl.add_column("Stars", justify="right", style="yellow")
    tbl.add_column("Language", style="green")
    tbl.add_column("Description", max_width=40)

    for repo in repos:
        stars_str = f"{repo.stars:,}"
        desc = clean_text(repo.description)
        # Keep descriptions within the 40-character column.
        if len(desc) > 40:
            desc = desc[:37] + "..."
        tbl.add_row(repo.name, stars_str, repo.language or "-", desc)

    console.print(tbl)
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
@cli.command()
@click.option("--days", type=int, default=7, help="Days to analyze")
@click.option("--top", type=int, default=10, help="Number of top results")
@click.pass_context
def trending(ctx: click.Context, days: int, top: int):
    """Show trending repositories by star growth."""
    config: Config = ctx.obj["config"]
    db_path = Path(config.database.path)

    # Check for the file before constructing Database, matching
    # keywords_extract.
    if not db_path.exists():
        console.print("[yellow]No database found. Run 'ai-scraper scrape' first.[/yellow]")
        return

    db = Database(db_path)
    try:
        db.init_db()
        trends = db.get_trending(days=days, limit=top)
    finally:
        db.close()

    if not trends:
        console.print(f"[yellow]No trending data found for the last {days} days.[/yellow]")
        console.print("[dim]Run 'ai-scraper scrape' multiple times to build trend data.[/dim]")
        return

    tbl = table.Table(title=f"Trending Repositories (last {days} days)")
    tbl.add_column("Repository", style="cyan")
    tbl.add_column("Growth", justify="right", style="green")
    tbl.add_column("Stars", justify="right", style="yellow")

    for trend in trends:
        # The "+" format flag emits the correct sign, so a decline renders
        # as "-5.0%" instead of "+-5.0%".
        growth_str = f"{trend.growth_rate * 100:+.1f}%"
        stars_str = f"{trend.current_stars:,}"
        tbl.add_row(trend.repo_name, growth_str, stars_str)

    console.print(tbl)
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
@cli.group()
def config_cmd():
    """Configuration management."""
    # Container group only: the subcommands attach via @config_cmd.command.
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
@config_cmd.command("init")
@click.pass_context
def config_init(ctx: click.Context):
    """Initialize configuration file."""
    import shutil

    config_path: Path = ctx.obj["config_path"]

    # Never overwrite an existing configuration.
    if config_path.exists():
        console.print(f"[yellow]Config file already exists at {config_path}[/yellow]")
        return

    # Copy default config: the template is looked up three directory levels
    # above this module.
    # NOTE(review): that location presumably only exists in a source
    # checkout, not in an installed wheel — confirm and consider shipping
    # the template as package data.
    default_config = Path(__file__).parent.parent.parent / "ai-scraper.yaml"

    if not default_config.exists():
        console.print("[red]Default config not found[/red]")
        # Report the failure through the exit status, as scrape does.
        sys.exit(1)

    shutil.copy(default_config, config_path)
    console.print(f"[green]Created config file at {config_path}[/green]")
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
@config_cmd.command("show")
@click.pass_context
def config_show(ctx: click.Context):
    """Show current configuration."""
    config: Config = ctx.obj["config"]

    def preview(items, shown=5):
        """Join the first `shown` items; add '...' only if more exist."""
        text = ', '.join(items[:shown])
        return f"{text}..." if len(items) > shown else text

    def masked(token):
        """Hide the token value, revealing only whether one is set."""
        return '***' if token else 'Not set'

    console.print("[bold]Current Configuration:[/bold]")
    console.print(f"  GitHub Token: {masked(config.github.token)}")
    console.print(f"  GitLab Token: {masked(config.gitlab.token)}")
    console.print(f"  GitLab URL: {config.gitlab.base_url}")
    console.print(f"  Cache TTL: {config.github.cache_ttl}s")
    console.print(f"  Min Stars: {config.filter.min_stars}")
    console.print(f"  Keywords: {preview(config.filter.keywords)}")
    console.print(f"  Topics: {preview(config.filter.topics)}")
    console.print(f"  Max Results: {config.scrape.max_results}")
    console.print(f"  Database: {config.database.path}")
    console.print(f"  Scheduler: {'enabled' if config.scheduler.enabled else 'disabled'}")
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
# Register the config_cmd group on the root CLI under the command name "config".
cli.add_command(config_cmd, name="config")
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
@cli.group()
def db_cmd():
    """Database management."""
    # Container group only: the subcommands attach via @db_cmd.command.
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
@db_cmd.command("stats")
@click.pass_context
def db_stats(ctx: click.Context):
    """Show database statistics."""
    config: Config = ctx.obj["config"]
    db_path = Path(config.database.path)

    # Check for the file before constructing Database, matching
    # keywords_extract.
    if not db_path.exists():
        console.print("[yellow]No database found. Run 'ai-scraper scrape' first.[/yellow]")
        return

    db = Database(db_path)
    try:
        db.init_db()
        stats = db.get_stats()
    finally:
        db.close()

    # NOTE(review): total_stars may presumably be None for an empty database
    # (e.g. if computed with SQL SUM) — treat that as 0 so the thousands
    # format cannot fail. Confirm against Database.get_stats.
    total_stars = stats['total_stars'] or 0

    console.print("[bold]Database Statistics:[/bold]")
    console.print(f"  Repository count: {stats['repository_count']}")
    console.print(f"  Snapshot count: {stats['snapshot_count']}")
    console.print(f"  Total stars: {total_stars:,}")
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
@db_cmd.command("clean")
@click.option("--days", type=int, default=30, help="Keep snapshots from last N days")
@click.option("--invalid", is_flag=True, help="Remove repos with invalid data")
@click.option("--vacuum", is_flag=True, help="Optimize database size")
@click.pass_context
def db_clean(ctx: click.Context, days: int, invalid: bool, vacuum: bool):
    """Clean and optimize database.

    \b
    Examples:
        ai-scraper db clean --days 30          # Clean old snapshots
        ai-scraper db clean --invalid          # Remove invalid repos
        ai-scraper db clean --vacuum           # Optimize database
        ai-scraper db clean --invalid --vacuum # Both
    """
    # The "\b" line in the docstring tells click not to re-wrap the examples
    # into a single paragraph in --help output.
    config: Config = ctx.obj["config"]
    db_path = Path(config.database.path)

    # Check for the file before constructing Database, matching
    # keywords_extract.
    if not db_path.exists():
        console.print("[yellow]No database found.[/yellow]")
        return

    db = Database(db_path)
    try:
        db.init_db()

        # Clean old snapshots (default behavior when no other action is requested)
        if not invalid and not vacuum:
            deleted = db.clean_old_snapshots(days=days)
            console.print(f"[green]Deleted {deleted} old snapshots[/green]")

        # Remove invalid repos
        if invalid:
            removed = db.clean_invalid_repos()
            console.print(f"[green]Removed {removed} invalid repositories[/green]")

        # Optimize database
        if vacuum:
            db.vacuum()
            console.print("[green]Database optimized[/green]")
    finally:
        db.close()
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
@db_cmd.command("export")
@click.option("--format", "-f", type=click.Choice(["csv", "json", "html", "markdown"]), default="csv")
@click.option("--output", "-o", type=click.Path(), default=None,
              help="Output file (default: export.<ext> matching --format)")
@click.pass_context
def db_export(ctx: click.Context, format: str, output: Optional[str] = None):
    """Export database to file."""
    config: Config = ctx.obj["config"]
    db_path = Path(config.database.path)

    # Check for the file before constructing Database, matching
    # keywords_extract.
    if not db_path.exists():
        console.print("[yellow]No database found.[/yellow]")
        return

    # Derive the default file name from the format, so that e.g.
    # "--format json" no longer writes JSON into export.csv.
    if output is None:
        extension = {"markdown": "md"}.get(format, format)
        output = f"export.{extension}"

    db = Database(db_path)
    try:
        db.init_db()
        repos = db.get_all_repositories(limit=10000)
    finally:
        db.close()

    if format == "csv":
        import csv

        with open(output, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["name", "stars", "language", "description", "url"])
            writer.writerows(
                [repo.name, repo.stars, repo.language or "", repo.description or "", repo.url]
                for repo in repos
            )

        console.print(f"[green]Exported {len(repos)} repositories to {output}[/green]")

    elif format == "json":
        import json

        data = {
            "repositories": [
                {
                    "name": r.name,
                    "stars": r.stars,
                    "language": r.language,
                    "description": r.description,
                    "url": r.url,
                }
                for r in repos
            ],
            "total": len(repos),
        }

        with open(output, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

        console.print(f"[green]Exported {len(repos)} repositories to {output}[/green]")

    elif format == "html":
        from ai_scraper.output.html import HTMLExporter

        # The exporter is given config.output.dir plus the file name and
        # returns the path it wrote to.
        exporter = HTMLExporter(Path(config.output.dir), filename=output)
        path = exporter.export_repositories(repos)
        console.print(f"[green]Exported {len(repos)} repositories to {path}[/green]")

    elif format == "markdown":
        exporter = MarkdownExporter(Path(config.output.dir), filename=output)
        path = exporter.export_repositories(repos)
        console.print(f"[green]Exported {len(repos)} repositories to {path}[/green]")
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
# Register the db_cmd group on the root CLI under the command name "db".
cli.add_command(db_cmd, name="db")
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
@cli.group()
def keywords_cmd():
    """Keywords management."""
    # Container group only: the subcommands attach via @keywords_cmd.command.
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
@keywords_cmd.command("list")
@click.pass_context
def keywords_list(ctx: click.Context):
    """List all keywords."""
    settings: Config = ctx.obj["config"]
    found = KeywordExtractor(Path(settings.keywords.file)).get_keywords_for_search()

    if found:
        # Header with the count, then one keyword per line in sorted order.
        console.print(f"[bold]Keywords ({len(found)}):[/bold]")
        for keyword in sorted(found):
            console.print(f"  {keyword}")
    else:
        console.print("[yellow]No keywords found.[/yellow]")
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
@keywords_cmd.command("extract")
@click.pass_context
def keywords_extract(ctx: click.Context):
    """Manually extract keywords from existing database."""
    config: Config = ctx.obj["config"]
    db_path = Path(config.database.path)

    if not db_path.exists():
        console.print("[yellow]No database found. Run 'ai-scraper scrape' first.[/yellow]")
        return

    db = Database(db_path)
    try:
        db.init_db()
        repos = db.get_all_repositories(limit=10000)
    finally:
        # Close the database even when the query fails.
        db.close()

    if not repos:
        console.print("[yellow]No repositories in database.[/yellow]")
        return

    # Merge freshly extracted keywords into the stored ones and write the
    # result back to the keywords file.
    extractor = KeywordExtractor(Path(config.keywords.file), max_keywords=config.keywords.max_keywords)
    existing = extractor.load_keywords()
    new = extractor.extract_from_repos(repos)
    merged = extractor.merge_keywords(existing, new)
    extractor.save_keywords(merged)

    console.print(f"[green]Extracted {len(new)} new keywords[/green]")
    console.print(f"[green]Total: {len(merged)} keywords[/green]")
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
@keywords_cmd.command("clear")
@click.pass_context
def keywords_clear(ctx: click.Context):
    """Clear all keywords."""
    settings: Config = ctx.obj["config"]
    keywords_file = Path(settings.keywords.file)
    # Saving an empty set replaces whatever keywords were stored.
    KeywordExtractor(keywords_file).save_keywords(set())
    console.print("[green]Keywords cleared.[/green]")
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
# Register the keywords_cmd group on the root CLI under the command name "keywords".
cli.add_command(keywords_cmd, name="keywords")
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
@cli.command()
@click.option("--host", default="0.0.0.0", help="Server host")
@click.option("--port", default=8080, help="Server port")
@click.pass_context
def serve(ctx: click.Context, host: str, port: int):
    """Start REST API server."""
    # Imported here so the server module is loaded only for this command.
    from ai_scraper.api_server import run_server

    # NOTE(review): the default host 0.0.0.0 listens on every network
    # interface — confirm that this exposure is intended.
    banner = f"[bold green]Starting API server at http://{host}:{port}[/bold green]"
    console.print(banner)
    run_server(port=port, host=host)
|
|
675
|
+
|
|
676
|
+
|
|
677
|
+
@cli.command()
@click.option("--cron", default="0 9 * * *", help="Cron expression for schedule")
@click.option("--max-results", default=100, help="Max results per scrape")
@click.pass_context
def schedule(ctx: click.Context, cron: str, max_results: int):
    """Schedule periodic scraping.

    \b
    Example cron expressions:
        "0 9 * * *"     - Daily at 9:00 AM
        "0 */6 * * *"   - Every 6 hours
        "0 9 * * 1-5"   - Weekdays at 9:00 AM
    """
    # The "\b" line in the docstring tells click not to re-wrap the examples
    # into a single paragraph in --help output.
    from concurrent.futures import ThreadPoolExecutor

    from ai_scraper.scheduler import scheduler

    def run_scrape():
        """Run one scrape in a worker thread and wait for it to finish.

        scrape drives its own event loop with asyncio.run(), which raises
        RuntimeError when called from a thread that already has a running
        loop. Running it in a separate thread avoids that regardless of
        where the scheduler invokes this callback; .result() re-raises any
        failure in the calling thread.
        """
        console.print(f"[dim]{datetime.now()}: Starting scheduled scrape[/dim]")
        with ThreadPoolExecutor(max_workers=1) as pool:
            pool.submit(ctx.invoke, scrape, max_results=max_results).result()

    scheduler.add_task("scrape", cron, run_scrape)

    console.print("[bold green]Scheduler started[/bold green]")
    console.print(f"[dim]Next run: {scheduler.tasks['scrape']['next_run']}[/dim]")

    asyncio.run(scheduler.run())
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
@cli.command()
@click.pass_context
def interactive(ctx: click.Context):
    """Start interactive mode with menu-driven interface."""
    from ai_scraper.interactive import show_main_menu, get_scrape_params
    from rich.prompt import Prompt

    def export_results():
        # Ask for the format, then export to export.<format>.
        format_choice = Prompt.ask("Export format", choices=["csv", "json", "html"], default="csv")
        ctx.invoke(db_export, format=format_choice, output=f"export.{format_choice}")

    # Menu key -> action. Lambdas keep prompts and parameter collection
    # lazy: they only run when their entry is selected.
    actions = {
        "1": lambda: ctx.invoke(scrape, max_results=50),           # Quick scrape
        "2": lambda: ctx.invoke(scrape, max_results=500),          # Deep scrape
        "3": lambda: ctx.invoke(scrape, **get_scrape_params()),    # Custom scrape
        "4": lambda: ctx.invoke(list_repos),                       # View results
        "5": lambda: ctx.invoke(trending),                         # Trending
        "6": export_results,                                       # Export
        "7": lambda: ctx.invoke(config_show),                      # Settings
    }

    while True:
        choice = show_main_menu()

        if choice == "q":
            console.print("\n[cyan]Goodbye![/cyan]")
            break

        # Unrecognised choices fall through and the menu is shown again.
        action = actions.get(choice)
        if action is not None:
            action()
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def main():
    """Invoke the root click command group."""
    cli()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|