github-ai-scraper 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. ai_scraper/__init__.py +3 -0
  2. ai_scraper/api/__init__.py +6 -0
  3. ai_scraper/api/github.py +340 -0
  4. ai_scraper/api/gitlab.py +418 -0
  5. ai_scraper/api/rate_limiter.py +120 -0
  6. ai_scraper/api_server.py +196 -0
  7. ai_scraper/auth.py +68 -0
  8. ai_scraper/backup.py +112 -0
  9. ai_scraper/cache.py +95 -0
  10. ai_scraper/classifier.py +135 -0
  11. ai_scraper/cli.py +747 -0
  12. ai_scraper/config.py +237 -0
  13. ai_scraper/config_watcher.py +82 -0
  14. ai_scraper/dedup.py +148 -0
  15. ai_scraper/filters/__init__.py +5 -0
  16. ai_scraper/filters/ai_filter.py +93 -0
  17. ai_scraper/health.py +155 -0
  18. ai_scraper/i18n.py +141 -0
  19. ai_scraper/interactive.py +96 -0
  20. ai_scraper/keywords/__init__.py +5 -0
  21. ai_scraper/keywords/extractor.py +274 -0
  22. ai_scraper/logging_config.py +74 -0
  23. ai_scraper/models/__init__.py +5 -0
  24. ai_scraper/models/repository.py +72 -0
  25. ai_scraper/output/__init__.py +6 -0
  26. ai_scraper/output/excel.py +79 -0
  27. ai_scraper/output/html.py +152 -0
  28. ai_scraper/output/markdown.py +338 -0
  29. ai_scraper/output/rss.py +82 -0
  30. ai_scraper/output/translator.py +303 -0
  31. ai_scraper/plugin_system.py +146 -0
  32. ai_scraper/plugins/__init__.py +5 -0
  33. ai_scraper/retry.py +134 -0
  34. ai_scraper/scheduler.py +84 -0
  35. ai_scraper/scrape_progress.py +99 -0
  36. ai_scraper/secure_storage.py +127 -0
  37. ai_scraper/storage/__init__.py +5 -0
  38. ai_scraper/storage/async_database.py +237 -0
  39. ai_scraper/storage/database.py +456 -0
  40. ai_scraper/webhooks.py +95 -0
  41. github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
  42. github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
  43. github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
  44. github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
ai_scraper/cli.py ADDED
@@ -0,0 +1,747 @@
1
+ """CLI entry point for ai-scraper."""
2
+
3
+ import asyncio
4
+ import io
5
+ import re
6
+ import sys
7
+ from datetime import datetime, timedelta
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ import click
12
+ import rich.table as table
13
+ from rich import print as rprint
14
+ from rich.console import Console
15
+ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
16
+
17
+ from ai_scraper import __version__
18
+ from ai_scraper.api.github import GitHubClient
19
+ from ai_scraper.api.gitlab import GitLabClient
20
+ from ai_scraper.config import Config, load_config
21
+ from ai_scraper.filters.ai_filter import AIFilter
22
+ from ai_scraper.keywords.extractor import KeywordExtractor
23
+ from ai_scraper.models.repository import FilterConfig as FilterConfigModel
24
+ from ai_scraper.output.markdown import MarkdownExporter
25
+ from ai_scraper.storage.database import Database
26
+
27
# Create console with UTF-8 encoding for Windows.
# On win32 the stdout stream is first asked to switch to UTF-8; the shared
# Rich console is then created for every platform.
if sys.platform == "win32":
    # Not every object installed as sys.stdout offers reconfigure(), hence
    # the hasattr() guard.
    if hasattr(sys.stdout, 'reconfigure'):
        try:
            sys.stdout.reconfigure(encoding='utf-8')
        except (OSError, ValueError):
            # Best effort only: keep whatever encoding the stream already has.
            pass
# Module-wide console used by all commands below for their output.
console = Console(force_terminal=True, legacy_windows=False)
37
+
38
+
39
def clean_text(text: str) -> str:
    """Strip characters that a Windows terminal may be unable to display.

    Printable ASCII, tabs, CJK ideographs (U+4E00-U+9FFF) and Cyrillic
    (U+0400-U+04FF) are kept. Each newline or carriage return becomes a
    single space. Every other character (emoji, zero-width joiners,
    accented Latin letters, ...) is dropped.

    Args:
        text: The string to sanitise; any falsy value yields "".

    Returns:
        The sanitised string.
    """
    if not text:
        return ""

    def _convert(ch: str) -> str:
        # Line breaks are flattened so the text stays on one table row.
        if ch == '\n' or ch == '\r':
            return ' '
        code = ord(ch)
        if 32 <= code <= 126 or ch == '\t':
            return ch
        if 0x4E00 <= code <= 0x9FFF or 0x0400 <= code <= 0x04FF:
            return ch
        return ''

    return ''.join(map(_convert, text))
56
+
57
+
58
def parse_since_param(since: Optional[str]) -> Optional[datetime]:
    """Parse the --since parameter into a datetime.

    Args:
        since: Either an absolute date in YYYY-MM-DD format or a relative
            offset made of a number and a unit: 'd' (days), 'w' (weeks),
            'm' (months, counted as 30 days) or 'y' (years, counted as
            365 days). The unit is case-insensitive and surrounding
            whitespace is ignored.

    Returns:
        datetime representing the cutoff time (naive, local time), or None
        if since is None.

    Raises:
        ValueError: If the format is invalid, the calendar date does not
            exist, or the relative offset is too large to represent.
    """
    if since is None:
        return None

    error_message = (
        f"Invalid --since format: '{since}'. "
        "Use YYYY-MM-DD or relative format like '1d', '1w', '1m', '1y'."
    )
    candidate = since.strip()

    # Absolute form first. strptime rejects impossible dates such as
    # 2024-02-30; re-raise with the same message as every other failure.
    if re.fullmatch(r'\d{4}-\d{2}-\d{2}', candidate):
        try:
            return datetime.strptime(candidate, '%Y-%m-%d')
        except ValueError as exc:
            raise ValueError(error_message) from exc

    # Relative form: number + unit (d, w, m, y).
    match = re.fullmatch(r'(\d+)([dwmy])', candidate.lower())
    if match:
        days_per_unit = {'d': 1, 'w': 7, 'm': 30, 'y': 365}
        amount = int(match.group(1))
        try:
            return datetime.now() - timedelta(days=amount * days_per_unit[match.group(2)])
        except OverflowError as exc:
            # Callers only catch ValueError, so an absurdly large offset
            # must not escape as OverflowError.
            raise ValueError(error_message) from exc

    raise ValueError(error_message)
96
+
97
+
98
@click.group()
@click.version_option(version=__version__)
@click.option("--config", "-c", type=click.Path(exists=True), help="Config file path")
@click.pass_context
def cli(ctx: click.Context, config: Optional[str]):
    """GitHub AI high-star repositories scraper."""
    # Root command: loads the configuration once and shares it with every
    # subcommand through ctx.obj ("config" and "config_path" keys).
    ctx.ensure_object(dict)

    # Without --config, fall back to ai-scraper.yaml in the working directory.
    chosen_path = Path("ai-scraper.yaml") if not config else Path(config)
    ctx.obj.update(config=load_config(chosen_path), config_path=chosen_path)
109
+
110
+
111
async def _fetch_repo_page(client, platform: str, query: str, page: int,
                           per_page: int, min_stars: int):
    """Request one page of search results from the selected platform's client."""
    if platform == "gitlab":
        return await client.search_projects(
            query=query,
            sort="star_count",
            order="desc",
            page=page,
            per_page=per_page,
            min_stars=min_stars,
        )
    return await client.search_repositories(
        query=query,
        sort="stars",
        order="desc",
        page=page,
        per_page=per_page,
    )


async def _collect_ai_repos(client, db, filter_instance, filter_config, *,
                            platform: str, query: str, max_results: int,
                            min_stars: int, on_saved=None, on_page_done=None):
    """Page through search results and persist AI-related repos up to max_results.

    Args:
        client: Platform client exposing search_projects / search_repositories.
        db: Database used to store each accepted repo and its star snapshot.
        filter_instance: Filter providing is_ai_related() and score_relevance().
        filter_config: Filter settings handed to is_ai_related().
        platform: "gitlab" or "github".
        query: Search query string.
        max_results: Hard cap on the number of repos collected.
        min_stars: Minimum star count (forwarded to the GitLab search only).
        on_saved: Optional callback(count) invoked after each stored repo.
        on_page_done: Optional callback(page, fetched, kept) invoked after a
            page was processed without reaching the cap.

    Returns:
        The list of accepted repositories, in the order they were stored.
    """
    collected = []
    page = 1
    per_page = min(100, max_results)

    while len(collected) < max_results:
        repos = await _fetch_repo_page(client, platform, query, page, per_page, min_stars)
        if not repos:
            break

        for repo in repos:
            if len(collected) >= max_results:
                break  # Limit reached, stop adding.
            if filter_instance.is_ai_related(repo, filter_config):
                score = filter_instance.score_relevance(repo)
                db.save_repository(repo, relevance_score=score)
                # Save snapshot for trend analysis
                db.save_snapshot(repo.id, repo.stars, datetime.now())
                collected.append(repo)
                if on_saved is not None:
                    on_saved(len(collected))

        # Check whether the limit has been reached.
        if len(collected) >= max_results:
            break

        if on_page_done is not None:
            on_page_done(page, len(repos), len(collected))
        page += 1

        # A short page means the platform has no further results.
        if len(repos) < per_page:
            break

    return collected


@cli.command()
@click.option("--min-stars", type=int, help="Minimum stars filter")
@click.option("--max-results", type=int, help="Maximum results to fetch")
@click.option("--incremental", is_flag=True, help="Only fetch repos updated since last scrape")
@click.option("--since", type=str, help="Fetch repos updated since date (YYYY-MM-DD or 1d/1w/1m)")
@click.option("--progress/--no-progress", default=True, help="Show progress bar (default: on)")
@click.option("--platform", type=click.Choice(["github", "gitlab"]), default="github", help="Platform to scrape (github or gitlab)")
@click.option("--gitlab-url", type=str, help="GitLab instance URL (for self-hosted GitLab)")
@click.pass_context
def scrape(ctx: click.Context, min_stars: Optional[int], max_results: Optional[int],
           incremental: bool, since: Optional[str], progress: bool, platform: str, gitlab_url: Optional[str]):
    """Scrape AI repositories from GitHub or GitLab."""
    # Flow: apply CLI overrides -> resolve the cutoff date -> search page by
    # page, storing AI-related repos and star snapshots -> refresh the keyword
    # file and the Markdown report. Exits with status 1 on a bad --since value.
    config: Config = ctx.obj["config"]

    # Override config with CLI options
    if min_stars is not None:
        config.filter.min_stars = min_stars
    if max_results is not None:
        config.scrape.max_results = max_results

    # Parse --since parameter
    since_date: Optional[datetime] = None
    if since:
        try:
            since_date = parse_since_param(since)
            console.print(f"[dim]Fetching repos updated since: {since_date}[/dim]")
        except ValueError as e:
            console.print(f"[red]Error: {e}[/red]")
            sys.exit(1)

    console.print(f"[bold blue]Starting scrape from {platform}...[/bold blue]")

    async def run_scrape(since_date_inner: Optional[datetime]):
        # Create appropriate client based on platform
        if platform == "gitlab":
            client = GitLabClient(
                token=config.gitlab.token or config.github.token,
                base_url=gitlab_url or config.gitlab.base_url
            )
        else:
            client = GitHubClient(token=config.github.token)

        # Everything after the client exists runs inside try/finally so the
        # client is closed even when database or exporter setup fails.
        db = None
        try:
            db = Database(Path(config.database.path))
            db.init_db()
            filter_instance = AIFilter()
            keyword_extractor = KeywordExtractor(
                Path(config.keywords.file),
                max_keywords=config.keywords.max_keywords
            )
            markdown_exporter = MarkdownExporter(
                Path(config.output.dir),
                filename=config.output.filename
            )

            # Handle incremental mode (an explicit --since takes precedence).
            if incremental and since_date_inner is None:
                last_scrape = db.get_last_scrape_time()
                if last_scrape:
                    since_date_inner = last_scrape
                    if not progress:
                        console.print(f"[dim]Incremental mode: fetching repos since last scrape ({last_scrape})[/dim]")
                else:
                    if not progress:
                        console.print("[dim]Incremental mode: no previous scrape found, fetching all repos[/dim]")

            # Build search query based on platform
            primary_topic = config.filter.topics[0] if config.filter.topics else "ai"

            if platform == "gitlab":
                # GitLab uses different search syntax
                query = "ai"  # GitLab search is simpler
                if not progress:
                    console.print(f"[dim]Searching GitLab for: {query}[/dim]")
            else:
                # GitHub search
                query = f"stars:>{config.filter.min_stars} topic:{primary_topic}"
                if since_date_inner:
                    date_str = since_date_inner.strftime('%Y-%m-%d')
                    query += f" pushed:>{date_str}"
                if not progress:
                    console.print(f"[dim]Query: {query}[/dim]")

            # The filter settings do not change between pages, so build them once.
            filter_config = FilterConfigModel(
                keywords=config.filter.keywords,
                topics=config.filter.topics,
                languages=config.filter.languages,
                exclude_keywords=config.filter.exclude_keywords,
                min_stars=config.filter.min_stars,
            )
            result_cap = config.scrape.max_results

            if progress:
                # Use progress bar
                with Progress(
                    SpinnerColumn(),
                    TextColumn("[progress.description]{task.description}"),
                    BarColumn(),
                    TaskProgressColumn(),
                    console=console,
                ) as progress_bar:
                    task = progress_bar.add_task(
                        f"[cyan]Scraping {platform} repositories...",
                        total=result_cap
                    )
                    all_repos = await _collect_ai_repos(
                        client, db, filter_instance, filter_config,
                        platform=platform,
                        query=query,
                        max_results=result_cap,
                        min_stars=config.filter.min_stars,
                        on_saved=lambda count: progress_bar.update(task, completed=count),
                    )
                    # Ensure progress shows final count
                    progress_bar.update(task, completed=len(all_repos))
            else:
                # No progress bar - report each processed page on the console.
                def report_page(page_number, fetched, kept):
                    console.print(f"[dim]Page {page_number}: found {fetched} repos, {kept} total AI-related[/dim]")

                all_repos = await _collect_ai_repos(
                    client, db, filter_instance, filter_config,
                    platform=platform,
                    query=query,
                    max_results=result_cap,
                    min_stars=config.filter.min_stars,
                    on_page_done=report_page,
                )

            if all_repos:
                console.print("[dim]Extracting keywords...[/dim]")
                existing_keywords = keyword_extractor.load_keywords()
                new_keywords = keyword_extractor.extract_from_repos(all_repos)
                merged = keyword_extractor.merge_keywords(existing_keywords, new_keywords)
                keyword_extractor.save_keywords(merged)
                console.print(f"[dim]Keywords updated: {len(merged)} total[/dim]")

                console.print("[dim]Generating Markdown report...[/dim]")
                output_path = markdown_exporter.export_repositories(all_repos)
                console.print(f"[dim]Report saved to: {output_path}[/dim]")

            console.print(f"[bold green]Scraped {len(all_repos)} AI repositories[/bold green]")

        finally:
            # Close the database even if closing the client raises.
            try:
                await client.close()
            finally:
                if db is not None:
                    db.close()

    asyncio.run(run_scrape(since_date))
340
+
341
+
342
@cli.command("list")
@click.option("--sort", type=click.Choice(["stars", "updated", "relevance"]), default="stars")
@click.option("--lang", type=str, help="Filter by language")
@click.option("--limit", type=int, default=20, help="Number of results")
@click.pass_context
def list_repos(ctx: click.Context, sort: str, lang: Optional[str], limit: int):
    """List scraped repositories."""
    config: Config = ctx.obj["config"]
    db_path = Path(config.database.path)

    # Test for the file before constructing Database (as keywords_extract
    # does), so a missing database is reported without opening a handle.
    if not db_path.exists():
        console.print("[yellow]No database found. Run 'ai-scraper scrape' first.[/yellow]")
        return

    db = Database(db_path)
    try:
        db.init_db()
        repos = db.get_all_repositories(limit=limit, sort_by=sort)

        # Filter by language if specified.
        # NOTE(review): the filter runs after --limit was applied by the
        # query, so fewer than `limit` rows may be shown - confirm intent.
        if lang:
            wanted = lang.lower()
            repos = [r for r in repos if r.language and r.language.lower() == wanted]

        # Create table
        tbl = table.Table(title=f"AI Repositories (sorted by {sort})")
        tbl.add_column("Name", style="cyan")
        tbl.add_column("Stars", justify="right", style="yellow")
        tbl.add_column("Language", style="green")
        tbl.add_column("Description", max_width=40)

        for repo in repos:
            stars_str = f"{repo.stars:,}"
            desc = clean_text(repo.description)
            # Keep descriptions within the 40-character column.
            desc = desc[:37] + "..." if desc and len(desc) > 40 else desc or ""
            tbl.add_row(repo.name, stars_str, repo.language or "-", desc)

        console.print(tbl)
    finally:
        # Release the handle even if the query or rendering fails.
        db.close()
378
+
379
+
380
@cli.command()
@click.option("--days", type=int, default=7, help="Days to analyze")
@click.option("--top", type=int, default=10, help="Number of top results")
@click.pass_context
def trending(ctx: click.Context, days: int, top: int):
    """Show trending repositories by star growth."""
    config: Config = ctx.obj["config"]
    db_path = Path(config.database.path)

    # Test for the file before constructing Database (as keywords_extract
    # does), so a missing database is reported without opening a handle.
    if not db_path.exists():
        console.print("[yellow]No database found. Run 'ai-scraper scrape' first.[/yellow]")
        return

    db = Database(db_path)
    try:
        db.init_db()
        trends = db.get_trending(days=days, limit=top)

        if not trends:
            console.print(f"[yellow]No trending data found for the last {days} days.[/yellow]")
            console.print("[dim]Run 'ai-scraper scrape' multiple times to build trend data.[/dim]")
            return

        tbl = table.Table(title=f"Trending Repositories (last {days} days)")
        tbl.add_column("Repository", style="cyan")
        tbl.add_column("Growth", justify="right", style="green")
        tbl.add_column("Stars", justify="right", style="yellow")

        for trend in trends:
            # The "+" format flag yields "+5.0%" for growth and "-5.0%" for a
            # decline; a hard-coded "+" prefix would print "+-5.0%".
            growth_str = f"{trend.growth_rate * 100:+.1f}%"
            stars_str = f"{trend.current_stars:,}"
            tbl.add_row(trend.repo_name, growth_str, stars_str)

        console.print(tbl)
    finally:
        # Covers the early return above as well as any failure.
        db.close()
414
+
415
+
416
@cli.group()
def config_cmd():
    """Configuration management."""
    # Container group only; the subcommands are attached via @config_cmd.command.
420
+
421
+
422
@config_cmd.command("init")
@click.pass_context
def config_init(ctx: click.Context):
    """Initialize configuration file."""
    import shutil

    target: Path = ctx.obj["config_path"]

    # Never overwrite a configuration the user already has.
    if target.exists():
        console.print(f"[yellow]Config file already exists at {target}[/yellow]")
        return

    # The template is looked up three directory levels above this module.
    # NOTE(review): that matches a source checkout layout; confirm the file is
    # reachable this way when the package is installed from a wheel.
    template = Path(__file__).parent.parent.parent / "ai-scraper.yaml"

    if not template.exists():
        console.print("[red]Default config not found[/red]")
        return

    # Copy default config
    shutil.copy(template, target)
    console.print(f"[green]Created config file at {target}[/green]")
441
+
442
+
443
@config_cmd.command("show")
@click.pass_context
def config_show(ctx: click.Context):
    """Show current configuration."""
    config: Config = ctx.obj["config"]

    def _preview(items, shown: int = 5) -> str:
        # Join the first `shown` entries; add "..." only when entries were
        # actually left out, so short lists are not shown as truncated.
        head = ', '.join(items[:shown])
        return f"{head}..." if len(items) > shown else head

    console.print("[bold]Current Configuration:[/bold]")
    # Tokens are masked: only whether one is set is revealed.
    console.print(f" GitHub Token: {'***' if config.github.token else 'Not set'}")
    console.print(f" GitLab Token: {'***' if config.gitlab.token else 'Not set'}")
    console.print(f" GitLab URL: {config.gitlab.base_url}")
    console.print(f" Cache TTL: {config.github.cache_ttl}s")
    console.print(f" Min Stars: {config.filter.min_stars}")
    console.print(f" Keywords: {_preview(config.filter.keywords)}")
    console.print(f" Topics: {_preview(config.filter.topics)}")
    console.print(f" Max Results: {config.scrape.max_results}")
    console.print(f" Database: {config.database.path}")
    console.print(f" Scheduler: {'enabled' if config.scheduler.enabled else 'disabled'}")
460
+
461
+
462
+ cli.add_command(config_cmd, name="config")
463
+
464
+
465
@cli.group()
def db_cmd():
    """Database management."""
    # Container group only; the subcommands are attached via @db_cmd.command.
469
+
470
+
471
@db_cmd.command("stats")
@click.pass_context
def db_stats(ctx: click.Context):
    """Show database statistics."""
    config: Config = ctx.obj["config"]
    db_path = Path(config.database.path)

    # Test for the file before constructing Database (as keywords_extract
    # does), so a missing database is reported without opening a handle.
    if not db_path.exists():
        console.print("[yellow]No database found. Run 'ai-scraper scrape' first.[/yellow]")
        return

    db = Database(db_path)
    try:
        db.init_db()
        stats = db.get_stats()

        console.print("[bold]Database Statistics:[/bold]")
        console.print(f" Repository count: {stats['repository_count']}")
        console.print(f" Snapshot count: {stats['snapshot_count']}")
        console.print(f" Total stars: {stats['total_stars']:,}")
    finally:
        # Release the handle even if the query or formatting fails.
        db.close()
491
+
492
+
493
@db_cmd.command("clean")
@click.option("--days", type=int, default=30, help="Keep snapshots from last N days")
@click.option("--invalid", is_flag=True, help="Remove repos with invalid data")
@click.option("--vacuum", is_flag=True, help="Optimize database size")
@click.pass_context
def db_clean(ctx: click.Context, days: int, invalid: bool, vacuum: bool):
    """Clean and optimize database.

    Examples:
        ai-scraper db clean --days 30 # Clean old snapshots
        ai-scraper db clean --invalid # Remove invalid repos
        ai-scraper db clean --vacuum # Optimize database
        ai-scraper db clean --invalid --vacuum # Both
    """
    config: Config = ctx.obj["config"]
    db_path = Path(config.database.path)

    # Test for the file before constructing Database (as keywords_extract
    # does), so a missing database is reported without opening a handle.
    if not db_path.exists():
        console.print("[yellow]No database found.[/yellow]")
        return

    db = Database(db_path)
    try:
        db.init_db()

        # Clean old snapshots (default behavior). This only runs when neither
        # --invalid nor --vacuum is given, so --days has no effect alongside
        # those flags.
        if not invalid and not vacuum:
            deleted = db.clean_old_snapshots(days=days)
            console.print(f"[green]Deleted {deleted} old snapshots[/green]")

        # Remove invalid repos
        if invalid:
            removed = db.clean_invalid_repos()
            console.print(f"[green]Removed {removed} invalid repositories[/green]")

        # Optimize database
        if vacuum:
            db.vacuum()
            console.print("[green]Database optimized[/green]")
    finally:
        # Release the handle even if one of the maintenance steps fails.
        db.close()
532
+
533
+
534
@db_cmd.command("export")
@click.option("--format", "-f", type=click.Choice(["csv", "json", "html", "markdown"]), default="csv")
@click.option("--output", "-o", type=click.Path(), default="export.csv")
@click.pass_context
def db_export(ctx: click.Context, format: str, output: str):
    """Export database to file."""
    # NOTE: --output defaults to "export.csv" whatever --format is, so callers
    # exporting another format should pass --output explicitly.
    config: Config = ctx.obj["config"]
    db_path = Path(config.database.path)

    # Test for the file before constructing Database (as keywords_extract
    # does), so a missing database is reported without opening a handle.
    if not db_path.exists():
        console.print("[yellow]No database found.[/yellow]")
        return

    db = Database(db_path)
    try:
        db.init_db()
        repos = db.get_all_repositories(limit=10000)

        if format == "csv":
            import csv

            # newline="" lets the csv module control line endings itself.
            with open(output, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(["name", "stars", "language", "description", "url"])

                for repo in repos:
                    writer.writerow([
                        repo.name,
                        repo.stars,
                        repo.language or "",
                        repo.description or "",
                        repo.url,
                    ])

            console.print(f"[green]Exported {len(repos)} repositories to {output}[/green]")

        elif format == "json":
            import json

            data = {
                "repositories": [
                    {
                        "name": r.name,
                        "stars": r.stars,
                        "language": r.language,
                        "description": r.description,
                        "url": r.url,
                    }
                    for r in repos
                ],
                "total": len(repos),
            }

            with open(output, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)

            console.print(f"[green]Exported {len(repos)} repositories to {output}[/green]")

        elif format == "html":
            from ai_scraper.output.html import HTMLExporter
            # The exporter receives the configured output directory plus
            # `output` as the file name, and returns the path it wrote.
            exporter = HTMLExporter(Path(config.output.dir), filename=output)
            path = exporter.export_repositories(repos)
            console.print(f"[green]Exported {len(repos)} repositories to {path}[/green]")

        elif format == "markdown":
            from ai_scraper.output.markdown import MarkdownExporter
            exporter = MarkdownExporter(Path(config.output.dir), filename=output)
            path = exporter.export_repositories(repos)
            console.print(f"[green]Exported {len(repos)} repositories to {path}[/green]")
    finally:
        # Release the handle even if writing the export fails.
        db.close()
603
+
604
+
605
+ cli.add_command(db_cmd, name="db")
606
+
607
+
608
@cli.group()
def keywords_cmd():
    """Keywords management."""
    # Container group only; the subcommands are attached via @keywords_cmd.command.
612
+
613
+
614
@keywords_cmd.command("list")
@click.pass_context
def keywords_list(ctx: click.Context):
    """List all keywords."""
    settings: Config = ctx.obj["config"]
    found = KeywordExtractor(Path(settings.keywords.file)).get_keywords_for_search()

    if not found:
        console.print("[yellow]No keywords found.[/yellow]")
        return

    # Header with the count, then one keyword per line in sorted order.
    console.print(f"[bold]Keywords ({len(found)}):[/bold]")
    for keyword in sorted(found):
        console.print(f" {keyword}")
627
+
628
+
629
@keywords_cmd.command("extract")
@click.pass_context
def keywords_extract(ctx: click.Context):
    """Manually extract keywords from existing database."""
    config: Config = ctx.obj["config"]
    db_path = Path(config.database.path)
    if not db_path.exists():
        console.print("[yellow]No database found. Run 'ai-scraper scrape' first.[/yellow]")
        return

    # The database is only needed to read the repositories; close it right
    # after the read, including when the read fails.
    db = Database(db_path)
    try:
        db.init_db()
        repos = db.get_all_repositories(limit=10000)
    finally:
        db.close()

    if not repos:
        console.print("[yellow]No repositories in database.[/yellow]")
        return

    # Merge freshly extracted keywords into the stored set and persist it.
    extractor = KeywordExtractor(Path(config.keywords.file), max_keywords=config.keywords.max_keywords)
    existing = extractor.load_keywords()
    new = extractor.extract_from_repos(repos)
    merged = extractor.merge_keywords(existing, new)
    extractor.save_keywords(merged)
    # NOTE(review): len(new) counts everything extracted in this run,
    # presumably including keywords that were already stored - confirm.
    console.print(f"[green]Extracted {len(new)} new keywords[/green]")
    console.print(f"[green]Total: {len(merged)} keywords[/green]")
651
+
652
+
653
@keywords_cmd.command("clear")
@click.pass_context
def keywords_clear(ctx: click.Context):
    """Clear all keywords."""
    settings: Config = ctx.obj["config"]
    keyword_file = Path(settings.keywords.file)
    # Saving an empty set replaces whatever keywords were stored before.
    KeywordExtractor(keyword_file).save_keywords(set())
    console.print("[green]Keywords cleared.[/green]")
661
+
662
+
663
+ cli.add_command(keywords_cmd, name="keywords")
664
+
665
+
666
@cli.command()
@click.option("--host", default="0.0.0.0", help="Server host")
@click.option("--port", default=8080, help="Server port")
@click.pass_context
def serve(ctx: click.Context, host: str, port: int):
    """Start REST API server."""
    # Imported lazily so the server module is only loaded for this command.
    from ai_scraper.api_server import run_server

    # NOTE(review): the default host 0.0.0.0 presumably makes the server
    # listen on every network interface - confirm that is intended.
    address = f"http://{host}:{port}"
    console.print(f"[bold green]Starting API server at {address}[/bold green]")
    run_server(host=host, port=port)
675
+
676
+
677
@cli.command()
@click.option("--cron", default="0 9 * * *", help="Cron expression for schedule")
@click.option("--max-results", default=100, help="Max results per scrape")
@click.pass_context
def schedule(ctx: click.Context, cron: str, max_results: int):
    """Schedule periodic scraping.

    Example cron expressions:
        "0 9 * * *" - Daily at 9:00 AM
        "0 */6 * * *" - Every 6 hours
        "0 9 * * 1-5" - Weekdays at 9:00 AM
    """
    from concurrent.futures import ThreadPoolExecutor

    from ai_scraper.scheduler import scheduler

    def run_scrape():
        # `scrape` starts its own event loop with asyncio.run(), which raises
        # RuntimeError when a loop is already running in the calling thread.
        # Running it in a worker thread gives it a loop-free thread whether or
        # not the scheduler calls this task from inside its own loop.
        console.print(f"[dim]{datetime.now()}: Starting scheduled scrape[/dim]")
        with ThreadPoolExecutor(max_workers=1) as pool:
            # result() waits for the scrape and re-raises anything it raised,
            # so failures still reach the scheduler.
            pool.submit(ctx.invoke, scrape, max_results=max_results).result()

    scheduler.add_task("scrape", cron, run_scrape)

    console.print("[bold green]Scheduler started[/bold green]")
    console.print(f"[dim]Next run: {scheduler.tasks['scrape']['next_run']}[/dim]")

    asyncio.run(scheduler.run())
701
+
702
+
703
@cli.command()
@click.pass_context
def interactive(ctx: click.Context):
    """Start interactive mode with menu-driven interface."""
    from ai_scraper.interactive import show_main_menu, get_scrape_params
    from rich.prompt import Prompt

    def _export():
        # Ask for the format, then name the output file after it.
        format_choice = Prompt.ask("Export format", choices=["csv", "json", "html"], default="csv")
        ctx.invoke(db_export, format=format_choice, output=f"export.{format_choice}")

    # Menu key -> action. Each entry is lazy so prompts and parameter
    # collection only run once their entry has been picked.
    actions = {
        "1": lambda: ctx.invoke(scrape, max_results=50),            # Quick scrape
        "2": lambda: ctx.invoke(scrape, max_results=500),           # Deep scrape
        "3": lambda: ctx.invoke(scrape, **get_scrape_params()),     # Custom scrape
        "4": lambda: ctx.invoke(list_repos),                        # View results
        "5": lambda: ctx.invoke(trending),                          # Trending
        "6": _export,                                               # Export
        "7": lambda: ctx.invoke(config_show),                       # Settings
    }

    while True:
        choice = show_main_menu()

        if choice == "q":
            console.print("\n[cyan]Goodbye![/cyan]")
            return

        # Unknown choices are ignored and the menu is shown again.
        action = actions.get(choice)
        if action is not None:
            action()
739
+
740
+
741
def main():
    """Main entry point."""
    # Hands control to the click root group, which parses the command line
    # arguments and dispatches to the selected subcommand.
    cli()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()