github-ai-scraper 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. github_ai_scraper-0.1.2/.dockerignore +17 -0
  2. github_ai_scraper-0.1.2/.github/workflows/ci.yml +83 -0
  3. github_ai_scraper-0.1.2/.github/workflows/release.yml +34 -0
  4. github_ai_scraper-0.1.2/.gitignore +59 -0
  5. github_ai_scraper-0.1.2/CLAUDE.md +108 -0
  6. github_ai_scraper-0.1.2/Dockerfile +42 -0
  7. github_ai_scraper-0.1.2/PKG-INFO +299 -0
  8. github_ai_scraper-0.1.2/README.md +257 -0
  9. github_ai_scraper-0.1.2/README_CN.md +260 -0
  10. github_ai_scraper-0.1.2/RELEASE_NOTES.md +52 -0
  11. github_ai_scraper-0.1.2/ai-scraper.yaml +83 -0
  12. github_ai_scraper-0.1.2/ai-security-config.yaml +88 -0
  13. github_ai_scraper-0.1.2/cmds/scheduler/limiter.go +98 -0
  14. github_ai_scraper-0.1.2/cmds/scheduler/main.go +60 -0
  15. github_ai_scraper-0.1.2/cmds/scheduler/processor.go +184 -0
  16. github_ai_scraper-0.1.2/cmds/scheduler/scheduler.go +240 -0
  17. github_ai_scraper-0.1.2/data/.gitkeep +1 -0
  18. github_ai_scraper-0.1.2/docker-compose.yml +27 -0
  19. github_ai_scraper-0.1.2/docs/PROGRESS.md +101 -0
  20. github_ai_scraper-0.1.2/docs/promotion/chinese-article-outline.md +82 -0
  21. github_ai_scraper-0.1.2/docs/promotion/english-article-outline.md +82 -0
  22. github_ai_scraper-0.1.2/docs/superpowers/plans/2026-05-09-github-ai-scraper.md +3251 -0
  23. github_ai_scraper-0.1.2/docs/superpowers/plans/2026-05-09-keywords-and-output.md +1029 -0
  24. github_ai_scraper-0.1.2/docs/superpowers/plans/2026-05-11-iteration-optimization-v2.md +197 -0
  25. github_ai_scraper-0.1.2/docs/superpowers/plans/2026-05-11-iteration-optimization.md +133 -0
  26. github_ai_scraper-0.1.2/docs/superpowers/plans/2026-05-14-promotion-plan.md +660 -0
  27. github_ai_scraper-0.1.2/docs/superpowers/specs/2026-05-09-github-ai-scraper-design.md +491 -0
  28. github_ai_scraper-0.1.2/docs/superpowers/specs/2026-05-09-keywords-and-output-design.md +246 -0
  29. github_ai_scraper-0.1.2/docs/superpowers/specs/2026-05-14-promotion-plan-design.md +216 -0
  30. github_ai_scraper-0.1.2/go.mod +5 -0
  31. github_ai_scraper-0.1.2/output/repositories.md +4981 -0
  32. github_ai_scraper-0.1.2/plugins/README.md +51 -0
  33. github_ai_scraper-0.1.2/plugins/example_plugin.py +35 -0
  34. github_ai_scraper-0.1.2/pyproject.toml +75 -0
  35. github_ai_scraper-0.1.2/scraped_repos.json +54 -0
  36. github_ai_scraper-0.1.2/src/ai_scraper/__init__.py +3 -0
  37. github_ai_scraper-0.1.2/src/ai_scraper/api/__init__.py +6 -0
  38. github_ai_scraper-0.1.2/src/ai_scraper/api/github.py +340 -0
  39. github_ai_scraper-0.1.2/src/ai_scraper/api/gitlab.py +418 -0
  40. github_ai_scraper-0.1.2/src/ai_scraper/api/rate_limiter.py +120 -0
  41. github_ai_scraper-0.1.2/src/ai_scraper/api_server.py +196 -0
  42. github_ai_scraper-0.1.2/src/ai_scraper/auth.py +68 -0
  43. github_ai_scraper-0.1.2/src/ai_scraper/backup.py +112 -0
  44. github_ai_scraper-0.1.2/src/ai_scraper/cache.py +95 -0
  45. github_ai_scraper-0.1.2/src/ai_scraper/classifier.py +135 -0
  46. github_ai_scraper-0.1.2/src/ai_scraper/cli.py +747 -0
  47. github_ai_scraper-0.1.2/src/ai_scraper/config.py +237 -0
  48. github_ai_scraper-0.1.2/src/ai_scraper/config_watcher.py +82 -0
  49. github_ai_scraper-0.1.2/src/ai_scraper/dedup.py +148 -0
  50. github_ai_scraper-0.1.2/src/ai_scraper/filters/__init__.py +5 -0
  51. github_ai_scraper-0.1.2/src/ai_scraper/filters/ai_filter.py +93 -0
  52. github_ai_scraper-0.1.2/src/ai_scraper/health.py +155 -0
  53. github_ai_scraper-0.1.2/src/ai_scraper/i18n.py +141 -0
  54. github_ai_scraper-0.1.2/src/ai_scraper/interactive.py +96 -0
  55. github_ai_scraper-0.1.2/src/ai_scraper/keywords/__init__.py +5 -0
  56. github_ai_scraper-0.1.2/src/ai_scraper/keywords/extractor.py +274 -0
  57. github_ai_scraper-0.1.2/src/ai_scraper/logging_config.py +74 -0
  58. github_ai_scraper-0.1.2/src/ai_scraper/models/__init__.py +5 -0
  59. github_ai_scraper-0.1.2/src/ai_scraper/models/repository.py +72 -0
  60. github_ai_scraper-0.1.2/src/ai_scraper/output/__init__.py +6 -0
  61. github_ai_scraper-0.1.2/src/ai_scraper/output/excel.py +79 -0
  62. github_ai_scraper-0.1.2/src/ai_scraper/output/html.py +152 -0
  63. github_ai_scraper-0.1.2/src/ai_scraper/output/markdown.py +338 -0
  64. github_ai_scraper-0.1.2/src/ai_scraper/output/rss.py +82 -0
  65. github_ai_scraper-0.1.2/src/ai_scraper/output/translator.py +303 -0
  66. github_ai_scraper-0.1.2/src/ai_scraper/plugin_system.py +146 -0
  67. github_ai_scraper-0.1.2/src/ai_scraper/plugins/__init__.py +5 -0
  68. github_ai_scraper-0.1.2/src/ai_scraper/retry.py +134 -0
  69. github_ai_scraper-0.1.2/src/ai_scraper/scheduler.py +84 -0
  70. github_ai_scraper-0.1.2/src/ai_scraper/scrape_progress.py +99 -0
  71. github_ai_scraper-0.1.2/src/ai_scraper/secure_storage.py +127 -0
  72. github_ai_scraper-0.1.2/src/ai_scraper/storage/__init__.py +5 -0
  73. github_ai_scraper-0.1.2/src/ai_scraper/storage/async_database.py +237 -0
  74. github_ai_scraper-0.1.2/src/ai_scraper/storage/database.py +456 -0
  75. github_ai_scraper-0.1.2/src/ai_scraper/webhooks.py +95 -0
  76. github_ai_scraper-0.1.2/test_export.json +54 -0
  77. github_ai_scraper-0.1.2/test_output.txt +0 -0
  78. github_ai_scraper-0.1.2/tests/__init__.py +1 -0
  79. github_ai_scraper-0.1.2/tests/test_api_server_performance.py +70 -0
  80. github_ai_scraper-0.1.2/tests/test_async_database.py +188 -0
  81. github_ai_scraper-0.1.2/tests/test_auth.py +82 -0
  82. github_ai_scraper-0.1.2/tests/test_backup.py +164 -0
  83. github_ai_scraper-0.1.2/tests/test_cache.py +226 -0
  84. github_ai_scraper-0.1.2/tests/test_classifier.py +273 -0
  85. github_ai_scraper-0.1.2/tests/test_classifier_enhanced.py +140 -0
  86. github_ai_scraper-0.1.2/tests/test_cli.py +186 -0
  87. github_ai_scraper-0.1.2/tests/test_concurrent_scrape.py +142 -0
  88. github_ai_scraper-0.1.2/tests/test_config.py +75 -0
  89. github_ai_scraper-0.1.2/tests/test_config_watcher.py +99 -0
  90. github_ai_scraper-0.1.2/tests/test_connection_pool.py +72 -0
  91. github_ai_scraper-0.1.2/tests/test_database.py +240 -0
  92. github_ai_scraper-0.1.2/tests/test_db_performance.py +217 -0
  93. github_ai_scraper-0.1.2/tests/test_dedup.py +183 -0
  94. github_ai_scraper-0.1.2/tests/test_dedup_enhanced.py +162 -0
  95. github_ai_scraper-0.1.2/tests/test_example_plugin.py +98 -0
  96. github_ai_scraper-0.1.2/tests/test_export_formats.py +133 -0
  97. github_ai_scraper-0.1.2/tests/test_filter.py +104 -0
  98. github_ai_scraper-0.1.2/tests/test_github.py +313 -0
  99. github_ai_scraper-0.1.2/tests/test_health.py +197 -0
  100. github_ai_scraper-0.1.2/tests/test_i18n.py +87 -0
  101. github_ai_scraper-0.1.2/tests/test_incremental.py +278 -0
  102. github_ai_scraper-0.1.2/tests/test_integration.py +173 -0
  103. github_ai_scraper-0.1.2/tests/test_interactive.py +218 -0
  104. github_ai_scraper-0.1.2/tests/test_keywords.py +545 -0
  105. github_ai_scraper-0.1.2/tests/test_models.py +85 -0
  106. github_ai_scraper-0.1.2/tests/test_output.py +199 -0
  107. github_ai_scraper-0.1.2/tests/test_output_html.py +429 -0
  108. github_ai_scraper-0.1.2/tests/test_rate_limiter_advanced.py +325 -0
  109. github_ai_scraper-0.1.2/tests/test_retry.py +132 -0
  110. github_ai_scraper-0.1.2/tests/test_scrape_progress.py +131 -0
  111. github_ai_scraper-0.1.2/tests/test_secure_storage.py +76 -0
  112. github_ai_scraper-0.1.2/tests/test_validation.py +521 -0
@@ -0,0 +1,17 @@
1
+ .git
2
+ .gitignore
3
+ __pycache__
4
+ *.pyc
5
+ *.pyo
6
+ *.egg-info
7
+ .pytest_cache
8
+ .coverage
9
+ htmlcov
10
+ .venv
11
+ venv
12
+ .env
13
+ *.db
14
+ *.json
15
+ data/
16
+ output/
17
+ .cache/
@@ -0,0 +1,83 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [master, main]
6
+ pull_request:
7
+ branches: [master, main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ['3.10', '3.11', '3.12']
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install -e ".[dev]"
28
+
29
+ - name: Run tests
30
+ run: pytest tests/ -v --cov=src/ai_scraper --cov-report=xml
31
+
32
+ - name: Upload coverage
33
+ uses: codecov/codecov-action@v4
34
+ with:
35
+ files: ./coverage.xml
36
+ env:
37
+ CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
38
+
39
+ lint:
40
+ runs-on: ubuntu-latest
41
+
42
+ steps:
43
+ - uses: actions/checkout@v4
44
+
45
+ - name: Set up Python
46
+ uses: actions/setup-python@v5
47
+ with:
48
+ python-version: '3.11'
49
+
50
+ - name: Install linters
51
+ run: pip install ruff mypy
52
+
53
+ - name: Run Ruff
54
+ run: ruff check src/ tests/
55
+
56
+ - name: Run MyPy
57
+ run: mypy src/ --ignore-missing-imports
58
+
59
+ build:
60
+ runs-on: ubuntu-latest
61
+ needs: [test, lint]
62
+
63
+ steps:
64
+ - uses: actions/checkout@v4
65
+
66
+ - name: Set up Python
67
+ uses: actions/setup-python@v5
68
+ with:
69
+ python-version: '3.11'
70
+
71
+ - name: Install build tools
72
+ run: |
73
+ python -m pip install --upgrade pip
74
+ pip install build
75
+
76
+ - name: Build package
77
+ run: python -m build
78
+
79
+ - name: Upload artifacts
80
+ uses: actions/upload-artifact@v4
81
+ with:
82
+ name: dist
83
+ path: dist/
@@ -0,0 +1,34 @@
1
+ name: Release
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ publish:
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Set up Python
15
+ uses: actions/setup-python@v5
16
+ with:
17
+ python-version: '3.11'
18
+
19
+ - name: Install build tools
20
+ run: |
21
+ python -m pip install --upgrade pip
22
+ pip install build twine
23
+
24
+ - name: Build package
25
+ run: python -m build
26
+
27
+ - name: Check package metadata
28
+ run: twine check dist/*
29
+
30
+ - name: Publish to PyPI
31
+ env:
32
+ TWINE_USERNAME: __token__
33
+ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
34
+ run: twine upload --verbose dist/*
@@ -0,0 +1,59 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ venv/
25
+ ENV/
26
+ env/
27
+ .venv/
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # Testing
37
+ .pytest_cache/
38
+ .coverage
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+
43
+ # Project specific
44
+ data/*.db
45
+ data/*.sqlite
46
+ output.txt
47
+ .claude/
48
+
49
+ # Go
50
+ cmds/scheduler/scheduler
51
+ cmds/scheduler/scheduler.exe
52
+
53
+ # Generated files
54
+ keywords.txt
55
+ output/*.json
56
+ output/*.csv
57
+ output/*.html
58
+ output/*.db
59
+ output/*.sqlite
@@ -0,0 +1,108 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Language Requirement
6
+
7
+ **任何情况下,都使用中文进行交流和回复。** 无论用户使用何种语言提问,都必须使用中文回答。
8
+
9
+ ## Project Overview
10
+
11
+ GitHub/GitLab AI Scraper is a CLI tool for scraping AI-related high-star repositories from GitHub and GitLab. It supports multi-platform scraping, dynamic keyword extraction, multiple export formats, and a REST API server.
12
+
13
+ ## Common Commands
14
+
15
+ ### Development Setup
16
+ ```bash
17
+ pip install -e ".[dev]" # Install with dev dependencies
18
+ ```
19
+
20
+ ### Testing
21
+ ```bash
22
+ pytest tests/ -v # Run all tests
23
+ pytest tests/test_models.py -v # Run single test file
24
+ pytest -k "repository" -v # Run tests matching pattern
25
+ ```
26
+
27
+ ### CLI Usage
28
+ ```bash
29
+ ai-scraper scrape # Scrape from GitHub (default)
30
+ ai-scraper scrape --platform gitlab # Scrape from GitLab
31
+ ai-scraper scrape --gitlab-url https://your-gitlab.com/api/v4 # Self-hosted GitLab
32
+ ai-scraper scrape --incremental # Only updated repos
33
+ ai-scraper scrape --since 7d # Repos updated in last 7 days
34
+ ai-scraper scrape --progress # Show progress bar
35
+ ai-scraper list # List scraped repos
36
+ ai-scraper trending # Show trending repos
37
+ ai-scraper serve --port 8080 # Start REST API server
38
+ ai-scraper keywords list # List keywords
39
+ ai-scraper db export --format markdown --output repositories.md
40
+ ai-scraper db clean --invalid # Remove invalid repos
41
+ ai-scraper db clean --vacuum # Optimize database
42
+ ai-scraper config show # Show current config
43
+ ```
44
+
45
+ ### Environment Variables
46
+ ```bash
47
+ export GITHUB_TOKEN=your_token # Increases rate limit to 5000/hour
48
+ export GITLAB_TOKEN=your_token # For GitLab scraping
49
+ ```
50
+
51
+ ## Architecture
52
+
53
+ ### Core Data Flow
54
+ 1. **API Clients** (`src/ai_scraper/api/`) - Async HTTP clients for GitHub/GitLab with rate limiting and caching
55
+ 2. **Filter** (`src/ai_scraper/filters/ai_filter.py`) - Determines if repos are AI-related and scores relevance
56
+ 3. **Classifier** (`src/ai_scraper/classifier.py`) - Categorizes repos into AI subdomains (LLM, CV, NLP, MLOps, etc.)
57
+ 4. **Database** (`src/ai_scraper/storage/database.py`) - SQLite storage with snapshot-based trend tracking
58
+ 5. **Output** (`src/ai_scraper/output/`) - Exporters for Markdown, HTML, Excel, RSS formats
59
+
60
+ ### Key Components
61
+
62
+ **Repository Model** (`src/ai_scraper/models/repository.py`)
63
+ - Pydantic models: `Repository`, `RepoSnapshot`, `FilterConfig`, `ScrapeConfig`
64
+ - URL pattern supports both GitHub and GitLab: `^https?://[\w\.-]+/[\w\-\.]+/[\w\-\.]+`
65
+
66
+ **API Clients** (`src/ai_scraper/api/`)
67
+ - `GitHubClient` and `GitLabClient` share similar async patterns
68
+ - Both use `RateLimiter` (token bucket) and optional `RequestCache`
69
+ - Connection pooling via `aiohttp.TCPConnector`
70
+
71
+ **Keyword Extraction** (`src/ai_scraper/keywords/extractor.py`)
72
+ - Extracts keywords from repo topics, descriptions, and names
73
+ - Filters stopwords, numeric/path/file-like noise, and low-quality keywords while preserving short AI terms like `ai` and `ml`
74
+ - Merges with existing keywords respecting `max_keywords` limit
75
+
76
+ **Configuration** (`src/ai_scraper/config.py`)
77
+ - YAML-based config loaded from `ai-scraper.yaml`
78
+ - Environment variable substitution: `${GITHUB_TOKEN}` syntax
79
+ - Separate configs for GitHub, GitLab, filter, scrape, database, keywords, output, webhooks
80
+
81
+ ### CLI Structure (`src/ai_scraper/cli.py`)
82
+ - Uses Click with grouped commands: `scrape`, `list`, `trending`, `serve`, `schedule`, `interactive`
83
+ - Sub-groups: `config`, `db`, `keywords`
84
+ - Progress bar via Rich library
85
+ - Windows UTF-8 handling with `clean_text()` function
86
+
87
+ ## Platform Support
88
+
89
+ Both GitHub and GitLab are supported with platform-specific clients:
90
+ - GitHub: `GitHubClient` with search query syntax `stars:>100 topic:ai pushed:>YYYY-MM-DD`
91
+ - GitLab: `GitLabClient` with simpler search, configurable `base_url` for self-hosted instances
92
+
93
+ ## Export Formats
94
+
95
+ Located in `src/ai_scraper/output/`:
96
+ - `markdown.py` - Markdown with Chinese translation, category grouping, language icons
97
+ - `html.py` - HTML with responsive styling
98
+ - `excel.py` - Excel workbook format
99
+ - `rss.py` - RSS feed format
100
+ - `translator.py` - Description translation support
101
+
102
+ ## Database Schema
103
+
104
+ SQLite tables in `data/ai_scraper.db`:
105
+ - `repositories` - Main repo data with relevance scores, timestamps
106
+ - `snapshots` - Star count snapshots for trend analysis
107
+
108
+ Indexes on: stars, last_updated_at, language, created_at, relevance_score, snapshot_at
@@ -0,0 +1,42 @@
1
+ # Build stage for Go scheduler (optional)
2
+ FROM golang:1.21-alpine AS go-builder
3
+
4
+ WORKDIR /build
5
+ COPY go.mod ./
6
+ RUN if [ -f go.mod ]; then go mod download; fi
7
+
8
+ COPY cmds/scheduler/ ./
9
+ RUN if [ -f main.go ]; then CGO_ENABLED=0 go build -o scheduler .; fi
10
+
11
+ # Main stage
12
+ FROM python:3.11-slim
13
+
14
+ # Install SQLite
15
+ RUN apt-get update && apt-get install -y \
16
+ sqlite3 \
17
+ && rm -rf /var/lib/apt/lists/*
18
+
19
+ WORKDIR /app
20
+
21
+ # Install Python dependencies
22
+ COPY pyproject.toml ./
23
+ RUN pip install --no-cache-dir -e .
24
+
25
+ # Copy application code
26
+ COPY src/ ./src/
27
+ COPY ai-scraper.yaml ./
28
+
29
+ # Copy Go scheduler binary built in the go-builder stage
30
+ COPY --from=go-builder /build/scheduler /usr/local/bin/scheduler
31
+
32
+ # Create data and output directories
33
+ RUN mkdir -p /app/data /app/output /app/.cache
34
+
35
+ # Set environment variables
36
+ ENV PYTHONUNBUFFERED=1
37
+ ENV AI_SCRAPER_DATA_DIR=/app/data
38
+ ENV AI_SCRAPER_OUTPUT_DIR=/app/output
39
+
40
+ # Entry point
41
+ ENTRYPOINT ["python", "-m", "ai_scraper.cli"]
42
+ CMD ["--help"]
@@ -0,0 +1,299 @@
1
+ Metadata-Version: 2.4
2
+ Name: github-ai-scraper
3
+ Version: 0.1.2
4
+ Summary: A CLI tool for discovering and scraping AI-related high-star repositories from GitHub and GitLab
5
+ Project-URL: Homepage, https://github.com/lwx66615/github-ai-scraper
6
+ Project-URL: Repository, https://github.com/lwx66615/github-ai-scraper
7
+ Project-URL: Documentation, https://github.com/lwx66615/github-ai-scraper#readme
8
+ Project-URL: Issues, https://github.com/lwx66615/github-ai-scraper/issues
9
+ Project-URL: Changelog, https://github.com/lwx66615/github-ai-scraper/releases
10
+ Author: lwx66615
11
+ License-Expression: MIT
12
+ Keywords: ai,cli,github,gitlab,machine-learning,repositories,scraper,trending
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: License :: OSI Approved :: MIT License
18
+ Classifier: Operating System :: OS Independent
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Software Development
25
+ Classifier: Topic :: Utilities
26
+ Classifier: Typing :: Typed
27
+ Requires-Python: >=3.10
28
+ Requires-Dist: aiohttp>=3.9.0
29
+ Requires-Dist: aiosqlite>=0.19.0
30
+ Requires-Dist: click>=8.1.0
31
+ Requires-Dist: croniter>=2.0.0
32
+ Requires-Dist: fastapi>=0.109.0
33
+ Requires-Dist: pydantic>=2.0.0
34
+ Requires-Dist: pyyaml>=6.0
35
+ Requires-Dist: rich>=13.0.0
36
+ Requires-Dist: uvicorn>=0.27.0
37
+ Provides-Extra: dev
38
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
39
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
40
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
41
+ Description-Content-Type: text/markdown
42
+
43
+ # GitHub/GitLab AI Scraper
44
+
45
+ English | [简体中文](README_CN.md)
46
+
47
+ A CLI tool for scraping AI-related high-star repositories from GitHub and GitLab.
48
+
49
+ ## Features
50
+
51
+ - **Multi-platform support** - Scrape from GitHub or GitLab (including self-hosted instances)
52
+ - Search and filter AI-related repositories by keywords and topics
53
+ - **Dynamic keyword extraction** - Automatically learns new keywords from scraped repos
54
+ - **Markdown/HTML/Excel/RSS report generation** - Multiple export formats with Chinese translation
55
+ - **Incremental scraping** - Fetch only updated repos with `--since` flag
56
+ - **Resume support** - Continue interrupted scrapes with progress tracking
57
+ - **Progress bar display** - Visual progress during scraping
58
+ - **Interactive CLI mode** - Menu-driven interface for easy use
59
+ - **Concurrent scraping** - Parallel requests for faster results
60
+ - **Multi-language search** - Support for Chinese and English keywords
61
+ - Local SQLite storage with trend analysis
62
+ - Configurable filtering and scraping options
63
+ - Rate limiting with GitHub/GitLab API token support
64
+ - Export to CSV/JSON/HTML/Excel/RSS/Markdown formats
65
+ - **REST API server** - Access data via HTTP endpoints with optional authentication
66
+ - **Scheduled scraping** - Cron-based periodic scraping
67
+ - **Webhook notifications** - Notify external services on events
68
+ - **Plugin system** - Extend functionality with custom plugins
69
+ - **Repository health assessment** - Activity, popularity, maintenance scores
70
+ - **Intelligent classification** - LLM, CV, NLP, MLOps, AI Infrastructure categories
71
+ - **Deduplication** - Fork and mirror detection, content similarity
72
+ - **Secure token storage** - Encrypted storage for sensitive tokens
73
+ - **Database backup** - Automatic backup and restore functionality
74
+ - **Error recovery** - Retry logic with exponential backoff
75
+
76
+ ## Installation
77
+
78
+ ```bash
79
+ # Install from PyPI
80
+ pip install github-ai-scraper
81
+
82
+ # Or install from source for development
83
+ pip install -e ".[dev]"
84
+ ```
85
+
86
+ ## Quick Start
87
+
88
+ ```bash
89
+ # Set your GitHub token (optional, increases rate limit)
90
+ export GITHUB_TOKEN=your_token_here
91
+
92
+ # Scrape AI repositories from GitHub (default)
93
+ ai-scraper scrape
94
+
95
+ # Scrape from GitLab
96
+ ai-scraper scrape --platform gitlab
97
+
98
+ # Scrape from self-hosted GitLab
99
+ ai-scraper scrape --platform gitlab --gitlab-url https://your-gitlab.com/api/v4
100
+
101
+ # Scrape with progress bar
102
+ ai-scraper scrape --progress
103
+
104
+ # Concurrent scraping (faster)
105
+ ai-scraper scrape --concurrent
106
+
107
+ # Incremental scraping (repos updated in last 7 days)
108
+ ai-scraper scrape --incremental
109
+ ai-scraper scrape --since 7d
110
+
111
+ # Resume interrupted scrape
112
+ ai-scraper scrape --resume
113
+
114
+ # Interactive mode
115
+ ai-scraper interactive
116
+
117
+ # List scraped repositories
118
+ ai-scraper list
119
+
120
+ # Show trending repositories
121
+ ai-scraper trending
122
+
123
+ # Export data
124
+ ai-scraper db export --format html --output index.html
125
+ ai-scraper db export --format xlsx --output repos.xlsx
126
+ ai-scraper db export --format rss --output feed.xml
127
+ ai-scraper db export --format markdown --output repositories.md
128
+
129
+ # Start REST API server (with authentication)
130
+ ai-scraper serve --port 8080 --auth
131
+
132
+ # Schedule periodic scraping (daily at 9am)
133
+ ai-scraper schedule --cron "0 9 * * *"
134
+
135
+ # Backup database
136
+ ai-scraper db backup
137
+ ai-scraper db restore backup_file.db.gz
138
+ ```
139
+
140
+ ## Configuration
141
+
142
+ Create `ai-scraper.yaml` to customize:
143
+
144
+ ```yaml
145
+ github:
146
+ token: ${GITHUB_TOKEN}
147
+ cache_ttl: 3600
148
+
149
+ gitlab:
150
+ token: ${GITLAB_TOKEN} # Optional, for GitLab scraping
151
+ base_url: https://gitlab.com/api/v4 # Or your self-hosted GitLab URL
152
+ cache_ttl: 3600
153
+
154
+ filter:
155
+ min_stars: 100
156
+ keywords:
157
+ - ai
158
+ - machine-learning
159
+ - 人工智能 # Chinese keyword support
160
+ topics:
161
+ - ai
162
+ - deep-learning
163
+
164
+ scrape:
165
+ max_results: 500
166
+ concurrency: 5
167
+ concurrent_requests: 5
168
+
169
+ database:
170
+ path: ./data/ai_scraper.db
171
+ backup_dir: ./backups
172
+ max_backups: 10
173
+
174
+ api:
175
+ auth_enabled: true
176
+ api_keys:
177
+ - as_your_api_key_here
178
+
179
+ webhooks:
180
+ enabled: false
181
+ endpoints:
182
+ - url: https://hooks.slack.com/services/YOUR/WEBHOOK/URL
183
+ events: [scrape_complete, trending_found]
184
+ ```
185
+
186
+ ## Commands
187
+
188
+ | Command | Description |
189
+ |---------|-------------|
190
+ | `ai-scraper scrape` | Scrape AI repositories from GitHub |
191
+ | `ai-scraper scrape --platform gitlab` | Scrape from GitLab |
192
+ | `ai-scraper scrape --platform gitlab --gitlab-url URL` | Scrape from self-hosted GitLab |
193
+ | `ai-scraper scrape --concurrent` | Concurrent scraping for faster results |
194
+ | `ai-scraper scrape --incremental` | Incremental scraping (only updated repos) |
195
+ | `ai-scraper scrape --since 7d` | Fetch repos updated in last 7 days |
196
+ | `ai-scraper scrape --resume` | Resume interrupted scrape |
197
+ | `ai-scraper scrape --progress` | Show progress bar during scraping |
198
+ | `ai-scraper interactive` | Start interactive menu mode |
199
+ | `ai-scraper list` | List scraped repositories |
200
+ | `ai-scraper trending` | Show trending repositories by star growth |
201
+ | `ai-scraper serve` | Start REST API server |
202
+ | `ai-scraper serve --auth` | Start API server with authentication |
203
+ | `ai-scraper schedule` | Schedule periodic scraping |
204
+ | `ai-scraper keywords list` | List all keywords |
205
+ | `ai-scraper keywords extract` | Extract keywords from database |
206
+ | `ai-scraper keywords clear` | Clear keywords |
207
+ | `ai-scraper config init` | Initialize config file |
208
+ | `ai-scraper config show` | Show current config |
209
+ | `ai-scraper db stats` | Show database statistics |
210
+ | `ai-scraper db export` | Export data to CSV/JSON/HTML/Excel/RSS/Markdown |
211
+ | `ai-scraper db clean --invalid` | Remove repositories with invalid data |
212
+ | `ai-scraper db clean --vacuum` | Optimize database size |
213
+ | `ai-scraper db backup` | Create database backup |
214
+ | `ai-scraper db restore` | Restore from backup |
215
+ | `ai-scraper db backups` | List available backups |
216
+
217
+ ## REST API Endpoints
218
+
219
+ When running `ai-scraper serve`:
220
+
221
+ | Endpoint | Description |
222
+ |----------|-------------|
223
+ | `GET /api/repos` | List repositories with filters |
224
+ | `GET /api/repos/{id}` | Get specific repository |
225
+ | `GET /api/stats` | Get database statistics |
226
+ | `GET /api/trending` | Get trending repositories |
227
+ | `GET /api/search?q=...` | Search repositories |
228
+
229
+ Authentication: Pass `X-API-Key` header when `--auth` is enabled.
230
+
231
+ ## Project Structure
232
+
233
+ ```
234
+ github-ai-scraper/
235
+ ├── src/ai_scraper/
236
+ │ ├── cli.py # CLI entry point
237
+ │ ├── config.py # Configuration management
238
+ │ ├── interactive.py # Interactive menu mode
239
+ │ ├── classifier.py # Repository classification
240
+ │ ├── dedup.py # Deduplication utilities
241
+ │ ├── health.py # Health assessment
242
+ │ ├── scheduler.py # Task scheduling
243
+ │ ├── webhooks.py # Webhook notifications
244
+ │ ├── plugin_system.py # Plugin system
245
+ │ ├── logging_config.py # Logging configuration
246
+ │ ├── api_server.py # REST API server
247
+ │ ├── auth.py # API authentication
248
+ │ ├── retry.py # Error recovery
249
+ │ ├── i18n.py # Multi-language support
250
+ │ ├── scrape_progress.py # Resume support
251
+ │ ├── backup.py # Database backup
252
+ │ ├── config_watcher.py # Config hot reload
253
+ │ ├── secure_storage.py # Token encryption
254
+ │ ├── api/
255
+ │ │ ├── github.py # GitHub API client
256
+ │ │ └── rate_limiter.py # Token bucket rate limiter
257
+ │ ├── models/
258
+ │ │ └── repository.py # Data models (Pydantic)
259
+ │ ├── filters/
260
+ │ │ └── ai_filter.py # AI relevance filter
261
+ │ ├── output/
262
+ │ │ ├── markdown.py # Markdown exporter
263
+ │ │ ├── html.py # HTML exporter
264
+ │ │ ├── excel.py # Excel exporter
265
+ │ │ └── rss.py # RSS exporter
266
+ │ └── storage/
267
+ │ ├── database.py # SQLite storage (sync)
268
+ │ └── async_database.py # SQLite storage (async)
269
+ ├── plugins/ # Example plugins
270
+ ├── tests/ # Test suite
271
+ ├── Dockerfile # Docker support
272
+ ├── docker-compose.yml # Docker compose
273
+ ├── .github/workflows/ # CI/CD workflows
274
+ └── ai-scraper.yaml # Default configuration
275
+ ```
276
+
277
+ ## Development
278
+
279
+ ```bash
280
+ # Install dev dependencies
281
+ pip install -e ".[dev]"
282
+
283
+ # Run tests
284
+ pytest tests/ -v
285
+
286
+ # Build Docker image
287
+ docker build -t ai-scraper .
288
+ ```
289
+
290
+ ## API Rate Limits
291
+
292
+ - Without token: 60 requests/hour
293
+ - With token: 5000 requests/hour
294
+
295
+ Set the `GITHUB_TOKEN` environment variable for higher limits.
296
+
297
+ ## License
298
+
299
+ MIT