github-ai-scraper 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- github_ai_scraper-0.1.2/.dockerignore +17 -0
- github_ai_scraper-0.1.2/.github/workflows/ci.yml +83 -0
- github_ai_scraper-0.1.2/.github/workflows/release.yml +34 -0
- github_ai_scraper-0.1.2/.gitignore +59 -0
- github_ai_scraper-0.1.2/CLAUDE.md +108 -0
- github_ai_scraper-0.1.2/Dockerfile +42 -0
- github_ai_scraper-0.1.2/PKG-INFO +299 -0
- github_ai_scraper-0.1.2/README.md +257 -0
- github_ai_scraper-0.1.2/README_CN.md +260 -0
- github_ai_scraper-0.1.2/RELEASE_NOTES.md +52 -0
- github_ai_scraper-0.1.2/ai-scraper.yaml +83 -0
- github_ai_scraper-0.1.2/ai-security-config.yaml +88 -0
- github_ai_scraper-0.1.2/cmds/scheduler/limiter.go +98 -0
- github_ai_scraper-0.1.2/cmds/scheduler/main.go +60 -0
- github_ai_scraper-0.1.2/cmds/scheduler/processor.go +184 -0
- github_ai_scraper-0.1.2/cmds/scheduler/scheduler.go +240 -0
- github_ai_scraper-0.1.2/data/.gitkeep +1 -0
- github_ai_scraper-0.1.2/docker-compose.yml +27 -0
- github_ai_scraper-0.1.2/docs/PROGRESS.md +101 -0
- github_ai_scraper-0.1.2/docs/promotion/chinese-article-outline.md +82 -0
- github_ai_scraper-0.1.2/docs/promotion/english-article-outline.md +82 -0
- github_ai_scraper-0.1.2/docs/superpowers/plans/2026-05-09-github-ai-scraper.md +3251 -0
- github_ai_scraper-0.1.2/docs/superpowers/plans/2026-05-09-keywords-and-output.md +1029 -0
- github_ai_scraper-0.1.2/docs/superpowers/plans/2026-05-11-iteration-optimization-v2.md +197 -0
- github_ai_scraper-0.1.2/docs/superpowers/plans/2026-05-11-iteration-optimization.md +133 -0
- github_ai_scraper-0.1.2/docs/superpowers/plans/2026-05-14-promotion-plan.md +660 -0
- github_ai_scraper-0.1.2/docs/superpowers/specs/2026-05-09-github-ai-scraper-design.md +491 -0
- github_ai_scraper-0.1.2/docs/superpowers/specs/2026-05-09-keywords-and-output-design.md +246 -0
- github_ai_scraper-0.1.2/docs/superpowers/specs/2026-05-14-promotion-plan-design.md +216 -0
- github_ai_scraper-0.1.2/go.mod +5 -0
- github_ai_scraper-0.1.2/output/repositories.md +4981 -0
- github_ai_scraper-0.1.2/plugins/README.md +51 -0
- github_ai_scraper-0.1.2/plugins/example_plugin.py +35 -0
- github_ai_scraper-0.1.2/pyproject.toml +75 -0
- github_ai_scraper-0.1.2/scraped_repos.json +54 -0
- github_ai_scraper-0.1.2/src/ai_scraper/__init__.py +3 -0
- github_ai_scraper-0.1.2/src/ai_scraper/api/__init__.py +6 -0
- github_ai_scraper-0.1.2/src/ai_scraper/api/github.py +340 -0
- github_ai_scraper-0.1.2/src/ai_scraper/api/gitlab.py +418 -0
- github_ai_scraper-0.1.2/src/ai_scraper/api/rate_limiter.py +120 -0
- github_ai_scraper-0.1.2/src/ai_scraper/api_server.py +196 -0
- github_ai_scraper-0.1.2/src/ai_scraper/auth.py +68 -0
- github_ai_scraper-0.1.2/src/ai_scraper/backup.py +112 -0
- github_ai_scraper-0.1.2/src/ai_scraper/cache.py +95 -0
- github_ai_scraper-0.1.2/src/ai_scraper/classifier.py +135 -0
- github_ai_scraper-0.1.2/src/ai_scraper/cli.py +747 -0
- github_ai_scraper-0.1.2/src/ai_scraper/config.py +237 -0
- github_ai_scraper-0.1.2/src/ai_scraper/config_watcher.py +82 -0
- github_ai_scraper-0.1.2/src/ai_scraper/dedup.py +148 -0
- github_ai_scraper-0.1.2/src/ai_scraper/filters/__init__.py +5 -0
- github_ai_scraper-0.1.2/src/ai_scraper/filters/ai_filter.py +93 -0
- github_ai_scraper-0.1.2/src/ai_scraper/health.py +155 -0
- github_ai_scraper-0.1.2/src/ai_scraper/i18n.py +141 -0
- github_ai_scraper-0.1.2/src/ai_scraper/interactive.py +96 -0
- github_ai_scraper-0.1.2/src/ai_scraper/keywords/__init__.py +5 -0
- github_ai_scraper-0.1.2/src/ai_scraper/keywords/extractor.py +274 -0
- github_ai_scraper-0.1.2/src/ai_scraper/logging_config.py +74 -0
- github_ai_scraper-0.1.2/src/ai_scraper/models/__init__.py +5 -0
- github_ai_scraper-0.1.2/src/ai_scraper/models/repository.py +72 -0
- github_ai_scraper-0.1.2/src/ai_scraper/output/__init__.py +6 -0
- github_ai_scraper-0.1.2/src/ai_scraper/output/excel.py +79 -0
- github_ai_scraper-0.1.2/src/ai_scraper/output/html.py +152 -0
- github_ai_scraper-0.1.2/src/ai_scraper/output/markdown.py +338 -0
- github_ai_scraper-0.1.2/src/ai_scraper/output/rss.py +82 -0
- github_ai_scraper-0.1.2/src/ai_scraper/output/translator.py +303 -0
- github_ai_scraper-0.1.2/src/ai_scraper/plugin_system.py +146 -0
- github_ai_scraper-0.1.2/src/ai_scraper/plugins/__init__.py +5 -0
- github_ai_scraper-0.1.2/src/ai_scraper/retry.py +134 -0
- github_ai_scraper-0.1.2/src/ai_scraper/scheduler.py +84 -0
- github_ai_scraper-0.1.2/src/ai_scraper/scrape_progress.py +99 -0
- github_ai_scraper-0.1.2/src/ai_scraper/secure_storage.py +127 -0
- github_ai_scraper-0.1.2/src/ai_scraper/storage/__init__.py +5 -0
- github_ai_scraper-0.1.2/src/ai_scraper/storage/async_database.py +237 -0
- github_ai_scraper-0.1.2/src/ai_scraper/storage/database.py +456 -0
- github_ai_scraper-0.1.2/src/ai_scraper/webhooks.py +95 -0
- github_ai_scraper-0.1.2/test_export.json +54 -0
- github_ai_scraper-0.1.2/test_output.txt +0 -0
- github_ai_scraper-0.1.2/tests/__init__.py +1 -0
- github_ai_scraper-0.1.2/tests/test_api_server_performance.py +70 -0
- github_ai_scraper-0.1.2/tests/test_async_database.py +188 -0
- github_ai_scraper-0.1.2/tests/test_auth.py +82 -0
- github_ai_scraper-0.1.2/tests/test_backup.py +164 -0
- github_ai_scraper-0.1.2/tests/test_cache.py +226 -0
- github_ai_scraper-0.1.2/tests/test_classifier.py +273 -0
- github_ai_scraper-0.1.2/tests/test_classifier_enhanced.py +140 -0
- github_ai_scraper-0.1.2/tests/test_cli.py +186 -0
- github_ai_scraper-0.1.2/tests/test_concurrent_scrape.py +142 -0
- github_ai_scraper-0.1.2/tests/test_config.py +75 -0
- github_ai_scraper-0.1.2/tests/test_config_watcher.py +99 -0
- github_ai_scraper-0.1.2/tests/test_connection_pool.py +72 -0
- github_ai_scraper-0.1.2/tests/test_database.py +240 -0
- github_ai_scraper-0.1.2/tests/test_db_performance.py +217 -0
- github_ai_scraper-0.1.2/tests/test_dedup.py +183 -0
- github_ai_scraper-0.1.2/tests/test_dedup_enhanced.py +162 -0
- github_ai_scraper-0.1.2/tests/test_example_plugin.py +98 -0
- github_ai_scraper-0.1.2/tests/test_export_formats.py +133 -0
- github_ai_scraper-0.1.2/tests/test_filter.py +104 -0
- github_ai_scraper-0.1.2/tests/test_github.py +313 -0
- github_ai_scraper-0.1.2/tests/test_health.py +197 -0
- github_ai_scraper-0.1.2/tests/test_i18n.py +87 -0
- github_ai_scraper-0.1.2/tests/test_incremental.py +278 -0
- github_ai_scraper-0.1.2/tests/test_integration.py +173 -0
- github_ai_scraper-0.1.2/tests/test_interactive.py +218 -0
- github_ai_scraper-0.1.2/tests/test_keywords.py +545 -0
- github_ai_scraper-0.1.2/tests/test_models.py +85 -0
- github_ai_scraper-0.1.2/tests/test_output.py +199 -0
- github_ai_scraper-0.1.2/tests/test_output_html.py +429 -0
- github_ai_scraper-0.1.2/tests/test_rate_limiter_advanced.py +325 -0
- github_ai_scraper-0.1.2/tests/test_retry.py +132 -0
- github_ai_scraper-0.1.2/tests/test_scrape_progress.py +131 -0
- github_ai_scraper-0.1.2/tests/test_secure_storage.py +76 -0
- github_ai_scraper-0.1.2/tests/test_validation.py +521 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [master, main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [master, main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ['3.10', '3.11', '3.12']
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: |
|
|
26
|
+
python -m pip install --upgrade pip
|
|
27
|
+
pip install -e ".[dev]"
|
|
28
|
+
|
|
29
|
+
- name: Run tests
|
|
30
|
+
run: pytest tests/ -v --cov=src/ai_scraper --cov-report=xml
|
|
31
|
+
|
|
32
|
+
- name: Upload coverage
|
|
33
|
+
uses: codecov/codecov-action@v4
|
|
34
|
+
with:
|
|
35
|
+
files: ./coverage.xml
|
|
36
|
+
env:
|
|
37
|
+
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
|
38
|
+
|
|
39
|
+
lint:
|
|
40
|
+
runs-on: ubuntu-latest
|
|
41
|
+
|
|
42
|
+
steps:
|
|
43
|
+
- uses: actions/checkout@v4
|
|
44
|
+
|
|
45
|
+
- name: Set up Python
|
|
46
|
+
uses: actions/setup-python@v5
|
|
47
|
+
with:
|
|
48
|
+
python-version: '3.11'
|
|
49
|
+
|
|
50
|
+
- name: Install linters
|
|
51
|
+
run: pip install ruff mypy
|
|
52
|
+
|
|
53
|
+
- name: Run Ruff
|
|
54
|
+
run: ruff check src/ tests/
|
|
55
|
+
|
|
56
|
+
- name: Run MyPy
|
|
57
|
+
run: mypy src/ --ignore-missing-imports
|
|
58
|
+
|
|
59
|
+
build:
|
|
60
|
+
runs-on: ubuntu-latest
|
|
61
|
+
needs: [test, lint]
|
|
62
|
+
|
|
63
|
+
steps:
|
|
64
|
+
- uses: actions/checkout@v4
|
|
65
|
+
|
|
66
|
+
- name: Set up Python
|
|
67
|
+
uses: actions/setup-python@v5
|
|
68
|
+
with:
|
|
69
|
+
python-version: '3.11'
|
|
70
|
+
|
|
71
|
+
- name: Install build tools
|
|
72
|
+
run: |
|
|
73
|
+
python -m pip install --upgrade pip
|
|
74
|
+
pip install build
|
|
75
|
+
|
|
76
|
+
- name: Build package
|
|
77
|
+
run: python -m build
|
|
78
|
+
|
|
79
|
+
- name: Upload artifacts
|
|
80
|
+
uses: actions/upload-artifact@v4
|
|
81
|
+
with:
|
|
82
|
+
name: dist
|
|
83
|
+
path: dist/
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
|
|
14
|
+
- name: Set up Python
|
|
15
|
+
uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: '3.11'
|
|
18
|
+
|
|
19
|
+
- name: Install build tools
|
|
20
|
+
run: |
|
|
21
|
+
python -m pip install --upgrade pip
|
|
22
|
+
pip install build twine
|
|
23
|
+
|
|
24
|
+
- name: Build package
|
|
25
|
+
run: python -m build
|
|
26
|
+
|
|
27
|
+
- name: Check package metadata
|
|
28
|
+
run: twine check dist/*
|
|
29
|
+
|
|
30
|
+
- name: Publish to PyPI
|
|
31
|
+
env:
|
|
32
|
+
TWINE_USERNAME: __token__
|
|
33
|
+
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
|
34
|
+
run: twine upload --verbose dist/*
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
|
|
23
|
+
# Virtual environments
|
|
24
|
+
venv/
|
|
25
|
+
ENV/
|
|
26
|
+
env/
|
|
27
|
+
.venv/
|
|
28
|
+
|
|
29
|
+
# IDE
|
|
30
|
+
.idea/
|
|
31
|
+
.vscode/
|
|
32
|
+
*.swp
|
|
33
|
+
*.swo
|
|
34
|
+
*~
|
|
35
|
+
|
|
36
|
+
# Testing
|
|
37
|
+
.pytest_cache/
|
|
38
|
+
.coverage
|
|
39
|
+
htmlcov/
|
|
40
|
+
.tox/
|
|
41
|
+
.nox/
|
|
42
|
+
|
|
43
|
+
# Project specific
|
|
44
|
+
data/*.db
|
|
45
|
+
data/*.sqlite
|
|
46
|
+
output.txt
|
|
47
|
+
.claude/
|
|
48
|
+
|
|
49
|
+
# Go
|
|
50
|
+
cmds/scheduler/scheduler
|
|
51
|
+
cmds/scheduler/scheduler.exe
|
|
52
|
+
|
|
53
|
+
# Generated files
|
|
54
|
+
keywords.txt
|
|
55
|
+
output/*.json
|
|
56
|
+
output/*.csv
|
|
57
|
+
output/*.html
|
|
58
|
+
output/*.db
|
|
59
|
+
output/*.sqlite
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Language Requirement
|
|
6
|
+
|
|
7
|
+
**任何情况下,都使用中文进行交流和回复。** 无论用户使用何种语言提问,都必须使用中文回答。
|
|
8
|
+
|
|
9
|
+
## Project Overview
|
|
10
|
+
|
|
11
|
+
GitHub/GitLab AI Scraper is a CLI tool for scraping AI-related high-star repositories from GitHub and GitLab. It supports multi-platform scraping, dynamic keyword extraction, multiple export formats, and a REST API server.
|
|
12
|
+
|
|
13
|
+
## Common Commands
|
|
14
|
+
|
|
15
|
+
### Development Setup
|
|
16
|
+
```bash
|
|
17
|
+
pip install -e ".[dev]" # Install with dev dependencies
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
### Testing
|
|
21
|
+
```bash
|
|
22
|
+
pytest tests/ -v # Run all tests
|
|
23
|
+
pytest tests/test_models.py -v # Run single test file
|
|
24
|
+
pytest -k "repository" -v # Run tests matching pattern
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### CLI Usage
|
|
28
|
+
```bash
|
|
29
|
+
ai-scraper scrape # Scrape from GitHub (default)
|
|
30
|
+
ai-scraper scrape --platform gitlab # Scrape from GitLab
|
|
31
|
+
ai-scraper scrape --gitlab-url https://your-gitlab.com/api/v4 # Self-hosted GitLab
|
|
32
|
+
ai-scraper scrape --incremental # Only updated repos
|
|
33
|
+
ai-scraper scrape --since 7d # Repos updated in last 7 days
|
|
34
|
+
ai-scraper scrape --progress # Show progress bar
|
|
35
|
+
ai-scraper list # List scraped repos
|
|
36
|
+
ai-scraper trending # Show trending repos
|
|
37
|
+
ai-scraper serve --port 8080 # Start REST API server
|
|
38
|
+
ai-scraper keywords list # List keywords
|
|
39
|
+
ai-scraper db export --format markdown --output repositories.md
|
|
40
|
+
ai-scraper db clean --invalid # Remove invalid repos
|
|
41
|
+
ai-scraper db clean --vacuum # Optimize database
|
|
42
|
+
ai-scraper config show # Show current config
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Environment Variables
|
|
46
|
+
```bash
|
|
47
|
+
export GITHUB_TOKEN=your_token # Increases rate limit to 5000/hour
|
|
48
|
+
export GITLAB_TOKEN=your_token # For GitLab scraping
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Architecture
|
|
52
|
+
|
|
53
|
+
### Core Data Flow
|
|
54
|
+
1. **API Clients** (`src/ai_scraper/api/`) - Async HTTP clients for GitHub/GitLab with rate limiting and caching
|
|
55
|
+
2. **Filter** (`src/ai_scraper/filters/ai_filter.py`) - Determines if repos are AI-related and scores relevance
|
|
56
|
+
3. **Classifier** (`src/ai_scraper/classifier.py`) - Categorizes repos into AI subdomains (LLM, CV, NLP, MLOps, etc.)
|
|
57
|
+
4. **Database** (`src/ai_scraper/storage/database.py`) - SQLite storage with snapshot-based trend tracking
|
|
58
|
+
5. **Output** (`src/ai_scraper/output/`) - Exporters for Markdown, HTML, Excel, RSS formats
|
|
59
|
+
|
|
60
|
+
### Key Components
|
|
61
|
+
|
|
62
|
+
**Repository Model** (`src/ai_scraper/models/repository.py`)
|
|
63
|
+
- Pydantic models: `Repository`, `RepoSnapshot`, `FilterConfig`, `ScrapeConfig`
|
|
64
|
+
- URL pattern supports both GitHub and GitLab: `^https?://[\w\.-]+/[\w\-\.]+/[\w\-\.]+`
|
|
65
|
+
|
|
66
|
+
**API Clients** (`src/ai_scraper/api/`)
|
|
67
|
+
- `GitHubClient` and `GitLabClient` share similar async patterns
|
|
68
|
+
- Both use `RateLimiter` (token bucket) and optional `RequestCache`
|
|
69
|
+
- Connection pooling via `aiohttp.TCPConnector`
|
|
70
|
+
|
|
71
|
+
**Keyword Extraction** (`src/ai_scraper/keywords/extractor.py`)
|
|
72
|
+
- Extracts keywords from repo topics, descriptions, and names
|
|
73
|
+
- Filters stopwords, numeric/path/file-like noise, and low-quality keywords while preserving short AI terms like `ai` and `ml`
|
|
74
|
+
- Merges with existing keywords respecting `max_keywords` limit
|
|
75
|
+
|
|
76
|
+
**Configuration** (`src/ai_scraper/config.py`)
|
|
77
|
+
- YAML-based config loaded from `ai-scraper.yaml`
|
|
78
|
+
- Environment variable substitution: `${GITHUB_TOKEN}` syntax
|
|
79
|
+
- Separate configs for GitHub, GitLab, filter, scrape, database, keywords, output, webhooks
|
|
80
|
+
|
|
81
|
+
### CLI Structure (`src/ai_scraper/cli.py`)
|
|
82
|
+
- Uses Click with grouped commands: `scrape`, `list`, `trending`, `serve`, `schedule`, `interactive`
|
|
83
|
+
- Sub-groups: `config`, `db`, `keywords`
|
|
84
|
+
- Progress bar via Rich library
|
|
85
|
+
- Windows UTF-8 handling with `clean_text()` function
|
|
86
|
+
|
|
87
|
+
## Platform Support
|
|
88
|
+
|
|
89
|
+
Both GitHub and GitLab are supported with platform-specific clients:
|
|
90
|
+
- GitHub: `GitHubClient` with search query syntax `stars:>100 topic:ai pushed:>YYYY-MM-DD`
|
|
91
|
+
- GitLab: `GitLabClient` with simpler search, configurable `base_url` for self-hosted instances
|
|
92
|
+
|
|
93
|
+
## Export Formats
|
|
94
|
+
|
|
95
|
+
Located in `src/ai_scraper/output/`:
|
|
96
|
+
- `markdown.py` - Markdown with Chinese translation, category grouping, language icons
|
|
97
|
+
- `html.py` - HTML with responsive styling
|
|
98
|
+
- `excel.py` - Excel workbook format
|
|
99
|
+
- `rss.py` - RSS feed format
|
|
100
|
+
- `translator.py` - Description translation support
|
|
101
|
+
|
|
102
|
+
## Database Schema
|
|
103
|
+
|
|
104
|
+
SQLite tables in `data/ai_scraper.db`:
|
|
105
|
+
- `repositories` - Main repo data with relevance scores, timestamps
|
|
106
|
+
- `snapshots` - Star count snapshots for trend analysis
|
|
107
|
+
|
|
108
|
+
Indexes on: stars, last_updated_at, language, created_at, relevance_score, snapshot_at
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Build stage for Go scheduler (optional)
|
|
2
|
+
FROM golang:1.21-alpine AS go-builder
|
|
3
|
+
|
|
4
|
+
WORKDIR /build
|
|
5
|
+
COPY go.mod ./
|
|
6
|
+
RUN if [ -f go.mod ]; then go mod download; fi
|
|
7
|
+
|
|
8
|
+
COPY cmds/scheduler/ ./
|
|
9
|
+
RUN if [ -f main.go ]; then CGO_ENABLED=0 go build -o scheduler .; fi
|
|
10
|
+
|
|
11
|
+
# Main stage
|
|
12
|
+
FROM python:3.11-slim
|
|
13
|
+
|
|
14
|
+
# Install SQLite
|
|
15
|
+
RUN apt-get update && apt-get install -y \
|
|
16
|
+
sqlite3 \
|
|
17
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
18
|
+
|
|
19
|
+
WORKDIR /app
|
|
20
|
+
|
|
21
|
+
# Install Python dependencies
|
|
22
|
+
COPY pyproject.toml ./
|
|
23
|
+
RUN pip install --no-cache-dir -e .
|
|
24
|
+
|
|
25
|
+
# Copy application code
|
|
26
|
+
COPY src/ ./src/
|
|
27
|
+
COPY ai-scraper.yaml ./
|
|
28
|
+
|
|
29
|
+
# Copy Go scheduler binary if built
|
|
30
|
+
COPY --from=go-builder /build/scheduler /usr/local/bin/scheduler
|
|
31
|
+
|
|
32
|
+
# Create data and output directories
|
|
33
|
+
RUN mkdir -p /app/data /app/output /app/.cache
|
|
34
|
+
|
|
35
|
+
# Set environment variables
|
|
36
|
+
ENV PYTHONUNBUFFERED=1
|
|
37
|
+
ENV AI_SCRAPER_DATA_DIR=/app/data
|
|
38
|
+
ENV AI_SCRAPER_OUTPUT_DIR=/app/output
|
|
39
|
+
|
|
40
|
+
# Entry point
|
|
41
|
+
ENTRYPOINT ["python", "-m", "ai_scraper.cli"]
|
|
42
|
+
CMD ["--help"]
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: github-ai-scraper
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: A CLI tool for discovering and scraping AI-related high-star repositories from GitHub and GitLab
|
|
5
|
+
Project-URL: Homepage, https://github.com/lwx66615/github-ai-scraper
|
|
6
|
+
Project-URL: Repository, https://github.com/lwx66615/github-ai-scraper
|
|
7
|
+
Project-URL: Documentation, https://github.com/lwx66615/github-ai-scraper#readme
|
|
8
|
+
Project-URL: Issues, https://github.com/lwx66615/github-ai-scraper/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/lwx66615/github-ai-scraper/releases
|
|
10
|
+
Author: lwx66615
|
|
11
|
+
License-Expression: MIT
|
|
12
|
+
Keywords: ai,cli,github,gitlab,machine-learning,repositories,scraper,trending
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Environment :: Console
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
|
+
Classifier: Topic :: Software Development
|
|
25
|
+
Classifier: Topic :: Utilities
|
|
26
|
+
Classifier: Typing :: Typed
|
|
27
|
+
Requires-Python: >=3.10
|
|
28
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
29
|
+
Requires-Dist: aiosqlite>=0.19.0
|
|
30
|
+
Requires-Dist: click>=8.1.0
|
|
31
|
+
Requires-Dist: croniter>=2.0.0
|
|
32
|
+
Requires-Dist: fastapi>=0.109.0
|
|
33
|
+
Requires-Dist: pydantic>=2.0.0
|
|
34
|
+
Requires-Dist: pyyaml>=6.0
|
|
35
|
+
Requires-Dist: rich>=13.0.0
|
|
36
|
+
Requires-Dist: uvicorn>=0.27.0
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
|
|
40
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
41
|
+
Description-Content-Type: text/markdown
|
|
42
|
+
|
|
43
|
+
# GitHub/GitLab AI Scraper
|
|
44
|
+
|
|
45
|
+
English | [简体中文](README_CN.md)
|
|
46
|
+
|
|
47
|
+
A CLI tool for scraping AI-related high-star repositories from GitHub and GitLab.
|
|
48
|
+
|
|
49
|
+
## Features
|
|
50
|
+
|
|
51
|
+
- **Multi-platform support** - Scrape from GitHub or GitLab (including self-hosted instances)
|
|
52
|
+
- Search and filter AI-related repositories by keywords and topics
|
|
53
|
+
- **Dynamic keyword extraction** - Automatically learns new keywords from scraped repos
|
|
54
|
+
- **Markdown/HTML/Excel/RSS report generation** - Multiple export formats with Chinese translation
|
|
55
|
+
- **Incremental scraping** - Fetch only updated repos with `--since` flag
|
|
56
|
+
- **Resume support** - Continue interrupted scrapes with progress tracking
|
|
57
|
+
- **Progress bar display** - Visual progress during scraping
|
|
58
|
+
- **Interactive CLI mode** - Menu-driven interface for easy use
|
|
59
|
+
- **Concurrent scraping** - Parallel requests for faster results
|
|
60
|
+
- **Multi-language search** - Support for Chinese and English keywords
|
|
61
|
+
- Local SQLite storage with trend analysis
|
|
62
|
+
- Configurable filtering and scraping options
|
|
63
|
+
- Rate limiting with GitHub/GitLab API token support
|
|
64
|
+
- Export to CSV/JSON/HTML/Excel/RSS/Markdown formats
|
|
65
|
+
- **REST API server** - Access data via HTTP endpoints with optional authentication
|
|
66
|
+
- **Scheduled scraping** - Cron-based periodic scraping
|
|
67
|
+
- **Webhook notifications** - Notify external services on events
|
|
68
|
+
- **Plugin system** - Extend functionality with custom plugins
|
|
69
|
+
- **Repository health assessment** - Activity, popularity, maintenance scores
|
|
70
|
+
- **Intelligent classification** - LLM, CV, NLP, MLOps, AI Infrastructure categories
|
|
71
|
+
- **Deduplication** - Fork and mirror detection, content similarity
|
|
72
|
+
- **Secure token storage** - Encrypted storage for sensitive tokens
|
|
73
|
+
- **Database backup** - Automatic backup and restore functionality
|
|
74
|
+
- **Error recovery** - Retry logic with exponential backoff
|
|
75
|
+
|
|
76
|
+
## Installation
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
# Install from PyPI
|
|
80
|
+
pip install github-ai-scraper
|
|
81
|
+
|
|
82
|
+
# Or install from source for development
|
|
83
|
+
pip install -e ".[dev]"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Quick Start
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# Set your GitHub token (optional, increases rate limit)
|
|
90
|
+
export GITHUB_TOKEN=your_token_here
|
|
91
|
+
|
|
92
|
+
# Scrape AI repositories from GitHub (default)
|
|
93
|
+
ai-scraper scrape
|
|
94
|
+
|
|
95
|
+
# Scrape from GitLab
|
|
96
|
+
ai-scraper scrape --platform gitlab
|
|
97
|
+
|
|
98
|
+
# Scrape from self-hosted GitLab
|
|
99
|
+
ai-scraper scrape --platform gitlab --gitlab-url https://your-gitlab.com/api/v4
|
|
100
|
+
|
|
101
|
+
# Scrape with progress bar
|
|
102
|
+
ai-scraper scrape --progress
|
|
103
|
+
|
|
104
|
+
# Concurrent scraping (faster)
|
|
105
|
+
ai-scraper scrape --concurrent
|
|
106
|
+
|
|
107
|
+
# Incremental scraping (only updated repos; --since 7d limits results to the last 7 days)
|
|
108
|
+
ai-scraper scrape --incremental
|
|
109
|
+
ai-scraper scrape --since 7d
|
|
110
|
+
|
|
111
|
+
# Resume interrupted scrape
|
|
112
|
+
ai-scraper scrape --resume
|
|
113
|
+
|
|
114
|
+
# Interactive mode
|
|
115
|
+
ai-scraper interactive
|
|
116
|
+
|
|
117
|
+
# List scraped repositories
|
|
118
|
+
ai-scraper list
|
|
119
|
+
|
|
120
|
+
# Show trending repositories
|
|
121
|
+
ai-scraper trending
|
|
122
|
+
|
|
123
|
+
# Export data
|
|
124
|
+
ai-scraper db export --format html --output index.html
|
|
125
|
+
ai-scraper db export --format xlsx --output repos.xlsx
|
|
126
|
+
ai-scraper db export --format rss --output feed.xml
|
|
127
|
+
ai-scraper db export --format markdown --output repositories.md
|
|
128
|
+
|
|
129
|
+
# Start REST API server (with authentication)
|
|
130
|
+
ai-scraper serve --port 8080 --auth
|
|
131
|
+
|
|
132
|
+
# Schedule periodic scraping (daily at 9am)
|
|
133
|
+
ai-scraper schedule --cron "0 9 * * *"
|
|
134
|
+
|
|
135
|
+
# Back up and restore the database
|
|
136
|
+
ai-scraper db backup
|
|
137
|
+
ai-scraper db restore backup_file.db.gz
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Configuration
|
|
141
|
+
|
|
142
|
+
Create `ai-scraper.yaml` to customize:
|
|
143
|
+
|
|
144
|
+
```yaml
|
|
145
|
+
github:
|
|
146
|
+
token: ${GITHUB_TOKEN}
|
|
147
|
+
cache_ttl: 3600
|
|
148
|
+
|
|
149
|
+
gitlab:
|
|
150
|
+
token: ${GITLAB_TOKEN} # Optional, for GitLab scraping
|
|
151
|
+
base_url: https://gitlab.com/api/v4 # Or your self-hosted GitLab URL
|
|
152
|
+
cache_ttl: 3600
|
|
153
|
+
|
|
154
|
+
filter:
|
|
155
|
+
min_stars: 100
|
|
156
|
+
keywords:
|
|
157
|
+
- ai
|
|
158
|
+
- machine-learning
|
|
159
|
+
- 人工智能 # Chinese keyword support
|
|
160
|
+
topics:
|
|
161
|
+
- ai
|
|
162
|
+
- deep-learning
|
|
163
|
+
|
|
164
|
+
scrape:
|
|
165
|
+
max_results: 500
|
|
166
|
+
concurrency: 5
|
|
167
|
+
concurrent_requests: 5
|
|
168
|
+
|
|
169
|
+
database:
|
|
170
|
+
path: ./data/ai_scraper.db
|
|
171
|
+
backup_dir: ./backups
|
|
172
|
+
max_backups: 10
|
|
173
|
+
|
|
174
|
+
api:
|
|
175
|
+
auth_enabled: true
|
|
176
|
+
api_keys:
|
|
177
|
+
- as_your_api_key_here
|
|
178
|
+
|
|
179
|
+
webhooks:
|
|
180
|
+
enabled: false
|
|
181
|
+
endpoints:
|
|
182
|
+
- url: https://hooks.slack.com/services/YOUR/WEBHOOK/URL
|
|
183
|
+
events: [scrape_complete, trending_found]
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Commands
|
|
187
|
+
|
|
188
|
+
| Command | Description |
|
|
189
|
+
|---------|-------------|
|
|
190
|
+
| `ai-scraper scrape` | Scrape AI repositories from GitHub |
|
|
191
|
+
| `ai-scraper scrape --platform gitlab` | Scrape from GitLab |
|
|
192
|
+
| `ai-scraper scrape --platform gitlab --gitlab-url URL` | Scrape from self-hosted GitLab |
|
|
193
|
+
| `ai-scraper scrape --concurrent` | Concurrent scraping for faster results |
|
|
194
|
+
| `ai-scraper scrape --incremental` | Incremental scraping (only updated repos) |
|
|
195
|
+
| `ai-scraper scrape --since 7d` | Fetch repos updated in last 7 days |
|
|
196
|
+
| `ai-scraper scrape --resume` | Resume interrupted scrape |
|
|
197
|
+
| `ai-scraper scrape --progress` | Show progress bar during scraping |
|
|
198
|
+
| `ai-scraper interactive` | Start interactive menu mode |
|
|
199
|
+
| `ai-scraper list` | List scraped repositories |
|
|
200
|
+
| `ai-scraper trending` | Show trending repositories by star growth |
|
|
201
|
+
| `ai-scraper serve` | Start REST API server |
|
|
202
|
+
| `ai-scraper serve --auth` | Start API server with authentication |
|
|
203
|
+
| `ai-scraper schedule` | Schedule periodic scraping |
|
|
204
|
+
| `ai-scraper keywords list` | List all keywords |
|
|
205
|
+
| `ai-scraper keywords extract` | Extract keywords from database |
|
|
206
|
+
| `ai-scraper keywords clear` | Clear keywords |
|
|
207
|
+
| `ai-scraper config init` | Initialize config file |
|
|
208
|
+
| `ai-scraper config show` | Show current config |
|
|
209
|
+
| `ai-scraper db stats` | Show database statistics |
|
|
210
|
+
| `ai-scraper db export` | Export data to CSV/JSON/HTML/Excel/RSS/Markdown |
|
|
211
|
+
| `ai-scraper db clean --invalid` | Remove repositories with invalid data |
|
|
212
|
+
| `ai-scraper db clean --vacuum` | Optimize database size |
|
|
213
|
+
| `ai-scraper db backup` | Create database backup |
|
|
214
|
+
| `ai-scraper db restore` | Restore from backup |
|
|
215
|
+
| `ai-scraper db backups` | List available backups |
|
|
216
|
+
|
|
217
|
+
## REST API Endpoints
|
|
218
|
+
|
|
219
|
+
When running `ai-scraper serve`:
|
|
220
|
+
|
|
221
|
+
| Endpoint | Description |
|
|
222
|
+
|----------|-------------|
|
|
223
|
+
| `GET /api/repos` | List repositories with filters |
|
|
224
|
+
| `GET /api/repos/{id}` | Get specific repository |
|
|
225
|
+
| `GET /api/stats` | Get database statistics |
|
|
226
|
+
| `GET /api/trending` | Get trending repositories |
|
|
227
|
+
| `GET /api/search?q=...` | Search repositories |
|
|
228
|
+
|
|
229
|
+
Authentication: Pass `X-API-Key` header when `--auth` is enabled.
|
|
230
|
+
|
|
231
|
+
## Project Structure
|
|
232
|
+
|
|
233
|
+
```
|
|
234
|
+
github-ai-scraper/
|
|
235
|
+
├── src/ai_scraper/
|
|
236
|
+
│ ├── cli.py # CLI entry point
|
|
237
|
+
│ ├── config.py # Configuration management
|
|
238
|
+
│ ├── interactive.py # Interactive menu mode
|
|
239
|
+
│ ├── classifier.py # Repository classification
|
|
240
|
+
│ ├── dedup.py # Deduplication utilities
|
|
241
|
+
│ ├── health.py # Health assessment
|
|
242
|
+
│ ├── scheduler.py # Task scheduling
|
|
243
|
+
│ ├── webhooks.py # Webhook notifications
|
|
244
|
+
│ ├── plugin_system.py # Plugin system
|
|
245
|
+
│ ├── logging_config.py # Logging configuration
|
|
246
|
+
│ ├── api_server.py # REST API server
|
|
247
|
+
│ ├── auth.py # API authentication
|
|
248
|
+
│ ├── retry.py # Error recovery
|
|
249
|
+
│ ├── i18n.py # Multi-language support
|
|
250
|
+
│ ├── scrape_progress.py # Resume support
|
|
251
|
+
│ ├── backup.py # Database backup
|
|
252
|
+
│ ├── config_watcher.py # Config hot reload
|
|
253
|
+
│ ├── secure_storage.py # Token encryption
|
|
254
|
+
│ ├── api/
|
|
255
|
+
│ │ ├── github.py # GitHub API client
|
|
256
|
+
│ │ └── rate_limiter.py # Token bucket rate limiter
|
|
257
|
+
│ ├── models/
|
|
258
|
+
│ │ └── repository.py # Data models (Pydantic)
|
|
259
|
+
│ ├── filters/
|
|
260
|
+
│ │ └── ai_filter.py # AI relevance filter
|
|
261
|
+
│ ├── output/
|
|
262
|
+
│ │ ├── markdown.py # Markdown exporter
|
|
263
|
+
│ │ ├── html.py # HTML exporter
|
|
264
|
+
│ │ ├── excel.py # Excel exporter
|
|
265
|
+
│ │ └── rss.py # RSS exporter
|
|
266
|
+
│ └── storage/
|
|
267
|
+
│ ├── database.py # SQLite storage (sync)
|
|
268
|
+
│ └── async_database.py # SQLite storage (async)
|
|
269
|
+
├── plugins/ # Example plugins
|
|
270
|
+
├── tests/ # Test suite
|
|
271
|
+
├── Dockerfile # Docker support
|
|
272
|
+
├── docker-compose.yml # Docker compose
|
|
273
|
+
├── .github/workflows/ # CI/CD workflows
|
|
274
|
+
└── ai-scraper.yaml # Default configuration
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
## Development
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
# Install dev dependencies
|
|
281
|
+
pip install -e ".[dev]"
|
|
282
|
+
|
|
283
|
+
# Run tests
|
|
284
|
+
pytest tests/ -v
|
|
285
|
+
|
|
286
|
+
# Build Docker image
|
|
287
|
+
docker build -t ai-scraper .
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
## API Rate Limits
|
|
291
|
+
|
|
292
|
+
- Without token: 60 requests/hour
|
|
293
|
+
- With token: 5000 requests/hour
|
|
294
|
+
|
|
295
|
+
Set the `GITHUB_TOKEN` environment variable for higher limits.
|
|
296
|
+
|
|
297
|
+
## License
|
|
298
|
+
|
|
299
|
+
MIT
|