github-ai-scraper 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. ai_scraper/__init__.py +3 -0
  2. ai_scraper/api/__init__.py +6 -0
  3. ai_scraper/api/github.py +340 -0
  4. ai_scraper/api/gitlab.py +418 -0
  5. ai_scraper/api/rate_limiter.py +120 -0
  6. ai_scraper/api_server.py +196 -0
  7. ai_scraper/auth.py +68 -0
  8. ai_scraper/backup.py +112 -0
  9. ai_scraper/cache.py +95 -0
  10. ai_scraper/classifier.py +135 -0
  11. ai_scraper/cli.py +747 -0
  12. ai_scraper/config.py +237 -0
  13. ai_scraper/config_watcher.py +82 -0
  14. ai_scraper/dedup.py +148 -0
  15. ai_scraper/filters/__init__.py +5 -0
  16. ai_scraper/filters/ai_filter.py +93 -0
  17. ai_scraper/health.py +155 -0
  18. ai_scraper/i18n.py +141 -0
  19. ai_scraper/interactive.py +96 -0
  20. ai_scraper/keywords/__init__.py +5 -0
  21. ai_scraper/keywords/extractor.py +274 -0
  22. ai_scraper/logging_config.py +74 -0
  23. ai_scraper/models/__init__.py +5 -0
  24. ai_scraper/models/repository.py +72 -0
  25. ai_scraper/output/__init__.py +6 -0
  26. ai_scraper/output/excel.py +79 -0
  27. ai_scraper/output/html.py +152 -0
  28. ai_scraper/output/markdown.py +338 -0
  29. ai_scraper/output/rss.py +82 -0
  30. ai_scraper/output/translator.py +303 -0
  31. ai_scraper/plugin_system.py +146 -0
  32. ai_scraper/plugins/__init__.py +5 -0
  33. ai_scraper/retry.py +134 -0
  34. ai_scraper/scheduler.py +84 -0
  35. ai_scraper/scrape_progress.py +99 -0
  36. ai_scraper/secure_storage.py +127 -0
  37. ai_scraper/storage/__init__.py +5 -0
  38. ai_scraper/storage/async_database.py +237 -0
  39. ai_scraper/storage/database.py +456 -0
  40. ai_scraper/webhooks.py +95 -0
  41. github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
  42. github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
  43. github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
  44. github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,299 @@
1
+ Metadata-Version: 2.4
2
+ Name: github-ai-scraper
3
+ Version: 0.1.2
4
+ Summary: A CLI tool for discovering and scraping AI-related high-star repositories from GitHub and GitLab
5
+ Project-URL: Homepage, https://github.com/lwx66615/github-ai-scraper
6
+ Project-URL: Repository, https://github.com/lwx66615/github-ai-scraper
7
+ Project-URL: Documentation, https://github.com/lwx66615/github-ai-scraper#readme
8
+ Project-URL: Issues, https://github.com/lwx66615/github-ai-scraper/issues
9
+ Project-URL: Changelog, https://github.com/lwx66615/github-ai-scraper/releases
10
+ Author: lwx66615
11
+ License-Expression: MIT
12
+ Keywords: ai,cli,github,gitlab,machine-learning,repositories,scraper,trending
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: License :: OSI Approved :: MIT License
18
+ Classifier: Operating System :: OS Independent
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Software Development
25
+ Classifier: Topic :: Utilities
26
+ Classifier: Typing :: Typed
27
+ Requires-Python: >=3.10
28
+ Requires-Dist: aiohttp>=3.9.0
29
+ Requires-Dist: aiosqlite>=0.19.0
30
+ Requires-Dist: click>=8.1.0
31
+ Requires-Dist: croniter>=2.0.0
32
+ Requires-Dist: fastapi>=0.109.0
33
+ Requires-Dist: pydantic>=2.0.0
34
+ Requires-Dist: pyyaml>=6.0
35
+ Requires-Dist: rich>=13.0.0
36
+ Requires-Dist: uvicorn>=0.27.0
37
+ Provides-Extra: dev
38
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
39
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
40
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
41
+ Description-Content-Type: text/markdown
42
+
43
+ # GitHub/GitLab AI Scraper
44
+
45
+ English | [简体中文](README_CN.md)
46
+
47
+ A CLI tool for scraping AI-related high-star repositories from GitHub and GitLab.
48
+
49
+ ## Features
50
+
51
+ - **Multi-platform support** - Scrape from GitHub or GitLab (including self-hosted instances)
52
+ - Search and filter AI-related repositories by keywords and topics
53
+ - **Dynamic keyword extraction** - Automatically learns new keywords from scraped repos
54
+ - **Markdown/HTML/Excel/RSS report generation** - Multiple export formats with Chinese translation
55
+ - **Incremental scraping** - Fetch only updated repos with `--since` flag
56
+ - **Resume support** - Continue interrupted scrapes with progress tracking
57
+ - **Progress bar display** - Visual progress during scraping
58
+ - **Interactive CLI mode** - Menu-driven interface for easy use
59
+ - **Concurrent scraping** - Parallel requests for faster results
60
+ - **Multi-language search** - Support for Chinese and English keywords
61
+ - Local SQLite storage with trend analysis
62
+ - Configurable filtering and scraping options
63
+ - Rate limiting with GitHub/GitLab API token support
64
+ - Export to CSV/JSON/HTML/Excel/RSS/Markdown formats
65
+ - **REST API server** - Access data via HTTP endpoints with optional authentication
66
+ - **Scheduled scraping** - Cron-based periodic scraping
67
+ - **Webhook notifications** - Notify external services on events
68
+ - **Plugin system** - Extend functionality with custom plugins
69
+ - **Repository health assessment** - Activity, popularity, maintenance scores
70
+ - **Intelligent classification** - LLM, CV, NLP, MLOps, AI Infrastructure categories
71
+ - **Deduplication** - Fork and mirror detection, content similarity
72
+ - **Secure token storage** - Encrypted storage for sensitive tokens
73
+ - **Database backup** - Automatic backup and restore functionality
74
+ - **Error recovery** - Retry logic with exponential backoff
75
+
76
+ ## Installation
77
+
78
+ ```bash
79
+ # Install from PyPI
80
+ pip install github-ai-scraper
81
+
82
+ # Or install from source for development
83
+ pip install -e ".[dev]"
84
+ ```
85
+
86
+ ## Quick Start
87
+
88
+ ```bash
89
+ # Set your GitHub token (optional, increases rate limit)
90
+ export GITHUB_TOKEN=your_token_here
91
+
92
+ # Scrape AI repositories from GitHub (default)
93
+ ai-scraper scrape
94
+
95
+ # Scrape from GitLab
96
+ ai-scraper scrape --platform gitlab
97
+
98
+ # Scrape from self-hosted GitLab
99
+ ai-scraper scrape --platform gitlab --gitlab-url https://your-gitlab.com/api/v4
100
+
101
+ # Scrape with progress bar
102
+ ai-scraper scrape --progress
103
+
104
+ # Concurrent scraping (faster)
105
+ ai-scraper scrape --concurrent
106
+
107
+ # Incremental scraping (repos updated in last 7 days)
108
+ ai-scraper scrape --incremental
109
+ ai-scraper scrape --since 7d
110
+
111
+ # Resume interrupted scrape
112
+ ai-scraper scrape --resume
113
+
114
+ # Interactive mode
115
+ ai-scraper interactive
116
+
117
+ # List scraped repositories
118
+ ai-scraper list
119
+
120
+ # Show trending repositories
121
+ ai-scraper trending
122
+
123
+ # Export data
124
+ ai-scraper db export --format html --output index.html
125
+ ai-scraper db export --format xlsx --output repos.xlsx
126
+ ai-scraper db export --format rss --output feed.xml
127
+ ai-scraper db export --format markdown --output repositories.md
128
+
129
+ # Start REST API server (with authentication)
130
+ ai-scraper serve --port 8080 --auth
131
+
132
+ # Schedule periodic scraping (daily at 9am)
133
+ ai-scraper schedule --cron "0 9 * * *"
134
+
135
+ # Backup database
136
+ ai-scraper db backup
137
+ ai-scraper db restore backup_file.db.gz
138
+ ```
139
+
140
+ ## Configuration
141
+
142
+ Create `ai-scraper.yaml` to customize:
143
+
144
+ ```yaml
145
+ github:
146
+   token: ${GITHUB_TOKEN}
147
+   cache_ttl: 3600
148
+
149
+ gitlab:
150
+   token: ${GITLAB_TOKEN} # Optional, for GitLab scraping
151
+   base_url: https://gitlab.com/api/v4 # Or your self-hosted GitLab URL
152
+   cache_ttl: 3600
153
+
154
+ filter:
155
+   min_stars: 100
156
+   keywords:
157
+     - ai
158
+     - machine-learning
159
+     - 人工智能 # Chinese keyword support
160
+   topics:
161
+     - ai
162
+     - deep-learning
163
+
164
+ scrape:
165
+   max_results: 500
166
+   concurrency: 5
167
+   concurrent_requests: 5
168
+
169
+ database:
170
+   path: ./data/ai_scraper.db
171
+   backup_dir: ./backups
172
+   max_backups: 10
173
+
174
+ api:
175
+   auth_enabled: true
176
+   api_keys:
177
+     - as_your_api_key_here
178
+
179
+ webhooks:
180
+   enabled: false
181
+   endpoints:
182
+     - url: https://hooks.slack.com/services/YOUR/WEBHOOK/URL
183
+       events: [scrape_complete, trending_found]
184
+ ```
185
+
186
+ ## Commands
187
+
188
+ | Command | Description |
189
+ |---------|-------------|
190
+ | `ai-scraper scrape` | Scrape AI repositories from GitHub |
191
+ | `ai-scraper scrape --platform gitlab` | Scrape from GitLab |
192
+ | `ai-scraper scrape --platform gitlab --gitlab-url URL` | Scrape from self-hosted GitLab |
193
+ | `ai-scraper scrape --concurrent` | Concurrent scraping for faster results |
194
+ | `ai-scraper scrape --incremental` | Incremental scraping (only updated repos) |
195
+ | `ai-scraper scrape --since 7d` | Fetch repos updated in last 7 days |
196
+ | `ai-scraper scrape --resume` | Resume interrupted scrape |
197
+ | `ai-scraper scrape --progress` | Show progress bar during scraping |
198
+ | `ai-scraper interactive` | Start interactive menu mode |
199
+ | `ai-scraper list` | List scraped repositories |
200
+ | `ai-scraper trending` | Show trending repositories by star growth |
201
+ | `ai-scraper serve` | Start REST API server |
202
+ | `ai-scraper serve --auth` | Start API server with authentication |
203
+ | `ai-scraper schedule` | Schedule periodic scraping |
204
+ | `ai-scraper keywords list` | List all keywords |
205
+ | `ai-scraper keywords extract` | Extract keywords from database |
206
+ | `ai-scraper keywords clear` | Clear keywords |
207
+ | `ai-scraper config init` | Initialize config file |
208
+ | `ai-scraper config show` | Show current config |
209
+ | `ai-scraper db stats` | Show database statistics |
210
+ | `ai-scraper db export` | Export data to CSV/JSON/HTML/Excel/RSS/Markdown |
211
+ | `ai-scraper db clean --invalid` | Remove repositories with invalid data |
212
+ | `ai-scraper db clean --vacuum` | Optimize database size |
213
+ | `ai-scraper db backup` | Create database backup |
214
+ | `ai-scraper db restore` | Restore from backup |
215
+ | `ai-scraper db backups` | List available backups |
216
+
217
+ ## REST API Endpoints
218
+
219
+ When running `ai-scraper serve`:
220
+
221
+ | Endpoint | Description |
222
+ |----------|-------------|
223
+ | `GET /api/repos` | List repositories with filters |
224
+ | `GET /api/repos/{id}` | Get specific repository |
225
+ | `GET /api/stats` | Get database statistics |
226
+ | `GET /api/trending` | Get trending repositories |
227
+ | `GET /api/search?q=...` | Search repositories |
228
+
229
+ Authentication: Pass the `X-API-Key` header with each request when `--auth` is enabled.
230
+
231
+ ## Project Structure
232
+
233
+ ```
234
+ github-ai-scraper/
235
+ ├── src/ai_scraper/
236
+ │ ├── cli.py # CLI entry point
237
+ │ ├── config.py # Configuration management
238
+ │ ├── interactive.py # Interactive menu mode
239
+ │ ├── classifier.py # Repository classification
240
+ │ ├── dedup.py # Deduplication utilities
241
+ │ ├── health.py # Health assessment
242
+ │ ├── scheduler.py # Task scheduling
243
+ │ ├── webhooks.py # Webhook notifications
244
+ │ ├── plugin_system.py # Plugin system
245
+ │ ├── logging_config.py # Logging configuration
246
+ │ ├── api_server.py # REST API server
247
+ │ ├── auth.py # API authentication
248
+ │ ├── retry.py # Error recovery
249
+ │ ├── i18n.py # Multi-language support
250
+ │ ├── scrape_progress.py # Resume support
251
+ │ ├── backup.py # Database backup
252
+ │ ├── config_watcher.py # Config hot reload
253
+ │ ├── secure_storage.py # Token encryption
254
+ │ ├── api/
255
+ │ │ ├── github.py # GitHub API client
256
+ │ │ └── rate_limiter.py # Token bucket rate limiter
257
+ │ ├── models/
258
+ │ │ └── repository.py # Data models (Pydantic)
259
+ │ ├── filters/
260
+ │ │ └── ai_filter.py # AI relevance filter
261
+ │ ├── output/
262
+ │ │ ├── markdown.py # Markdown exporter
263
+ │ │ ├── html.py # HTML exporter
264
+ │ │ ├── excel.py # Excel exporter
265
+ │ │ └── rss.py # RSS exporter
266
+ │ └── storage/
267
+ │ ├── database.py # SQLite storage (sync)
268
+ │ └── async_database.py # SQLite storage (async)
269
+ ├── plugins/ # Example plugins
270
+ ├── tests/ # Test suite
271
+ ├── Dockerfile # Docker support
272
+ ├── docker-compose.yml # Docker compose
273
+ ├── .github/workflows/ # CI/CD workflows
274
+ └── ai-scraper.yaml # Default configuration
275
+ ```
276
+
277
+ ## Development
278
+
279
+ ```bash
280
+ # Install dev dependencies
281
+ pip install -e ".[dev]"
282
+
283
+ # Run tests
284
+ pytest tests/ -v
285
+
286
+ # Build Docker image
287
+ docker build -t ai-scraper .
288
+ ```
289
+
290
+ ## API Rate Limits
291
+
292
+ - Without token: 60 requests/hour
293
+ - With token: 5000 requests/hour
294
+
295
+ These are the GitHub API limits. Set the `GITHUB_TOKEN` environment variable for the higher limit.
296
+
297
+ ## License
298
+
299
+ MIT
@@ -0,0 +1,44 @@
1
+ ai_scraper/__init__.py,sha256=XTkRTj8cn2kR9ZIfXmul1C4dFLFnLwKJLY3Rx1wTQtU,71
2
+ ai_scraper/api_server.py,sha256=_b22MexjtJqms4V0yV6N9iXfcPoxNErST496C4I09dw,5446
3
+ ai_scraper/auth.py,sha256=2OZh83A1bBhntLKImBzZn7M8rwMS6oR7piJ-gbqn_q0,1413
4
+ ai_scraper/backup.py,sha256=n3pL4yX5qNCHKGUk7we26sdzC85E-2M3iw7A67lKpwY,3410
5
+ ai_scraper/cache.py,sha256=NBUxaxpql4muB9IzQOz-f0Inpqn03dT9Bv7V5Hg3G7s,2736
6
+ ai_scraper/classifier.py,sha256=2Ny3h0ipIVoDE6JqhKemwKldTaEVcO95BRwPm0PNJeo,4910
7
+ ai_scraper/cli.py,sha256=2btSVlaftoHxLxVqr3c2Mis_zhhdacTmPGcQs5c3p_I,27649
8
+ ai_scraper/config.py,sha256=5DgBpzF1aUoNEgfCa7AQ9tLYYx_N4sACK4VLweqYU1M,7020
9
+ ai_scraper/config_watcher.py,sha256=te6Ot5-atJ6v49O9M73_XijblO6ze0S5sIgYyZY9i_A,2477
10
+ ai_scraper/dedup.py,sha256=QLFvJ8XlViAOkh_OxjpCYLlhEtIHrRbbEBRrLJl0ONw,4597
11
+ ai_scraper/health.py,sha256=nb8PdDoVti0RXbHOO3OJbxmOwNESJgherL2g2mYZF80,3887
12
+ ai_scraper/i18n.py,sha256=hKsN4L2wtwDmOMpu1BceyrYbIuDW5ksSd-_F_mqa8V8,4256
13
+ ai_scraper/interactive.py,sha256=cV8XCNzPR0luIHIOYV4QUgvG9DfCJPGtEWIROpPbkP8,2895
14
+ ai_scraper/logging_config.py,sha256=7gS-02VqtkDvtTfGdK7QzN_nXNEVaLqLBOxWwoOBGoQ,1963
15
+ ai_scraper/plugin_system.py,sha256=3Fa-mfRM4ekjB9YiXbtPDyDV4HvCjpMSWC3fI2Hw-xY,4025
16
+ ai_scraper/retry.py,sha256=dMvlft84RNm06ls04tiwUKaNzMJH3wSqUdNDW4RYUeo,3857
17
+ ai_scraper/scheduler.py,sha256=Ua2JWKh0Z6-FP-Mz5YqIzA0fjzf40x4fGir2LDkUgRI,2452
18
+ ai_scraper/scrape_progress.py,sha256=ADDEKhd1PwovDzLqf2YmM-FX7nzIYRjYORVrnwvrhqw,2764
19
+ ai_scraper/secure_storage.py,sha256=MS7FBcF2Q7W1U48S2fejGZpD0dVHz7zv37kBdDSZ5ko,3841
20
+ ai_scraper/webhooks.py,sha256=4xUmz-gpYbn40awv8IjzhI7kM4dSOD_sw7VIzgrwY00,2831
21
+ ai_scraper/api/__init__.py,sha256=V1hpP3MhnqaalXWaOpkvKVXJEylv9G_ymhmwwBuw7-k,176
22
+ ai_scraper/api/github.py,sha256=yxfXOmKfPVdkHXrw0Bbb-4RP7Zzvqd6ers5HH71smfA,11013
23
+ ai_scraper/api/gitlab.py,sha256=Hu0wN1un9eYAjkjTz_ZEyFZKDUCsxZ9XWGLPIN0mDlY,13548
24
+ ai_scraper/api/rate_limiter.py,sha256=2CFJiGOgb-wrkYiSg5d_9hNe8Ogry-sR0DDUBSxptpU,3768
25
+ ai_scraper/filters/__init__.py,sha256=mfhX3UlOkJnAZM3sey1VeGaNrVpHGzlbXi8BOyrIYaY,105
26
+ ai_scraper/filters/ai_filter.py,sha256=X83jyQhwblhYF7Q39SC1i_6SZSwqBxCQqUr84m40INs,3355
27
+ ai_scraper/keywords/__init__.py,sha256=JLjO0I1-4PmYD-p77CqvZoSTdKAlD2xpqUa322SLRiE,126
28
+ ai_scraper/keywords/extractor.py,sha256=LKOOwNLyMtKUy9_-kczOHbqX7jlAGba1dyKbO8O85ws,8791
29
+ ai_scraper/models/__init__.py,sha256=HrspqqVcdNtfSueLoZRVJFgi9CLApNUobXIGrbppYO0,203
30
+ ai_scraper/models/repository.py,sha256=cgsoxcNORIXQDe4H7UI5P8_JOgcrinibBMOfdvOmsi0,2452
31
+ ai_scraper/output/__init__.py,sha256=pUl1AdDtApfhLFh3TCyIPBwB6qjULgfrlBF1j2VG9-I,197
32
+ ai_scraper/output/excel.py,sha256=rLPyJHMHC4AjoNun90fYQsJipn6NaSTyidEUOkH_hhw,2848
33
+ ai_scraper/output/html.py,sha256=ibN8N5QY3JjJp2g0GqfXutEgOLwkDVZ3XwSm09Pk5gA,5710
34
+ ai_scraper/output/markdown.py,sha256=r26_NIjE5H1-6MPO2Fn-X8ycYIt0ZxwAv2WfYYPn1o0,10575
35
+ ai_scraper/output/rss.py,sha256=VCeo_eaa67YaX9YTx6yntdCZFuhEuUllsUO5TqTuYzo,2661
36
+ ai_scraper/output/translator.py,sha256=KbhTpjEvp2OfGafeKFTCBqd5Cv3ys5T8GpP7FnevOwk,9058
37
+ ai_scraper/plugins/__init__.py,sha256=F0Rs9cJSkm7zSF8hXCPDCjVqVzX2kxbgctxc_8ddlS8,171
38
+ ai_scraper/storage/__init__.py,sha256=W0EUJI3qlo17YVUH4TDde19OCfjLSjaDo5cerYcz3OM,111
39
+ ai_scraper/storage/async_database.py,sha256=Vvhwhl2a7vwq155QFHoy7UpSuTcSJ18MAF5wKAXP1EI,8976
40
+ ai_scraper/storage/database.py,sha256=tgPYgpYYG142BRrV2Ei0OIkbabhpb4CCJoi-a2Pk9Kg,14343
41
+ github_ai_scraper-0.1.2.dist-info/METADATA,sha256=SsjF9Ep_0QoA5nF-lY98L2XyBHd_3YW12lElb853FZ8,10407
42
+ github_ai_scraper-0.1.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
43
+ github_ai_scraper-0.1.2.dist-info/entry_points.txt,sha256=d_ztZfaEXPUmB9RrtsQlciR3IkfC0WXHxiJEhdT6dRU,51
44
+ github_ai_scraper-0.1.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ ai-scraper = ai_scraper.cli:main