hltv-crawler 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hltv_crawler-0.1.0/PKG-INFO +95 -0
- hltv_crawler-0.1.0/README.md +72 -0
- hltv_crawler-0.1.0/hltv_crawler.egg-info/PKG-INFO +95 -0
- hltv_crawler-0.1.0/hltv_crawler.egg-info/SOURCES.txt +20 -0
- hltv_crawler-0.1.0/hltv_crawler.egg-info/dependency_links.txt +1 -0
- hltv_crawler-0.1.0/hltv_crawler.egg-info/entry_points.txt +2 -0
- hltv_crawler-0.1.0/hltv_crawler.egg-info/requires.txt +2 -0
- hltv_crawler-0.1.0/hltv_crawler.egg-info/top_level.txt +3 -0
- hltv_crawler-0.1.0/parser/__init__.py +17 -0
- hltv_crawler-0.1.0/parser/parsers.py +454 -0
- hltv_crawler-0.1.0/pyproject.toml +43 -0
- hltv_crawler-0.1.0/scraper/__init__.py +17 -0
- hltv_crawler-0.1.0/scraper/config.py +30 -0
- hltv_crawler-0.1.0/scraper/event_scraper.py +12 -0
- hltv_crawler-0.1.0/scraper/http_client.py +89 -0
- hltv_crawler-0.1.0/scraper/match_scraper.py +27 -0
- hltv_crawler-0.1.0/scraper/orchestrator.py +426 -0
- hltv_crawler-0.1.0/scraper/team_scraper.py +37 -0
- hltv_crawler-0.1.0/setup.cfg +4 -0
- hltv_crawler-0.1.0/storage/__init__.py +4 -0
- hltv_crawler-0.1.0/storage/database.py +319 -0
- hltv_crawler-0.1.0/storage/schema.py +136 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hltv-crawler
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: HLTV CS2 match data crawler — scrape matches, maps, player stats to SQLite
|
|
5
|
+
Author: kongerly
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/kongerly/hltv-crawler
|
|
8
|
+
Project-URL: Repository, https://github.com/kongerly/hltv-crawler
|
|
9
|
+
Project-URL: Issues, https://github.com/kongerly/hltv-crawler/issues
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Topic :: Games/Entertainment
|
|
13
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Requires-Dist: curl_cffi>=0.15
|
|
22
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
23
|
+
|
|
24
|
+
# HLTV CS2 Crawler
|
|
25
|
+
|
|
26
|
+
> CS2 match data crawler — scrape HLTV matches, maps, player stats to SQLite.
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
|
|
30
|
+
- **Cloudflare bypass** via `curl_cffi` (Chrome 124 impersonation)
|
|
31
|
+
- **Disk cache** — avoid re-downloading (24h TTL)
|
|
32
|
+
- **Rate limiting** — 2s between requests
|
|
33
|
+
- **Pagination** — automatically steps through paginated results pages
|
|
34
|
+
- **Date/event filtering** — scrape only what you need
|
|
35
|
+
- **Resume support** — stop and continue later (via `progress.json`)
|
|
36
|
+
- **Zero external deps for storage** — SQLite is Python stdlib
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# 1. Install
|
|
42
|
+
git clone git@github.com:kongerly/hltv-crawler.git
|
|
43
|
+
cd hltv-crawler
|
|
44
|
+
python -m venv .venv
|
|
45
|
+
.venv\Scripts\activate  # Windows (macOS/Linux: source .venv/bin/activate)
|
|
46
|
+
pip install -r requirements.txt
|
|
47
|
+
|
|
48
|
+
# 2. Crawl (use --max-pages first to test)
|
|
49
|
+
python crawl.py --max-pages=3
|
|
50
|
+
|
|
51
|
+
# 3. More examples
|
|
52
|
+
python crawl.py --max-pages=10 --start-date=2025-01-01
|
|
53
|
+
python crawl.py --event="IEM" --max-pages=5
|
|
54
|
+
python crawl.py --resume # Continue from last time
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Output
|
|
58
|
+
|
|
59
|
+
- `data/hltv.db` — SQLite database with 6 tables
|
|
60
|
+
- `data/raw/*.html` — cached HTML pages (can be deleted safely)
|
|
61
|
+
|
|
62
|
+
### Database tables
|
|
63
|
+
|
|
64
|
+
| Table | Description |
|
|
65
|
+
|-------|-------------|
|
|
66
|
+
| `events` | Tournaments & events |
|
|
67
|
+
| `teams` | Team rankings |
|
|
68
|
+
| `players` | Player roster info |
|
|
69
|
+
| `matches` | Match results (bo, scores, winner) |
|
|
70
|
+
| `maps` | Per-map scores + CT/T side rounds |
|
|
71
|
+
| `player_match_stats` | Per-player stats (rating, ADR, KAST, K/D) |
|
|
72
|
+
|
|
73
|
+
## Requirements
|
|
74
|
+
|
|
75
|
+
- Python 3.10+
|
|
76
|
+
- `curl_cffi` — HTTP client with TLS fingerprint spoofing
|
|
77
|
+
- `beautifulsoup4` — HTML parsing
|
|
78
|
+
|
|
79
|
+
## Use as a Library
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from pathlib import Path
|
|
83
|
+
from storage import Database
|
|
84
|
+
from scraper import HltvOrchestrator
|
|
85
|
+
|
|
86
|
+
with Database(Path("data/hltv.db")) as db:
|
|
87
|
+
db.create_tables()
|
|
88
|
+
with HltvOrchestrator(db) as orch:
|
|
89
|
+
counts = orch.run_full_pipeline(max_pages=5)
|
|
90
|
+
print(counts)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## License
|
|
94
|
+
|
|
95
|
+
MIT
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# HLTV CS2 Crawler
|
|
2
|
+
|
|
3
|
+
> CS2 match data crawler — scrape HLTV matches, maps, player stats to SQLite.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Cloudflare bypass** via `curl_cffi` (Chrome 124 impersonation)
|
|
8
|
+
- **Disk cache** — avoid re-downloading (24h TTL)
|
|
9
|
+
- **Rate limiting** — 2s between requests
|
|
10
|
+
- **Pagination** — automatically steps through paginated results pages
|
|
11
|
+
- **Date/event filtering** — scrape only what you need
|
|
12
|
+
- **Resume support** — stop and continue later (via `progress.json`)
|
|
13
|
+
- **Zero external deps for storage** — SQLite is Python stdlib
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# 1. Install
|
|
19
|
+
git clone git@github.com:kongerly/hltv-crawler.git
|
|
20
|
+
cd hltv-crawler
|
|
21
|
+
python -m venv .venv
|
|
22
|
+
.venv\Scripts\activate  # Windows (macOS/Linux: source .venv/bin/activate)
|
|
23
|
+
pip install -r requirements.txt
|
|
24
|
+
|
|
25
|
+
# 2. Crawl (use --max-pages first to test)
|
|
26
|
+
python crawl.py --max-pages=3
|
|
27
|
+
|
|
28
|
+
# 3. More examples
|
|
29
|
+
python crawl.py --max-pages=10 --start-date=2025-01-01
|
|
30
|
+
python crawl.py --event="IEM" --max-pages=5
|
|
31
|
+
python crawl.py --resume # Continue from last time
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Output
|
|
35
|
+
|
|
36
|
+
- `data/hltv.db` — SQLite database with 6 tables
|
|
37
|
+
- `data/raw/*.html` — cached HTML pages (can be deleted safely)
|
|
38
|
+
|
|
39
|
+
### Database tables
|
|
40
|
+
|
|
41
|
+
| Table | Description |
|
|
42
|
+
|-------|-------------|
|
|
43
|
+
| `events` | Tournaments & events |
|
|
44
|
+
| `teams` | Team rankings |
|
|
45
|
+
| `players` | Player roster info |
|
|
46
|
+
| `matches` | Match results (bo, scores, winner) |
|
|
47
|
+
| `maps` | Per-map scores + CT/T side rounds |
|
|
48
|
+
| `player_match_stats` | Per-player stats (rating, ADR, KAST, K/D) |
|
|
49
|
+
|
|
50
|
+
## Requirements
|
|
51
|
+
|
|
52
|
+
- Python 3.10+
|
|
53
|
+
- `curl_cffi` — HTTP client with TLS fingerprint spoofing
|
|
54
|
+
- `beautifulsoup4` — HTML parsing
|
|
55
|
+
|
|
56
|
+
## Use as a Library
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from pathlib import Path
|
|
60
|
+
from storage import Database
|
|
61
|
+
from scraper import HltvOrchestrator
|
|
62
|
+
|
|
63
|
+
with Database(Path("data/hltv.db")) as db:
|
|
64
|
+
db.create_tables()
|
|
65
|
+
with HltvOrchestrator(db) as orch:
|
|
66
|
+
counts = orch.run_full_pipeline(max_pages=5)
|
|
67
|
+
print(counts)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## License
|
|
71
|
+
|
|
72
|
+
MIT
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hltv-crawler
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: HLTV CS2 match data crawler — scrape matches, maps, player stats to SQLite
|
|
5
|
+
Author: kongerly
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/kongerly/hltv-crawler
|
|
8
|
+
Project-URL: Repository, https://github.com/kongerly/hltv-crawler
|
|
9
|
+
Project-URL: Issues, https://github.com/kongerly/hltv-crawler/issues
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Topic :: Games/Entertainment
|
|
13
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Requires-Dist: curl_cffi>=0.15
|
|
22
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
23
|
+
|
|
24
|
+
# HLTV CS2 Crawler
|
|
25
|
+
|
|
26
|
+
> CS2 match data crawler — scrape HLTV matches, maps, player stats to SQLite.
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
|
|
30
|
+
- **Cloudflare bypass** via `curl_cffi` (Chrome 124 impersonation)
|
|
31
|
+
- **Disk cache** — avoid re-downloading (24h TTL)
|
|
32
|
+
- **Rate limiting** — 2s between requests
|
|
33
|
+
- **Pagination** — automatically steps through paginated results pages
|
|
34
|
+
- **Date/event filtering** — scrape only what you need
|
|
35
|
+
- **Resume support** — stop and continue later (via `progress.json`)
|
|
36
|
+
- **Zero external deps for storage** — SQLite is Python stdlib
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# 1. Install
|
|
42
|
+
git clone git@github.com:kongerly/hltv-crawler.git
|
|
43
|
+
cd hltv-crawler
|
|
44
|
+
python -m venv .venv
|
|
45
|
+
.venv\Scripts\activate  # Windows (macOS/Linux: source .venv/bin/activate)
|
|
46
|
+
pip install -r requirements.txt
|
|
47
|
+
|
|
48
|
+
# 2. Crawl (use --max-pages first to test)
|
|
49
|
+
python crawl.py --max-pages=3
|
|
50
|
+
|
|
51
|
+
# 3. More examples
|
|
52
|
+
python crawl.py --max-pages=10 --start-date=2025-01-01
|
|
53
|
+
python crawl.py --event="IEM" --max-pages=5
|
|
54
|
+
python crawl.py --resume # Continue from last time
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Output
|
|
58
|
+
|
|
59
|
+
- `data/hltv.db` — SQLite database with 6 tables
|
|
60
|
+
- `data/raw/*.html` — cached HTML pages (can be deleted safely)
|
|
61
|
+
|
|
62
|
+
### Database tables
|
|
63
|
+
|
|
64
|
+
| Table | Description |
|
|
65
|
+
|-------|-------------|
|
|
66
|
+
| `events` | Tournaments & events |
|
|
67
|
+
| `teams` | Team rankings |
|
|
68
|
+
| `players` | Player roster info |
|
|
69
|
+
| `matches` | Match results (bo, scores, winner) |
|
|
70
|
+
| `maps` | Per-map scores + CT/T side rounds |
|
|
71
|
+
| `player_match_stats` | Per-player stats (rating, ADR, KAST, K/D) |
|
|
72
|
+
|
|
73
|
+
## Requirements
|
|
74
|
+
|
|
75
|
+
- Python 3.10+
|
|
76
|
+
- `curl_cffi` — HTTP client with TLS fingerprint spoofing
|
|
77
|
+
- `beautifulsoup4` — HTML parsing
|
|
78
|
+
|
|
79
|
+
## Use as a Library
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from pathlib import Path
|
|
83
|
+
from storage import Database
|
|
84
|
+
from scraper import HltvOrchestrator
|
|
85
|
+
|
|
86
|
+
with Database(Path("data/hltv.db")) as db:
|
|
87
|
+
db.create_tables()
|
|
88
|
+
with HltvOrchestrator(db) as orch:
|
|
89
|
+
counts = orch.run_full_pipeline(max_pages=5)
|
|
90
|
+
print(counts)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## License
|
|
94
|
+
|
|
95
|
+
MIT
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
hltv_crawler.egg-info/PKG-INFO
|
|
4
|
+
hltv_crawler.egg-info/SOURCES.txt
|
|
5
|
+
hltv_crawler.egg-info/dependency_links.txt
|
|
6
|
+
hltv_crawler.egg-info/entry_points.txt
|
|
7
|
+
hltv_crawler.egg-info/requires.txt
|
|
8
|
+
hltv_crawler.egg-info/top_level.txt
|
|
9
|
+
parser/__init__.py
|
|
10
|
+
parser/parsers.py
|
|
11
|
+
scraper/__init__.py
|
|
12
|
+
scraper/config.py
|
|
13
|
+
scraper/event_scraper.py
|
|
14
|
+
scraper/http_client.py
|
|
15
|
+
scraper/match_scraper.py
|
|
16
|
+
scraper/orchestrator.py
|
|
17
|
+
scraper/team_scraper.py
|
|
18
|
+
storage/__init__.py
|
|
19
|
+
storage/database.py
|
|
20
|
+
storage/schema.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""HLTV Data Parsers — extract structured data from HTML pages."""
|
|
2
|
+
|
|
3
|
+
from parser.parsers import (
|
|
4
|
+
parse_events_page,
|
|
5
|
+
parse_results_page,
|
|
6
|
+
parse_match_detail,
|
|
7
|
+
parse_team_ranking_page,
|
|
8
|
+
parse_ranking_players,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"parse_events_page",
|
|
13
|
+
"parse_results_page",
|
|
14
|
+
"parse_match_detail",
|
|
15
|
+
"parse_team_ranking_page",
|
|
16
|
+
"parse_ranking_players",
|
|
17
|
+
]
|