hltv-crawler 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.4
2
+ Name: hltv-crawler
3
+ Version: 0.1.0
4
+ Summary: HLTV CS2 match data crawler — scrape matches, maps, player stats to SQLite
5
+ Author: kongerly
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/kongerly/hltv-crawler
8
+ Project-URL: Repository, https://github.com/kongerly/hltv-crawler
9
+ Project-URL: Issues, https://github.com/kongerly/hltv-crawler/issues
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Topic :: Games/Entertainment
13
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ Requires-Dist: curl_cffi>=0.15
22
+ Requires-Dist: beautifulsoup4>=4.12
23
+
24
+ # HLTV CS2 Crawler
25
+
26
+ > CS2 match data crawler — scrape HLTV matches, maps, player stats to SQLite.
27
+
28
+ ## Features
29
+
30
+ - **Cloudflare bypass** via `curl_cffi` (Chrome 124 impersonation)
31
+ - **Disk cache** — avoid re-downloading (24h TTL)
32
+ - **Rate limiting** — 2s between requests
33
+ - **Pagination** — auto-scroll through results pages
34
+ - **Date/event filtering** — scrape only what you need
35
+ - **Resume support** — stop and continue later (via `progress.json`)
36
+ - **Zero external deps for storage** — SQLite is Python stdlib
37
+
38
+ ## Quick Start
39
+
40
+ ```bash
41
+ # 1. Install
42
+ git clone git@github.com:kongerly/hltv-crawler.git
43
+ cd hltv-crawler
44
+ python -m venv .venv
45
+ .venv\Scripts\activate # Windows (macOS/Linux: source .venv/bin/activate)
46
+ pip install -r requirements.txt
47
+
48
+ # 2. Crawl (use --max-pages first to test)
49
+ python crawl.py --max-pages=3
50
+
51
+ # 3. More examples
52
+ python crawl.py --max-pages=10 --start-date=2025-01-01
53
+ python crawl.py --event="IEM" --max-pages=5
54
+ python crawl.py --resume # Continue from last time
55
+ ```
56
+
57
+ ## Output
58
+
59
+ - `data/hltv.db` — SQLite database with 6 tables
60
+ - `data/raw/*.html` — cached HTML pages (can be deleted safely)
61
+
62
+ ### Database tables
63
+
64
+ | Table | Description |
65
+ |-------|-------------|
66
+ | `events` | Tournaments & events |
67
+ | `teams` | Team rankings |
68
+ | `players` | Player roster info |
69
+ | `matches` | Match results (bo, scores, winner) |
70
+ | `maps` | Per-map scores + CT/T side rounds |
71
+ | `player_match_stats` | Per-player stats (rating, ADR, KAST, K/D) |
72
+
73
+ ## Requirements
74
+
75
+ - Python 3.10+
76
+ - `curl_cffi` — HTTP client with TLS fingerprint spoofing
77
+ - `beautifulsoup4` — HTML parsing
78
+
79
+ ## Use as a Library
80
+
81
+ ```python
82
+ from pathlib import Path
83
+ from storage import Database
84
+ from scraper import HltvOrchestrator
85
+
86
+ with Database(Path("data/hltv.db")) as db:
87
+     db.create_tables()
88
+     with HltvOrchestrator(db) as orch:
89
+         counts = orch.run_full_pipeline(max_pages=5)
90
+         print(counts)
91
+ ```
92
+
93
+ ## License
94
+
95
+ MIT
@@ -0,0 +1,72 @@
1
+ # HLTV CS2 Crawler
2
+
3
+ > CS2 match data crawler — scrape HLTV matches, maps, player stats to SQLite.
4
+
5
+ ## Features
6
+
7
+ - **Cloudflare bypass** via `curl_cffi` (Chrome 124 impersonation)
8
+ - **Disk cache** — avoid re-downloading (24h TTL)
9
+ - **Rate limiting** — 2s between requests
10
+ - **Pagination** — auto-scroll through results pages
11
+ - **Date/event filtering** — scrape only what you need
12
+ - **Resume support** — stop and continue later (via `progress.json`)
13
+ - **Zero external deps for storage** — SQLite is Python stdlib
14
+
15
+ ## Quick Start
16
+
17
+ ```bash
18
+ # 1. Install
19
+ git clone git@github.com:kongerly/hltv-crawler.git
20
+ cd hltv-crawler
21
+ python -m venv .venv
22
+ .venv\Scripts\activate # Windows (macOS/Linux: source .venv/bin/activate)
23
+ pip install -r requirements.txt
24
+
25
+ # 2. Crawl (use --max-pages first to test)
26
+ python crawl.py --max-pages=3
27
+
28
+ # 3. More examples
29
+ python crawl.py --max-pages=10 --start-date=2025-01-01
30
+ python crawl.py --event="IEM" --max-pages=5
31
+ python crawl.py --resume # Continue from last time
32
+ ```
33
+
34
+ ## Output
35
+
36
+ - `data/hltv.db` — SQLite database with 6 tables
37
+ - `data/raw/*.html` — cached HTML pages (can be deleted safely)
38
+
39
+ ### Database tables
40
+
41
+ | Table | Description |
42
+ |-------|-------------|
43
+ | `events` | Tournaments & events |
44
+ | `teams` | Team rankings |
45
+ | `players` | Player roster info |
46
+ | `matches` | Match results (bo, scores, winner) |
47
+ | `maps` | Per-map scores + CT/T side rounds |
48
+ | `player_match_stats` | Per-player stats (rating, ADR, KAST, K/D) |
49
+
50
+ ## Requirements
51
+
52
+ - Python 3.10+
53
+ - `curl_cffi` — HTTP client with TLS fingerprint spoofing
54
+ - `beautifulsoup4` — HTML parsing
55
+
56
+ ## Use as a Library
57
+
58
+ ```python
59
+ from pathlib import Path
60
+ from storage import Database
61
+ from scraper import HltvOrchestrator
62
+
63
+ with Database(Path("data/hltv.db")) as db:
64
+     db.create_tables()
65
+     with HltvOrchestrator(db) as orch:
66
+         counts = orch.run_full_pipeline(max_pages=5)
67
+         print(counts)
68
+ ```
69
+
70
+ ## License
71
+
72
+ MIT
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.4
2
+ Name: hltv-crawler
3
+ Version: 0.1.0
4
+ Summary: HLTV CS2 match data crawler — scrape matches, maps, player stats to SQLite
5
+ Author: kongerly
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/kongerly/hltv-crawler
8
+ Project-URL: Repository, https://github.com/kongerly/hltv-crawler
9
+ Project-URL: Issues, https://github.com/kongerly/hltv-crawler/issues
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Topic :: Games/Entertainment
13
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ Requires-Dist: curl_cffi>=0.15
22
+ Requires-Dist: beautifulsoup4>=4.12
23
+
24
+ # HLTV CS2 Crawler
25
+
26
+ > CS2 match data crawler — scrape HLTV matches, maps, player stats to SQLite.
27
+
28
+ ## Features
29
+
30
+ - **Cloudflare bypass** via `curl_cffi` (Chrome 124 impersonation)
31
+ - **Disk cache** — avoid re-downloading (24h TTL)
32
+ - **Rate limiting** — 2s between requests
33
+ - **Pagination** — auto-scroll through results pages
34
+ - **Date/event filtering** — scrape only what you need
35
+ - **Resume support** — stop and continue later (via `progress.json`)
36
+ - **Zero external deps for storage** — SQLite is Python stdlib
37
+
38
+ ## Quick Start
39
+
40
+ ```bash
41
+ # 1. Install
42
+ git clone git@github.com:kongerly/hltv-crawler.git
43
+ cd hltv-crawler
44
+ python -m venv .venv
45
+ .venv\Scripts\activate # Windows (macOS/Linux: source .venv/bin/activate)
46
+ pip install -r requirements.txt
47
+
48
+ # 2. Crawl (use --max-pages first to test)
49
+ python crawl.py --max-pages=3
50
+
51
+ # 3. More examples
52
+ python crawl.py --max-pages=10 --start-date=2025-01-01
53
+ python crawl.py --event="IEM" --max-pages=5
54
+ python crawl.py --resume # Continue from last time
55
+ ```
56
+
57
+ ## Output
58
+
59
+ - `data/hltv.db` — SQLite database with 6 tables
60
+ - `data/raw/*.html` — cached HTML pages (can be deleted safely)
61
+
62
+ ### Database tables
63
+
64
+ | Table | Description |
65
+ |-------|-------------|
66
+ | `events` | Tournaments & events |
67
+ | `teams` | Team rankings |
68
+ | `players` | Player roster info |
69
+ | `matches` | Match results (bo, scores, winner) |
70
+ | `maps` | Per-map scores + CT/T side rounds |
71
+ | `player_match_stats` | Per-player stats (rating, ADR, KAST, K/D) |
72
+
73
+ ## Requirements
74
+
75
+ - Python 3.10+
76
+ - `curl_cffi` — HTTP client with TLS fingerprint spoofing
77
+ - `beautifulsoup4` — HTML parsing
78
+
79
+ ## Use as a Library
80
+
81
+ ```python
82
+ from pathlib import Path
83
+ from storage import Database
84
+ from scraper import HltvOrchestrator
85
+
86
+ with Database(Path("data/hltv.db")) as db:
87
+     db.create_tables()
88
+     with HltvOrchestrator(db) as orch:
89
+         counts = orch.run_full_pipeline(max_pages=5)
90
+         print(counts)
91
+ ```
92
+
93
+ ## License
94
+
95
+ MIT
@@ -0,0 +1,20 @@
1
+ README.md
2
+ pyproject.toml
3
+ hltv_crawler.egg-info/PKG-INFO
4
+ hltv_crawler.egg-info/SOURCES.txt
5
+ hltv_crawler.egg-info/dependency_links.txt
6
+ hltv_crawler.egg-info/entry_points.txt
7
+ hltv_crawler.egg-info/requires.txt
8
+ hltv_crawler.egg-info/top_level.txt
9
+ parser/__init__.py
10
+ parser/parsers.py
11
+ scraper/__init__.py
12
+ scraper/config.py
13
+ scraper/event_scraper.py
14
+ scraper/http_client.py
15
+ scraper/match_scraper.py
16
+ scraper/orchestrator.py
17
+ scraper/team_scraper.py
18
+ storage/__init__.py
19
+ storage/database.py
20
+ storage/schema.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ hltv-crawler = crawl:main
@@ -0,0 +1,2 @@
1
+ curl_cffi>=0.15
2
+ beautifulsoup4>=4.12
@@ -0,0 +1,3 @@
1
+ parser
2
+ scraper
3
+ storage
@@ -0,0 +1,17 @@
1
+ """HLTV Data Parsers — extract structured data from HTML pages."""
2
+
3
+ from parser.parsers import (
4
+ parse_events_page,
5
+ parse_results_page,
6
+ parse_match_detail,
7
+ parse_team_ranking_page,
8
+ parse_ranking_players,
9
+ )
10
+
11
+ __all__ = [
12
+ "parse_events_page",
13
+ "parse_results_page",
14
+ "parse_match_detail",
15
+ "parse_team_ranking_page",
16
+ "parse_ranking_players",
17
+ ]