scrapeer 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapeer-1.0.0/LICENSE.txt +21 -0
- scrapeer-1.0.0/PKG-INFO +148 -0
- scrapeer-1.0.0/README.md +119 -0
- scrapeer-1.0.0/scrapeer/__init__.py +11 -0
- scrapeer-1.0.0/scrapeer/http.py +227 -0
- scrapeer-1.0.0/scrapeer/scraper.py +134 -0
- scrapeer-1.0.0/scrapeer/udp.py +327 -0
- scrapeer-1.0.0/scrapeer/utils.py +76 -0
- scrapeer-1.0.0/scrapeer.egg-info/PKG-INFO +148 -0
- scrapeer-1.0.0/scrapeer.egg-info/SOURCES.txt +12 -0
- scrapeer-1.0.0/scrapeer.egg-info/dependency_links.txt +1 -0
- scrapeer-1.0.0/scrapeer.egg-info/top_level.txt +1 -0
- scrapeer-1.0.0/setup.cfg +4 -0
- scrapeer-1.0.0/setup.py +29 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 tboy1337
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
scrapeer-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scrapeer
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Essential Python library that scrapes HTTP(S) and UDP trackers for torrent information.
|
|
5
|
+
Home-page: https://github.com/tboy1337/scrapeer-py
|
|
6
|
+
Download-URL: https://github.com/tboy1337/scrapeer-py/releases/latest
|
|
7
|
+
Author: tboy1337
|
|
8
|
+
Author-email: obywhuie@anonaddy.com
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: torrent,torrents,scraper,scrapeer,torrent-scraper,torrent-scraping
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Requires-Python: >=3.6
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE.txt
|
|
17
|
+
Dynamic: author
|
|
18
|
+
Dynamic: author-email
|
|
19
|
+
Dynamic: classifier
|
|
20
|
+
Dynamic: description
|
|
21
|
+
Dynamic: description-content-type
|
|
22
|
+
Dynamic: download-url
|
|
23
|
+
Dynamic: home-page
|
|
24
|
+
Dynamic: keywords
|
|
25
|
+
Dynamic: license
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
Dynamic: requires-python
|
|
28
|
+
Dynamic: summary
|
|
29
|
+
|
|
30
|
+
# Scrapeer-py
|
|
31
|
+
|
|
32
|
+
A tiny Python library that lets you scrape HTTP(S) and UDP trackers for torrent information.
|
|
33
|
+
|
|
34
|
+
Scrapeer-py is a Python port of the original PHP [Scrapeer](https://github.com/torrentpier/scrapeer) library by [TorrentPier](https://github.com/torrentpier).
|
|
35
|
+
|
|
36
|
+
## Overview
|
|
37
|
+
|
|
38
|
+
Scrapeer-py allows you to retrieve peer information from BitTorrent trackers using both HTTP(S) and UDP protocols. It can fetch seeders, leechers, and completed download counts for multiple torrents from multiple trackers simultaneously.
|
|
39
|
+
|
|
40
|
+
## Features
|
|
41
|
+
|
|
42
|
+
- Support for both HTTP(S) and UDP tracker protocols
|
|
43
|
+
- Batch scraping of multiple infohashes at once (up to 64)
|
|
44
|
+
- Support for trackers with passkeys
|
|
45
|
+
- Optional announce mode for trackers that don't support scrape
|
|
46
|
+
- Configurable timeout settings
|
|
47
|
+
- Detailed error reporting
|
|
48
|
+
- Well-organized modular codebase
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install scrapeer
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from scrapeer import Scraper
|
|
60
|
+
|
|
61
|
+
# Initialize the scraper
|
|
62
|
+
scraper = Scraper()
|
|
63
|
+
|
|
64
|
+
# Define your infohashes and trackers
|
|
65
|
+
infohashes = [
|
|
66
|
+
"0123456789abcdef0123456789abcdef01234567",
|
|
67
|
+
"fedcba9876543210fedcba9876543210fedcba98"
|
|
68
|
+
]
|
|
69
|
+
|
|
70
|
+
trackers = [
|
|
71
|
+
"udp://tracker.example.com:80",
|
|
72
|
+
"http://tracker.example.org:6969/announce",
|
|
73
|
+
"https://private-tracker.example.net:443/YOUR_PASSKEY/announce"
|
|
74
|
+
]
|
|
75
|
+
|
|
76
|
+
# Get the results (timeout of 3 seconds per tracker)
|
|
77
|
+
results = scraper.scrape(
|
|
78
|
+
hashes=infohashes,
|
|
79
|
+
trackers=trackers,
|
|
80
|
+
timeout=3
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
# Print the results
|
|
84
|
+
for infohash, data in results.items():
|
|
85
|
+
print(f"Results for {infohash}:")
|
|
86
|
+
print(f" Seeders: {data['seeders']}")
|
|
87
|
+
print(f" Leechers: {data['leechers']}")
|
|
88
|
+
print(f" Completed: {data['completed']}")
|
|
89
|
+
|
|
90
|
+
# Check if there were any errors
|
|
91
|
+
if scraper.has_errors():
|
|
92
|
+
print("\nErrors:")
|
|
93
|
+
for error in scraper.get_errors():
|
|
94
|
+
print(f" {error}")
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Package Structure
|
|
98
|
+
|
|
99
|
+
Scrapeer-py is organized into the following modules:
|
|
100
|
+
|
|
101
|
+
- `scrapeer/` - Main package directory
|
|
102
|
+
- `__init__.py` - Package initialization that exports the Scraper class
|
|
103
|
+
- `scraper.py` - Main Scraper class implementation
|
|
104
|
+
- `http.py` - HTTP(S) protocol scraping functionality
|
|
105
|
+
- `udp.py` - UDP protocol scraping functionality
|
|
106
|
+
- `utils.py` - Utility functions used across the package
|
|
107
|
+
|
|
108
|
+
## API Reference
|
|
109
|
+
|
|
110
|
+
### `Scraper` class
|
|
111
|
+
|
|
112
|
+
#### `scrape(hashes, trackers, max_trackers=None, timeout=2, announce=False)`
|
|
113
|
+
|
|
114
|
+
Scrape trackers for torrent information.
|
|
115
|
+
|
|
116
|
+
- **Parameters**:
|
|
117
|
+
- `hashes`: List (>1) or string of infohash(es)
|
|
118
|
+
- `trackers`: List (>1) or string of tracker(s)
|
|
119
|
+
- `max_trackers`: (Optional) Maximum number of trackers to be scraped, Default all
|
|
120
|
+
- `timeout`: (Optional) Maximum time for each tracker scrape in seconds, Default 2
|
|
121
|
+
- `announce`: (Optional) Use announce instead of scrape, Default False
|
|
122
|
+
|
|
123
|
+
- **Returns**:
|
|
124
|
+
- Dictionary of results with infohashes as keys and stats as values
|
|
125
|
+
|
|
126
|
+
#### `has_errors()`
|
|
127
|
+
|
|
128
|
+
Checks if there are any errors.
|
|
129
|
+
|
|
130
|
+
- **Returns**:
|
|
131
|
+
- `bool`: True if errors are present, False otherwise
|
|
132
|
+
|
|
133
|
+
#### `get_errors()`
|
|
134
|
+
|
|
135
|
+
Returns all the errors that were logged.
|
|
136
|
+
|
|
137
|
+
- **Returns**:
|
|
138
|
+
- `list`: All the logged errors
|
|
139
|
+
|
|
140
|
+
## Limitations
|
|
141
|
+
|
|
142
|
+
- Maximum of 64 infohashes per request
|
|
143
|
+
- Minimum of 1 infohash per request
|
|
144
|
+
- Only supports BitTorrent trackers (HTTP(S) and UDP)
|
|
145
|
+
|
|
146
|
+
## License
|
|
147
|
+
|
|
148
|
+
This project is licensed under the MIT License - see the [LICENSE.txt](LICENSE.txt) file for details.
|
scrapeer-1.0.0/README.md
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# Scrapeer-py
|
|
2
|
+
|
|
3
|
+
A tiny Python library that lets you scrape HTTP(S) and UDP trackers for torrent information.
|
|
4
|
+
|
|
5
|
+
Scrapeer-py is a Python port of the original PHP [Scrapeer](https://github.com/torrentpier/scrapeer) library by [TorrentPier](https://github.com/torrentpier).
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
Scrapeer-py allows you to retrieve peer information from BitTorrent trackers using both HTTP(S) and UDP protocols. It can fetch seeders, leechers, and completed download counts for multiple torrents from multiple trackers simultaneously.
|
|
10
|
+
|
|
11
|
+
## Features
|
|
12
|
+
|
|
13
|
+
- Support for both HTTP(S) and UDP tracker protocols
|
|
14
|
+
- Batch scraping of multiple infohashes at once (up to 64)
|
|
15
|
+
- Support for trackers with passkeys
|
|
16
|
+
- Optional announce mode for trackers that don't support scrape
|
|
17
|
+
- Configurable timeout settings
|
|
18
|
+
- Detailed error reporting
|
|
19
|
+
- Well-organized modular codebase
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install scrapeer
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Usage
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from scrapeer import Scraper
|
|
31
|
+
|
|
32
|
+
# Initialize the scraper
|
|
33
|
+
scraper = Scraper()
|
|
34
|
+
|
|
35
|
+
# Define your infohashes and trackers
|
|
36
|
+
infohashes = [
|
|
37
|
+
"0123456789abcdef0123456789abcdef01234567",
|
|
38
|
+
"fedcba9876543210fedcba9876543210fedcba98"
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
trackers = [
|
|
42
|
+
"udp://tracker.example.com:80",
|
|
43
|
+
"http://tracker.example.org:6969/announce",
|
|
44
|
+
"https://private-tracker.example.net:443/YOUR_PASSKEY/announce"
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
# Get the results (timeout of 3 seconds per tracker)
|
|
48
|
+
results = scraper.scrape(
|
|
49
|
+
hashes=infohashes,
|
|
50
|
+
trackers=trackers,
|
|
51
|
+
timeout=3
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
# Print the results
|
|
55
|
+
for infohash, data in results.items():
|
|
56
|
+
print(f"Results for {infohash}:")
|
|
57
|
+
print(f" Seeders: {data['seeders']}")
|
|
58
|
+
print(f" Leechers: {data['leechers']}")
|
|
59
|
+
print(f" Completed: {data['completed']}")
|
|
60
|
+
|
|
61
|
+
# Check if there were any errors
|
|
62
|
+
if scraper.has_errors():
|
|
63
|
+
print("\nErrors:")
|
|
64
|
+
for error in scraper.get_errors():
|
|
65
|
+
print(f" {error}")
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Package Structure
|
|
69
|
+
|
|
70
|
+
Scrapeer-py is organized into the following modules:
|
|
71
|
+
|
|
72
|
+
- `scrapeer/` - Main package directory
|
|
73
|
+
- `__init__.py` - Package initialization that exports the Scraper class
|
|
74
|
+
- `scraper.py` - Main Scraper class implementation
|
|
75
|
+
- `http.py` - HTTP(S) protocol scraping functionality
|
|
76
|
+
- `udp.py` - UDP protocol scraping functionality
|
|
77
|
+
- `utils.py` - Utility functions used across the package
|
|
78
|
+
|
|
79
|
+
## API Reference
|
|
80
|
+
|
|
81
|
+
### `Scraper` class
|
|
82
|
+
|
|
83
|
+
#### `scrape(hashes, trackers, max_trackers=None, timeout=2, announce=False)`
|
|
84
|
+
|
|
85
|
+
Scrape trackers for torrent information.
|
|
86
|
+
|
|
87
|
+
- **Parameters**:
|
|
88
|
+
- `hashes`: A single infohash string, or a list of infohash strings
|
|
89
|
+
- `trackers`: A single tracker URL string, or a list of tracker URL strings
|
|
90
|
+
- `max_trackers`: (Optional) Maximum number of trackers to be scraped, Default all
|
|
91
|
+
- `timeout`: (Optional) Maximum time for each tracker scrape in seconds, Default 2
|
|
92
|
+
- `announce`: (Optional) Use announce instead of scrape, Default False
|
|
93
|
+
|
|
94
|
+
- **Returns**:
|
|
95
|
+
- Dictionary of results with infohashes as keys and stats as values
|
|
96
|
+
|
|
97
|
+
#### `has_errors()`
|
|
98
|
+
|
|
99
|
+
Checks if there are any errors.
|
|
100
|
+
|
|
101
|
+
- **Returns**:
|
|
102
|
+
- `bool`: True if errors are present, False otherwise
|
|
103
|
+
|
|
104
|
+
#### `get_errors()`
|
|
105
|
+
|
|
106
|
+
Returns all the errors that were logged.
|
|
107
|
+
|
|
108
|
+
- **Returns**:
|
|
109
|
+
- `list`: All the logged errors
|
|
110
|
+
|
|
111
|
+
## Limitations
|
|
112
|
+
|
|
113
|
+
- Maximum of 64 infohashes per request
|
|
114
|
+
- Minimum of 1 infohash per request
|
|
115
|
+
- Only supports BitTorrent trackers (HTTP(S) and UDP)
|
|
116
|
+
|
|
117
|
+
## License
|
|
118
|
+
|
|
119
|
+
This project is licensed under the MIT License - see the [LICENSE.txt](LICENSE.txt) file for details.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Scrapeer-py, a tiny Python library that lets you scrape
|
|
3
|
+
HTTP(S) and UDP trackers for torrent information.
|
|
4
|
+
|
|
5
|
+
Port of the original PHP Scrapeer library by TorrentPier.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .scraper import Scraper
|
|
9
|
+
|
|
10
|
+
__version__ = '1.0.0'
|
|
11
|
+
__all__ = ['Scraper']
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HTTP scraping functionality for Scrapeer.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import urllib.request
|
|
6
|
+
import urllib.parse
|
|
7
|
+
import re
|
|
8
|
+
import socket
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def scrape_http(infohashes, protocol, host, port, passkey, announce, timeout):
    """
    Query one HTTP(S) tracker and turn its reply into per-infohash stats.

    Args:
        infohashes: Infohash(es) to look up.
        protocol: URL scheme used to reach the tracker.
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.
        passkey: Passkey fragment forwarded to the URL builders.
        announce: When truthy, announce instead of scraping.
        timeout: Maximum time for the tracker request in seconds.

    Returns:
        dict: Whatever http_data() extracts from the tracker reply.
    """
    if not announce:
        # Scrape mode: build the URL first, then fetch it.
        url = http_query(infohashes, protocol, host, port, passkey)
        raw_reply = http_request(url, host, port, timeout)
    else:
        # Announce mode builds and sends its own request.
        raw_reply = http_announce(infohashes, protocol, host, port, passkey, timeout)

    return http_data(raw_reply, infohashes, host)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def http_query(infohashes, protocol, host, port, passkey):
    """
    Assemble the scrape URL for an HTTP(S) tracker.

    Args:
        infohashes: Hex-encoded infohash(es) to put in the query string.
        protocol: URL scheme used to reach the tracker.
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.
        passkey: Text appended directly after "/scrape" in the path.

    Returns:
        str: Fully qualified URL; no query string is added when there are
        no infohashes.
    """
    parsed = urllib.parse.urlparse(f"{protocol}://{host}:{port}/scrape{passkey}")
    base_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

    count = len(infohashes)
    if count == 0:
        return base_url

    # A single entry is taken by index, several are iterated in order.
    selected = [infohashes[0]] if count == 1 else infohashes

    # Each hash is sent as its raw 20 bytes, percent-encoded.
    params = [
        f"info_hash={urllib.parse.quote(bytes.fromhex(infohash))}"
        for infohash in selected
    ]
    return base_url + '?' + '&'.join(params)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def http_request(query, host, port, timeout):
    """
    Sends HTTP(S) request to the tracker

    The timeout is applied to this request only; the process-wide socket
    default timeout is left untouched. The response object is closed once
    its body has been read.

    Args:
        query: URL to the tracker.
        host: Domain or IP address of the tracker (used in error messages).
        port: Port number of the tracker (used in error messages).
        timeout: Maximum time for each tracker scrape in seconds.

    Returns:
        bytes: Raw response body from the tracker.

    Raises:
        Exception: "Connection error: <host>:<port> - <reason>" for any
            failure while building, sending or reading the request. The
            original exception is attached as the cause.
    """
    try:
        request = urllib.request.Request(
            query,
            headers={'User-Agent': 'Scrapeer-py/1.0.0'},
        )
        # Per-request timeout instead of socket.setdefaulttimeout(), which
        # would silently change every other socket in the process.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read()
    except Exception as e:
        raise Exception(f"Connection error: {host}:{port} - {str(e)}") from e
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def http_announce(infohashes, protocol, host, port, passkey, timeout):
    """
    Announces to the tracker instead of scraping

    Only a single infohash can be announced per call. The timeout is applied
    to this request only; the process-wide socket default timeout is left
    untouched, and the response is closed after reading.

    Args:
        infohashes: Sequence holding exactly one hex-encoded infohash.
        protocol: Protocol to use for the announce.
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.
        passkey: Text appended directly after "/announce" in the path.
        timeout: Maximum time for the tracker request in seconds.

    Returns:
        bytes: Raw response body from the tracker.

    Raises:
        Exception: When more than one infohash is given, or
            "Connection error: <host>:<port> - <reason>" when the request
            fails (original exception attached as the cause).
    """
    info = urllib.parse.urlparse(f"{protocol}://{host}:{port}/announce{passkey}")
    base_url = f"{info.scheme}://{info.netloc}{info.path}"

    if len(infohashes) > 1:
        raise Exception(f"Too many hashes for HTTP announce ({len(infohashes)}).")

    # Fixed placeholder client values: the announce is only used to read the
    # swarm counters, not to actually join the swarm.
    params = [
        f"info_hash={urllib.parse.quote(bytes.fromhex(infohashes[0]))}",
        "peer_id=test1234567891234567",
        "port=6889",
        "uploaded=0",
        "downloaded=0",
        "left=0",
        "compact=1",
    ]
    query = f"{base_url}?{'&'.join(params)}"

    try:
        request = urllib.request.Request(
            query,
            headers={'User-Agent': 'Scrapeer-py/1.0.0'},
        )
        # Per-request timeout instead of socket.setdefaulttimeout(), which
        # would silently change every other socket in the process.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read()
    except Exception as e:
        raise Exception(f"Connection error: {host}:{port} - {str(e)}") from e
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def http_data(response, infohashes, host):
    """
    Gets the data from HTTP(S) response

    Byte responses are decoded as latin-1 so every byte maps to exactly one
    character; this lets the raw 20-byte infohash keys of a bencoded scrape
    reply be located with ordinary string searches. (Calling str() on bytes
    would yield the "b'...'" repr, in which binary keys never match.)

    For each infohash the lookup order is:
      1. "<hex infohash>:d8:completei..e10:downloadedi..e10:incompletei..e"
      2. the same textual form without the "downloaded" entry
      3. the bencoded "files" dictionary, keyed by the raw 20-byte hash

    Args:
        response: Response from the tracker (bytes, bytearray or str).
        infohashes: Iterable of hex-encoded infohashes.
        host: Domain or IP address of the tracker (used in error messages).

    Returns:
        dict: Maps each infohash to a dict with integer 'seeders',
        'completed' and 'leechers' values. Counters absent from the
        tracker's entry are reported as 0.

    Raises:
        Exception: "Invalid scrape response from '<host>'." when the reply
            has no "files" dictionary, or "Failed to parse torrent data from
            '<host>'." when an infohash is not listed in it.
    """
    if isinstance(response, (bytes, bytearray)):
        data = bytes(response).decode('latin-1')
    else:
        data = str(response)

    results = {}
    pattern_all = r"d8:completei(\d+)e10:downloadedi(\d+)e10:incompletei(\d+)e"
    pattern_single = r"d8:completei(\d+)e10:incompletei(\d+)e"

    for infohash in infohashes:
        matches = re.search(f"{infohash}:{pattern_all}", data, re.IGNORECASE)
        if matches:
            results[infohash] = {
                'seeders': int(matches.group(1)),
                'completed': int(matches.group(2)),
                'leechers': int(matches.group(3)),
            }
            continue

        matches = re.search(f"{infohash}:{pattern_single}", data, re.IGNORECASE)
        if matches:
            results[infohash] = {
                'seeders': int(matches.group(1)),
                'completed': 0,
                'leechers': int(matches.group(2)),
            }
            continue

        files_at = data.find('d5:filesd')
        if files_at == -1:
            raise Exception(f"Invalid scrape response from '{host}'.")

        # Bencoded key: 20-byte string holding the raw hash, followed by the
        # 'd' that opens this torrent's stats dictionary.
        key = f"20:{bytes.fromhex(infohash).decode('latin-1')}d"
        start = data.find(key, files_at)
        if start == -1:
            raise Exception(f"Failed to parse torrent data from '{host}'.")

        results[infohash] = _http_read_stats(data, start + len(key))

    return results


def _http_read_stats(data, pos):
    """Read the integer counters of the bencoded dict whose entries start at pos."""
    stats = {'seeders': 0, 'completed': 0, 'leechers': 0}
    names = {'complete': 'seeders', 'downloaded': 'completed', 'incomplete': 'leechers'}
    length_prefix = re.compile(r"(\d+):")
    integer = re.compile(r"i(-?\d+)e")

    # Walk key/value pairs until the 'e' that closes this dictionary. Length
    # prefixes are honoured, so an 'e' inside a key such as "complete" cannot
    # end the entry early.
    while pos < len(data) and data[pos] != 'e':
        key_head = length_prefix.match(data, pos)
        if not key_head:
            break  # malformed entry: keep what was read so far
        key_end = key_head.end() + int(key_head.group(1))
        key = data[key_head.end():key_end].lower()
        pos = key_end

        value = integer.match(data, pos)
        if value:
            if key in names:
                stats[names[key]] = int(value.group(1))
            pos = value.end()
            continue

        value_head = length_prefix.match(data, pos)
        if not value_head:
            break  # nested list/dict or malformed value: stop reading
        pos = value_head.end() + int(value_head.group(1))  # skip string value

    return stats
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def get_information(data, start, end):
    """
    Extract the text lying between two markers.

    Args:
        data: Text to search.
        start: Marker that precedes the wanted text.
        end: Marker that follows the wanted text; only occurrences after
            the start marker are considered.

    Returns:
        str: The text between the first start marker and the next end
        marker, or None when either marker is missing.
    """
    try:
        begin = data.index(start) + len(start)
        finish = data.index(end, begin)
    except ValueError:
        # One of the markers is absent.
        return None

    return data[begin:finish]
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Main Scraper class for Scrapeer.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import urllib.parse
|
|
6
|
+
from .http import scrape_http
|
|
7
|
+
from .udp import scrape_udp
|
|
8
|
+
from .utils import normalize_infohashes, get_passkey
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Scraper:
    """
    The one and only class you'll ever need.

    Scrapes HTTP(S) and UDP trackers for torrent statistics and collects
    human-readable error messages along the way. Errors accumulate on the
    instance across scrape() calls.
    """

    VERSION = '1.0.0'  # Python port version

    def __init__(self):
        """
        Initialize the scraper.
        """
        self.errors = []      # error messages logged so far
        self.infohashes = []  # infohashes still waiting for a result
        self.timeout = 2      # per-tracker timeout in seconds

    def scrape(self, hashes, trackers, max_trackers=None, timeout=2, announce=False):
        """
        Initiates the scraper

        Trackers are tried in order until one of them answers for the
        pending infohashes or the tracker limit is reached. Invalid tracker
        URLs (missing scheme or host, or a malformed port) are skipped with
        an error message instead of aborting the whole scrape.

        Args:
            hashes: List (>1) or string of infohash(es).
            trackers: List (>1) or string of tracker(s).
            max_trackers: Optional. Maximum number of trackers to be scraped, Default all.
            timeout: Optional. Maximum time for each tracker scrape in seconds, Default 2.
            announce: Optional. Use announce instead of scrape, Default false.

        Returns:
            dict: Dictionary of results.
        """
        final_result = {}

        if not trackers:
            self.errors.append('No tracker specified, aborting.')
            return final_result
        if not isinstance(trackers, list):
            trackers = [trackers]

        if isinstance(timeout, int):
            self.timeout = timeout
        else:
            self.timeout = 2
            self.errors.append('Timeout must be an integer. Using default value.')

        try:
            # normalize_infohashes comes from .utils (not shown in this module).
            self.infohashes = normalize_infohashes(hashes, self.errors)
        except ValueError as e:
            self.errors.append(str(e))
            return final_result

        max_iterations = max_trackers if isinstance(max_trackers, int) else len(trackers)
        for index, tracker in enumerate(trackers):
            # try_scrape() empties self.infohashes on success, which ends the loop.
            if not self.infohashes or index >= max_iterations:
                break

            try:
                info = urllib.parse.urlparse(tracker)
                # Reading .port raises ValueError for a non-numeric or
                # out-of-range port; treat that as an invalid tracker.
                port = info.port or None
            except ValueError:
                self.errors.append(f'Skipping invalid tracker ({tracker}).')
                continue

            protocol = info.scheme
            host = info.netloc.split(':')[0]

            if not protocol or not host:
                self.errors.append(f'Skipping invalid tracker ({tracker}).')
                continue

            path = info.path or None
            passkey = get_passkey(path)

            result = self.try_scrape(protocol, host, port, passkey, announce)
            final_result.update(result)

        return final_result

    def try_scrape(self, protocol, host, port, passkey, announce):
        """
        Tries to scrape with a single tracker.

        The pending infohashes are taken off the instance before the attempt
        and put back if the attempt fails, so the next tracker can be tried.

        Args:
            protocol: Protocol of the tracker.
            host: Domain or address of the tracker.
            port: Optional. Port number of the tracker.
            passkey: Optional. Passkey provided in the scrape request.
            announce: Optional. Use announce instead of scrape, Default false.

        Returns:
            dict: Dictionary of results; empty when the attempt failed (the
            failure reason is appended to the error list).
        """
        infohashes = self.infohashes.copy()
        self.infohashes = []
        results = {}

        try:
            if protocol == 'udp':
                port = port if port else 80
                results = scrape_udp(infohashes, host, port, announce, self.timeout)
            elif protocol == 'http':
                port = port if port else 80
                results = scrape_http(infohashes, protocol, host, port, passkey, announce, self.timeout)
            elif protocol == 'https':
                port = port if port else 443
                results = scrape_http(infohashes, protocol, host, port, passkey, announce, self.timeout)
            else:
                raise Exception(f'Unsupported protocol ({protocol}://{host}).')
        except Exception as e:
            # Restore the pending hashes so another tracker can be tried.
            self.infohashes = infohashes
            self.errors.append(str(e))

        return results

    def has_errors(self):
        """
        Checks if there are any errors.

        Returns:
            bool: True if errors are present, False otherwise.
        """
        return bool(self.errors)

    def get_errors(self):
        """
        Returns all the errors that were logged.

        Returns:
            list: All the logged errors.
        """
        return self.errors
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
"""
|
|
2
|
+
UDP scraping functionality for Scrapeer.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import socket
|
|
6
|
+
import struct
|
|
7
|
+
import random
|
|
8
|
+
from .utils import random_peer_id, collect_info_hash
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def scrape_udp(infohashes, host, port, announce, timeout):
    """
    Run one complete UDP tracker exchange: connect, then scrape or announce.

    Args:
        infohashes: Infohash(es) to look up.
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.
        announce: When truthy, announce instead of scraping.
        timeout: Maximum time for each socket operation in seconds.

    Returns:
        dict: Result of udp_announce() or udp_scrape().
    """
    sock, _resolved_ip = prepare_udp(host, port)
    sock.settimeout(timeout)

    try:
        # Handshake: only the transaction ID of the request is needed here;
        # the connection ID to use comes from the tracker's reply.
        transaction_id, _initial_id = udp_connection_request(sock)
        connection_id = udp_connection_response(sock, transaction_id, host, port)

        if not announce:
            return udp_scrape(sock, infohashes, connection_id, transaction_id, host, port)
        return udp_announce(sock, infohashes, connection_id)
    finally:
        # The socket is always released, whatever the outcome.
        sock.close()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def prepare_udp(host, port):
    """
    Prepares the UDP socket

    Opens the socket via udp_create_connection() and resolves the host name.
    If resolution fails, the socket is closed before the error is raised so
    it does not leak.

    Args:
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.

    Returns:
        tuple: Tuple containing socket object and IP address.

    Raises:
        Exception: "Failed to resolve host '<host>'." when the host name
            cannot be resolved (original error attached as the cause).
    """
    socket_obj = udp_create_connection(host, port)

    try:
        ip = socket.gethostbyname(host)
    except socket.gaierror as e:
        socket_obj.close()
        raise Exception(f"Failed to resolve host '{host}'.") from e

    return socket_obj, ip
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def udp_create_connection(host, port):
    """
    Creates a UDP connection

    Creates an IPv4 datagram socket and connects it to the tracker address.
    If anything fails after the socket has been created, the socket is
    closed before the error propagates so it does not leak.

    Args:
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.

    Returns:
        socket: Connected socket object; the caller is responsible for
        closing it.

    Raises:
        Exception: "Failed to create socket for '<host>:<port>' - <reason>."
            on any socket error (original error attached as the cause).
    """
    socket_obj = None
    try:
        socket_obj = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        socket_obj.connect((host, port))
    except BaseException as e:
        if socket_obj is not None:
            socket_obj.close()
        if isinstance(e, socket.error):
            raise Exception(f"Failed to create socket for '{host}:{port}' - {str(e)}.") from e
        # Non-socket errors (e.g. bad argument types) propagate unchanged.
        raise

    return socket_obj
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def udp_connection_request(socket_obj):
    """
    Send the opening "connect" packet of the UDP tracker handshake.

    Args:
        socket_obj: Connected socket to send the packet on.

    Returns:
        tuple: (transaction_id, connection_id), where transaction_id is the
        random ID placed in the packet and connection_id is the fixed
        initial connection ID that was sent.
    """
    initial_connection_id = 0x41727101980  # fixed ID used before the tracker assigns one
    connect_action = 0  # 0 = connection, 1 = announce, 2 = scrape
    transaction_id = random.randint(0, 2147483647)

    # Big-endian: 64-bit connection ID, 32-bit action, 32-bit transaction ID.
    packet = struct.pack(">QII", initial_connection_id, connect_action, transaction_id)

    try:
        socket_obj.send(packet)
    except socket.error as e:
        raise Exception(f"Failed to send connection request - {str(e)}.")

    return transaction_id, initial_connection_id
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def udp_connection_response(socket_obj, transaction_id, host, port):
    """
    Read and validate the tracker's reply to the connect packet.

    Args:
        socket_obj: Socket the connect packet was sent on.
        transaction_id: Transaction ID that was sent in the connect packet.
        host: Domain or IP address of the tracker (used in error messages).
        port: Port number of the tracker (used in error messages).

    Returns:
        int: Connection ID taken from the reply.

    Raises:
        Exception: When receiving fails, the reply is not exactly 16 bytes,
            the transaction ID does not match, or the action is not 0.
    """
    try:
        reply = socket_obj.recv(16)
    except socket.error as e:
        raise Exception(f"Failed to receive connection response from '{host}:{port}' - {str(e)}.")

    if len(reply) != 16:
        raise Exception(f"Invalid response length from '{host}:{port}'.")

    # Big-endian: 32-bit action, 32-bit transaction ID, 64-bit connection ID.
    reply_action, reply_transaction_id, assigned_connection_id = struct.unpack(">IIQ", reply)

    # The transaction ID is checked before the action.
    if reply_transaction_id != transaction_id:
        raise Exception(f"Invalid transaction ID from '{host}:{port}'.")

    if reply_action != 0:
        raise Exception(f"Invalid action from '{host}:{port}'.")

    return assigned_connection_id
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def udp_scrape(socket_obj, hashes, connection_id, transaction_id, host, port):
    """
    Sends a scrape request

    Builds the scrape packet, sends it, validates the reply header and hands
    the reply to udp_scrape_data() for parsing. When the tracker answers
    with a different action (typically an error reply), the text following
    the 8-byte header is reported as the tracker's message.

    Args:
        socket_obj: Socket object.
        hashes: List (>1) or string of infohash(es).
        connection_id: Connection ID.
        transaction_id: Transaction ID.
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.

    Returns:
        dict: Dictionary of results.

    Raises:
        Exception: On socket errors, a reply shorter than 8 bytes, a
            mismatching transaction ID, or a reply whose action is not
            "scrape".
    """
    action = 2  # Action (2 = scrape)

    # Header (action + transaction ID) plus 12 bytes of counters per hash.
    expected_size = 8 + (12 * len(hashes))

    # Create scrape request
    buffer = udp_scrape_request(socket_obj, hashes, connection_id, transaction_id)

    try:
        # Send scrape request
        socket_obj.send(buffer)

        # Receive scrape response. Read with a generous buffer so that an
        # error reply's message is not cut off at the scrape-reply size.
        response = socket_obj.recv(max(expected_size, 2048))

        # Parse scrape response
        if len(response) < 8:
            raise Exception(f"Invalid scrape response from '{host}:{port}'.")

        return_action, return_transaction_id = struct.unpack(">II", response[:8])

        # Verify transaction ID
        if transaction_id != return_transaction_id:
            raise Exception(f"Invalid transaction ID from '{host}:{port}'.")

        # Verify action
        if return_action != action:
            # Bytes 4-8 hold the transaction ID, not an error code; the
            # tracker's message is whatever follows the header.
            message = response[8:].decode('utf-8', errors='replace')
            raise Exception(
                f"Tracker error (action {return_action}): '{message}' from '{host}:{port}'."
            )

        # Never pass more than the expected scrape payload to the parser.
        response = response[:expected_size]

        # Create keys array
        keys = list(hashes)

        # Parse results
        return udp_scrape_data(response, hashes, host, keys, 8, len(response), 12)
    except socket.error as e:
        raise Exception(f"Socket error from '{host}:{port}' - {str(e)}.") from e
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def udp_scrape_request(socket_obj, hashes, connection_id, transaction_id):
    """
    Builds the binary scrape request packet.

    Args:
        socket_obj: Socket object (not used while building the packet).
        hashes: Iterable of infohashes to include in the request.
        connection_id: Connection ID.
        transaction_id: Transaction ID.

    Returns:
        bytes: Header (connection ID, action, transaction ID) followed by
            each infohash converted with collect_info_hash().
    """
    scrape_action = 2  # Action (2 = scrape)

    # Big-endian header: 64-bit connection ID, 32-bit action, 32-bit transaction ID.
    header = struct.pack(">QII", connection_id, scrape_action, transaction_id)

    # Binary infohashes follow the header in the order they were given.
    payload = b"".join(collect_info_hash(entry) for entry in hashes)

    return header + payload
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def udp_announce(socket_obj, hashes, connection_id):
    """
    Sends an announce request for a single infohash.

    Args:
        socket_obj: Socket object; the request is sent through its send().
        hashes: Sequence holding exactly one infohash.
        connection_id: Connection ID.

    Returns:
        dict: Maps the infohash to its 'seeders', 'leechers' and
            'completed' values as returned by udp_verify_announce().

    Raises:
        Exception: If more than one hash is given or sending fails.
    """
    hash_count = len(hashes)
    if hash_count > 1:
        raise Exception(f"Too many hashes for UDP announce ({hash_count}).")

    transaction_id = random.randint(0, 2147483647)  # Random transaction ID
    target = hashes[0]

    # Field order must match the struct format string below.
    fields = (
        connection_id,
        1,                          # action (1 = announce)
        transaction_id,
        collect_info_hash(target),  # infohash
        random_peer_id(),           # peer_id
        0,                          # downloaded
        0,                          # left
        0,                          # uploaded
        0,                          # event
        0,                          # ip
        0,                          # key
        -1,                         # num_want
        6889,                       # port
    )
    packet = struct.pack(">QII20s20sQQQIIIiH", *fields)

    try:
        socket_obj.send(packet)
        stats = udp_verify_announce(socket_obj, transaction_id)

        summary = {
            'seeders': stats[0],
            'leechers': stats[1],
            'completed': stats[2],
        }
        return {target: summary}
    except socket.error as err:
        raise Exception(f"Failed to send announce request - {str(err)}.")
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def udp_verify_announce(socket_obj, transaction_id):
    """
    Receives and verifies an announce response.

    Args:
        socket_obj: Socket object; one datagram is read through its recv().
        transaction_id: Transaction ID that the response must echo back.

    Returns:
        tuple: (seeders, leechers, completed). Completed is always 0 because
            it is not read from the announce response.

    Raises:
        Exception: If receiving fails, the response is shorter than the
            20-byte header, the transaction ID does not match, or the
            action is not announce (1).
    """
    try:
        # An announce reply can carry a peer list after the 20-byte header
        # (BEP 15). A 20-byte buffer is therefore smaller than the datagram,
        # and some socket stacks (e.g. Winsock, WSAEMSGSIZE) raise an error
        # in that case instead of truncating. Read a whole datagram and
        # parse only the header.
        response = socket_obj.recv(65535)
    except socket.error as e:
        raise Exception(f"Failed to receive announce response - {str(e)}.") from e

    if len(response) < 20:
        raise Exception(f"Invalid announce response length ({len(response)}).")

    # Header layout: action, transaction ID, interval, leechers, seeders.
    return_action, return_transaction_id, _interval, leechers, seeders = struct.unpack(
        ">IIIII", response[:20]
    )

    if return_transaction_id != transaction_id:
        raise Exception(f"Invalid transaction ID ({return_transaction_id} != {transaction_id}).")

    if return_action != 1:
        raise Exception(f"Invalid action code ({return_action}).")

    return (seeders, leechers, 0)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def udp_scrape_data(response, hashes, host, keys, start, end, offset):
    """
    Extracts per-infohash statistics from a scrape response.

    Args:
        response: Raw response bytes from the tracker.
        hashes: Infohashes that were requested; one record is read per entry.
        host: Domain or IP address of the tracker (used in error messages).
        keys: Keys for the result dictionary, indexed in step with hashes.
        start: Position of the first record in the response.
        end: Position at which the record data ends.
        offset: Distance in bytes between consecutive records.

    Returns:
        dict: Maps each key to a dict with 'seeders', 'completed' and
            'leechers'.

    Raises:
        Exception: If the response does not hold enough data for every hash.
    """
    # Reject a response that cannot hold one record per requested hash.
    available = end - start
    required = len(hashes) * offset
    if available < required:
        raise Exception(f"Invalid scrape response from '{host}'.")

    parsed = {}
    for index, _ in enumerate(hashes):
        begin = start + (index * offset)
        stop = begin + 12

        # Each record is three 32-bit big-endian integers (12 bytes).
        if stop > end:
            raise Exception(f"Invalid scrape response from '{host}'.")

        seeders, completed, leechers = struct.unpack(">III", response[begin:stop])
        parsed[keys[index]] = {
            'seeders': seeders,
            'completed': completed,
            'leechers': leechers,
        }

    return parsed
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utility functions for Scrapeer.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import random
|
|
7
|
+
import binascii
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def normalize_infohashes(infohashes, errors):
    """
    Normalizes the given hashes

    Valid infohashes are lowercased; invalid entries are skipped and
    reported through ``errors``.

    Args:
        infohashes: A single infohash, or a list/tuple of infohashes.
        errors: List to append any errors to.

    Returns:
        list: Normalized (lowercase, 40 hex character) infohash(es).

    Raises:
        ValueError: If fewer than 1 or more than 64 valid infohashes remain.
    """
    if not isinstance(infohashes, (list, tuple)):
        infohashes = [infohashes]

    normalized = []
    for infohash in infohashes:
        # Non-string entries cannot be infohashes; report them like any other
        # invalid value instead of failing on .lower().
        is_text = isinstance(infohash, str)
        # Convert to lowercase for consistency
        candidate = infohash.lower() if is_text else infohash

        # fullmatch, not match with '$': '$' also matches just before a
        # trailing newline, which would let "<40 hex chars>\n" through.
        if is_text and re.fullmatch(r'[a-f0-9]{40}', candidate):
            normalized.append(candidate)
        else:
            errors.append(f'Invalid info hash skipped ({candidate}).')

    total_infohashes = len(normalized)
    if total_infohashes > 64 or total_infohashes < 1:
        raise ValueError(f'Invalid amount of valid infohashes ({total_infohashes}).')

    return normalized
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_passkey(path):
    """
    Extracts the passkey from a tracker path.

    The passkey is the first run of 32 letters/digits found in the path
    (case-insensitive).

    Args:
        path: Path from the scrape request.

    Returns:
        str: The passkey prefixed with '/', or an empty string when the path
            is empty or contains no passkey.
    """
    if not path:
        return ''

    found = re.search(r'[a-z0-9]{32}', path, re.IGNORECASE)
    if found is None:
        return ''

    return '/' + found.group(0)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def random_peer_id():
    """
    Generate a random peer_id.

    Returns:
        bytes: A 20-byte peer_id: the prefix ``-PY0001-`` followed by 12
            random decimal digits.
    """
    digits = ''.join(str(random.randint(0, 9)) for _ in range(12))
    # Encode the whole identifier. Encoding only the digits and then adding
    # the str prefix raises TypeError (str + bytes).
    return ('-PY0001-' + digits).encode()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def collect_info_hash(infohash):
    """
    Decodes a hexadecimal infohash into raw bytes.

    Args:
        infohash: Hexadecimal infohash to convert.

    Returns:
        bytes: Binary representation of the infohash.
    """
    raw_hash = binascii.a2b_hex(infohash)
    return raw_hash
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scrapeer
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Essential Python library that scrapes HTTP(S) and UDP trackers for torrent information.
|
|
5
|
+
Home-page: https://github.com/tboy1337/scrapeer-py
|
|
6
|
+
Download-URL: https://github.com/tboy1337/scrapeer-py/releases/latest
|
|
7
|
+
Author: tboy1337
|
|
8
|
+
Author-email: obywhuie@anonaddy.com
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: torrent,torrents,scraper,scrapeer,torrent-scraper,torrent-scraping
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Requires-Python: >=3.6
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE.txt
|
|
17
|
+
Dynamic: author
|
|
18
|
+
Dynamic: author-email
|
|
19
|
+
Dynamic: classifier
|
|
20
|
+
Dynamic: description
|
|
21
|
+
Dynamic: description-content-type
|
|
22
|
+
Dynamic: download-url
|
|
23
|
+
Dynamic: home-page
|
|
24
|
+
Dynamic: keywords
|
|
25
|
+
Dynamic: license
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
Dynamic: requires-python
|
|
28
|
+
Dynamic: summary
|
|
29
|
+
|
|
30
|
+
# Scrapeer-py
|
|
31
|
+
|
|
32
|
+
A tiny Python library that lets you scrape HTTP(S) and UDP trackers for torrent information.
|
|
33
|
+
|
|
34
|
+
Scrapeer-py is a Python port of the original PHP [Scrapeer](https://github.com/torrentpier/scrapeer) library by [TorrentPier](https://github.com/torrentpier).
|
|
35
|
+
|
|
36
|
+
## Overview
|
|
37
|
+
|
|
38
|
+
Scrapeer-py allows you to retrieve peer information from BitTorrent trackers using both HTTP(S) and UDP protocols. It can fetch seeders, leechers, and completed download counts for multiple torrents from multiple trackers simultaneously.
|
|
39
|
+
|
|
40
|
+
## Features
|
|
41
|
+
|
|
42
|
+
- Support for both HTTP(S) and UDP tracker protocols
|
|
43
|
+
- Batch scraping of multiple infohashes at once (up to 64)
|
|
44
|
+
- Support for trackers with passkeys
|
|
45
|
+
- Optional announce mode for trackers that don't support scrape
|
|
46
|
+
- Configurable timeout settings
|
|
47
|
+
- Detailed error reporting
|
|
48
|
+
- Well-organized modular codebase
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install scrapeer
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from scrapeer import Scraper
|
|
60
|
+
|
|
61
|
+
# Initialize the scraper
|
|
62
|
+
scraper = Scraper()
|
|
63
|
+
|
|
64
|
+
# Define your infohashes and trackers
|
|
65
|
+
infohashes = [
|
|
66
|
+
"0123456789abcdef0123456789abcdef01234567",
|
|
67
|
+
"fedcba9876543210fedcba9876543210fedcba98"
|
|
68
|
+
]
|
|
69
|
+
|
|
70
|
+
trackers = [
|
|
71
|
+
"udp://tracker.example.com:80",
|
|
72
|
+
"http://tracker.example.org:6969/announce",
|
|
73
|
+
"https://private-tracker.example.net:443/YOUR_PASSKEY/announce"
|
|
74
|
+
]
|
|
75
|
+
|
|
76
|
+
# Get the results (timeout of 3 seconds per tracker)
|
|
77
|
+
results = scraper.scrape(
|
|
78
|
+
hashes=infohashes,
|
|
79
|
+
trackers=trackers,
|
|
80
|
+
timeout=3
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
# Print the results
|
|
84
|
+
for infohash, data in results.items():
|
|
85
|
+
print(f"Results for {infohash}:")
|
|
86
|
+
print(f" Seeders: {data['seeders']}")
|
|
87
|
+
print(f" Leechers: {data['leechers']}")
|
|
88
|
+
print(f" Completed: {data['completed']}")
|
|
89
|
+
|
|
90
|
+
# Check if there were any errors
|
|
91
|
+
if scraper.has_errors():
|
|
92
|
+
print("\nErrors:")
|
|
93
|
+
for error in scraper.get_errors():
|
|
94
|
+
print(f" {error}")
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Package Structure
|
|
98
|
+
|
|
99
|
+
Scrapeer-py is organized into the following modules:
|
|
100
|
+
|
|
101
|
+
- `scrapeer/` - Main package directory
|
|
102
|
+
- `__init__.py` - Package initialization that exports the Scraper class
|
|
103
|
+
- `scraper.py` - Main Scraper class implementation
|
|
104
|
+
- `http.py` - HTTP(S) protocol scraping functionality
|
|
105
|
+
- `udp.py` - UDP protocol scraping functionality
|
|
106
|
+
- `utils.py` - Utility functions used across the package
|
|
107
|
+
|
|
108
|
+
## API Reference
|
|
109
|
+
|
|
110
|
+
### `Scraper` class
|
|
111
|
+
|
|
112
|
+
#### `scrape(hashes, trackers, max_trackers=None, timeout=2, announce=False)`
|
|
113
|
+
|
|
114
|
+
Scrape trackers for torrent information.
|
|
115
|
+
|
|
116
|
+
- **Parameters**:
|
|
117
|
+
- `hashes`: List (>1) or string of infohash(es)
|
|
118
|
+
- `trackers`: List (>1) or string of tracker(s)
|
|
119
|
+
- `max_trackers`: (Optional) Maximum number of trackers to be scraped, Default all
|
|
120
|
+
- `timeout`: (Optional) Maximum time for each tracker scrape in seconds, Default 2
|
|
121
|
+
- `announce`: (Optional) Use announce instead of scrape, Default False
|
|
122
|
+
|
|
123
|
+
- **Returns**:
|
|
124
|
+
- Dictionary of results with infohashes as keys and stats as values
|
|
125
|
+
|
|
126
|
+
#### `has_errors()`
|
|
127
|
+
|
|
128
|
+
Checks if there are any errors.
|
|
129
|
+
|
|
130
|
+
- **Returns**:
|
|
131
|
+
- `bool`: True if errors are present, False otherwise
|
|
132
|
+
|
|
133
|
+
#### `get_errors()`
|
|
134
|
+
|
|
135
|
+
Returns all the errors that were logged.
|
|
136
|
+
|
|
137
|
+
- **Returns**:
|
|
138
|
+
- `list`: All the logged errors
|
|
139
|
+
|
|
140
|
+
## Limitations
|
|
141
|
+
|
|
142
|
+
- Maximum of 64 infohashes per request
|
|
143
|
+
- Minimum of 1 infohash per request
|
|
144
|
+
- Only supports BitTorrent trackers (HTTP(S) and UDP)
|
|
145
|
+
|
|
146
|
+
## License
|
|
147
|
+
|
|
148
|
+
This project is licensed under the MIT License - see the [LICENSE.txt](LICENSE.txt) file for details.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE.txt
|
|
2
|
+
README.md
|
|
3
|
+
setup.py
|
|
4
|
+
scrapeer/__init__.py
|
|
5
|
+
scrapeer/http.py
|
|
6
|
+
scrapeer/scraper.py
|
|
7
|
+
scrapeer/udp.py
|
|
8
|
+
scrapeer/utils.py
|
|
9
|
+
scrapeer.egg-info/PKG-INFO
|
|
10
|
+
scrapeer.egg-info/SOURCES.txt
|
|
11
|
+
scrapeer.egg-info/dependency_links.txt
|
|
12
|
+
scrapeer.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
scrapeer
|
scrapeer-1.0.0/setup.cfg
ADDED
scrapeer-1.0.0/setup.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from setuptools import setup, find_packages

# Read the README explicitly as UTF-8 and close the handle afterwards, so the
# build does not depend on the platform's default encoding.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="scrapeer",
    version="1.0.0",
    description="Essential Python library that scrapes HTTP(S) and UDP trackers for torrent information.",
    author="tboy1337",
    author_email="obywhuie@anonaddy.com",
    url="https://github.com/tboy1337/scrapeer-py",
    download_url="https://github.com/tboy1337/scrapeer-py/releases/latest",
    license="MIT",
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.6",
    keywords=[
        "torrent",
        "torrents",
        "scraper",
        "scrapeer",
        "torrent-scraper",
        "torrent-scraping",
    ],
    long_description=long_description,
    long_description_content_type="text/markdown",
)
|