scrapeer 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 tboy1337
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,148 @@
1
+ Metadata-Version: 2.4
2
+ Name: scrapeer
3
+ Version: 1.0.0
4
+ Summary: Essential Python library that scrapes HTTP(S) and UDP trackers for torrent information.
5
+ Home-page: https://github.com/tboy1337/scrapeer-py
6
+ Download-URL: https://github.com/tboy1337/scrapeer-py/releases/latest
7
+ Author: tboy1337
8
+ Author-email: obywhuie@anonaddy.com
9
+ License: MIT
10
+ Keywords: torrent,torrents,scraper,scrapeer,torrent-scraper,torrent-scraping
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Requires-Python: >=3.6
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE.txt
17
+ Dynamic: author
18
+ Dynamic: author-email
19
+ Dynamic: classifier
20
+ Dynamic: description
21
+ Dynamic: description-content-type
22
+ Dynamic: download-url
23
+ Dynamic: home-page
24
+ Dynamic: keywords
25
+ Dynamic: license
26
+ Dynamic: license-file
27
+ Dynamic: requires-python
28
+ Dynamic: summary
29
+
30
+ # Scrapeer-py
31
+
32
+ A tiny Python library that lets you scrape HTTP(S) and UDP trackers for torrent information.
33
+
34
+ Scrapeer-py is a Python port of the original PHP [Scrapeer](https://github.com/torrentpier/scrapeer) library by [TorrentPier](https://github.com/torrentpier).
35
+
36
+ ## Overview
37
+
38
+ Scrapeer-py allows you to retrieve peer information from BitTorrent trackers using both HTTP(S) and UDP protocols. It can fetch seeders, leechers, and completed download counts for multiple torrents from multiple trackers simultaneously.
39
+
40
+ ## Features
41
+
42
+ - Support for both HTTP(S) and UDP tracker protocols
43
+ - Batch scraping of multiple infohashes at once (up to 64)
44
+ - Support for trackers with passkeys
45
+ - Optional announce mode for trackers that don't support scrape
46
+ - Configurable timeout settings
47
+ - Detailed error reporting
48
+ - Well-organized modular codebase
49
+
50
+ ## Installation
51
+
52
+ ```bash
53
+ pip install scrapeer
54
+ ```
55
+
56
+ ## Usage
57
+
58
+ ```python
59
+ from scrapeer import Scraper
60
+
61
+ # Initialize the scraper
62
+ scraper = Scraper()
63
+
64
+ # Define your infohashes and trackers
65
+ infohashes = [
66
+ "0123456789abcdef0123456789abcdef01234567",
67
+ "fedcba9876543210fedcba9876543210fedcba98"
68
+ ]
69
+
70
+ trackers = [
71
+ "udp://tracker.example.com:80",
72
+ "http://tracker.example.org:6969/announce",
73
+ "https://private-tracker.example.net:443/YOUR_PASSKEY/announce"
74
+ ]
75
+
76
+ # Get the results (timeout of 3 seconds per tracker)
77
+ results = scraper.scrape(
78
+ hashes=infohashes,
79
+ trackers=trackers,
80
+ timeout=3
81
+ )
82
+
83
+ # Print the results
84
+ for infohash, data in results.items():
85
+ print(f"Results for {infohash}:")
86
+ print(f" Seeders: {data['seeders']}")
87
+ print(f" Leechers: {data['leechers']}")
88
+ print(f" Completed: {data['completed']}")
89
+
90
+ # Check if there were any errors
91
+ if scraper.has_errors():
92
+ print("\nErrors:")
93
+ for error in scraper.get_errors():
94
+ print(f" {error}")
95
+ ```
96
+
97
+ ## Package Structure
98
+
99
+ Scrapeer-py is organized into the following modules:
100
+
101
+ - `scrapeer/` - Main package directory
102
+ - `__init__.py` - Package initialization that exports the Scraper class
103
+ - `scraper.py` - Main Scraper class implementation
104
+ - `http.py` - HTTP(S) protocol scraping functionality
105
+ - `udp.py` - UDP protocol scraping functionality
106
+ - `utils.py` - Utility functions used across the package
107
+
108
+ ## API Reference
109
+
110
+ ### `Scraper` class
111
+
112
+ #### `scrape(hashes, trackers, max_trackers=None, timeout=2, announce=False)`
113
+
114
+ Scrape trackers for torrent information.
115
+
116
+ - **Parameters**:
117
+ - `hashes`: List (>1) or string of infohash(es)
118
+ - `trackers`: List (>1) or string of tracker(s)
119
+ - `max_trackers`: (Optional) Maximum number of trackers to be scraped, Default all
120
+ - `timeout`: (Optional) Maximum time for each tracker scrape in seconds, Default 2
121
+ - `announce`: (Optional) Use announce instead of scrape, Default False
122
+
123
+ - **Returns**:
124
+ - Dictionary of results with infohashes as keys and stats as values
125
+
126
+ #### `has_errors()`
127
+
128
+ Checks if there are any errors.
129
+
130
+ - **Returns**:
131
+ - `bool`: True if errors are present, False otherwise
132
+
133
+ #### `get_errors()`
134
+
135
+ Returns all the errors that were logged.
136
+
137
+ - **Returns**:
138
+ - `list`: All the logged errors
139
+
140
+ ## Limitations
141
+
142
+ - Maximum of 64 infohashes per request
143
+ - Minimum of 1 infohash per request
144
+ - Only supports BitTorrent trackers (HTTP(S) and UDP)
145
+
146
+ ## License
147
+
148
+ This project is licensed under the MIT License - see the [LICENSE.txt](LICENSE.txt) file for details.
@@ -0,0 +1,119 @@
1
+ # Scrapeer-py
2
+
3
+ A tiny Python library that lets you scrape HTTP(S) and UDP trackers for torrent information.
4
+
5
+ Scrapeer-py is a Python port of the original PHP [Scrapeer](https://github.com/torrentpier/scrapeer) library by [TorrentPier](https://github.com/torrentpier).
6
+
7
+ ## Overview
8
+
9
+ Scrapeer-py allows you to retrieve peer information from BitTorrent trackers using both HTTP(S) and UDP protocols. It can fetch seeders, leechers, and completed download counts for multiple torrents from multiple trackers simultaneously.
10
+
11
+ ## Features
12
+
13
+ - Support for both HTTP(S) and UDP tracker protocols
14
+ - Batch scraping of multiple infohashes at once (up to 64)
15
+ - Support for trackers with passkeys
16
+ - Optional announce mode for trackers that don't support scrape
17
+ - Configurable timeout settings
18
+ - Detailed error reporting
19
+ - Well-organized modular codebase
20
+
21
+ ## Installation
22
+
23
+ ```bash
24
+ pip install scrapeer
25
+ ```
26
+
27
+ ## Usage
28
+
29
+ ```python
30
+ from scrapeer import Scraper
31
+
32
+ # Initialize the scraper
33
+ scraper = Scraper()
34
+
35
+ # Define your infohashes and trackers
36
+ infohashes = [
37
+ "0123456789abcdef0123456789abcdef01234567",
38
+ "fedcba9876543210fedcba9876543210fedcba98"
39
+ ]
40
+
41
+ trackers = [
42
+ "udp://tracker.example.com:80",
43
+ "http://tracker.example.org:6969/announce",
44
+ "https://private-tracker.example.net:443/YOUR_PASSKEY/announce"
45
+ ]
46
+
47
+ # Get the results (timeout of 3 seconds per tracker)
48
+ results = scraper.scrape(
49
+ hashes=infohashes,
50
+ trackers=trackers,
51
+ timeout=3
52
+ )
53
+
54
+ # Print the results
55
+ for infohash, data in results.items():
56
+ print(f"Results for {infohash}:")
57
+ print(f" Seeders: {data['seeders']}")
58
+ print(f" Leechers: {data['leechers']}")
59
+ print(f" Completed: {data['completed']}")
60
+
61
+ # Check if there were any errors
62
+ if scraper.has_errors():
63
+ print("\nErrors:")
64
+ for error in scraper.get_errors():
65
+ print(f" {error}")
66
+ ```
67
+
68
+ ## Package Structure
69
+
70
+ Scrapeer-py is organized into the following modules:
71
+
72
+ - `scrapeer/` - Main package directory
73
+ - `__init__.py` - Package initialization that exports the Scraper class
74
+ - `scraper.py` - Main Scraper class implementation
75
+ - `http.py` - HTTP(S) protocol scraping functionality
76
+ - `udp.py` - UDP protocol scraping functionality
77
+ - `utils.py` - Utility functions used across the package
78
+
79
+ ## API Reference
80
+
81
+ ### `Scraper` class
82
+
83
+ #### `scrape(hashes, trackers, max_trackers=None, timeout=2, announce=False)`
84
+
85
+ Scrape trackers for torrent information.
86
+
87
+ - **Parameters**:
88
+ - `hashes`: List (>1) or string of infohash(es)
89
+ - `trackers`: List (>1) or string of tracker(s)
90
+ - `max_trackers`: (Optional) Maximum number of trackers to be scraped, Default all
91
+ - `timeout`: (Optional) Maximum time for each tracker scrape in seconds, Default 2
92
+ - `announce`: (Optional) Use announce instead of scrape, Default False
93
+
94
+ - **Returns**:
95
+ - Dictionary of results with infohashes as keys and stats as values
96
+
97
+ #### `has_errors()`
98
+
99
+ Checks if there are any errors.
100
+
101
+ - **Returns**:
102
+ - `bool`: True if errors are present, False otherwise
103
+
104
+ #### `get_errors()`
105
+
106
+ Returns all the errors that were logged.
107
+
108
+ - **Returns**:
109
+ - `list`: All the logged errors
110
+
111
+ ## Limitations
112
+
113
+ - Maximum of 64 infohashes per request
114
+ - Minimum of 1 infohash per request
115
+ - Only supports BitTorrent trackers (HTTP(S) and UDP)
116
+
117
+ ## License
118
+
119
+ This project is licensed under the MIT License - see the [LICENSE.txt](LICENSE.txt) file for details.
@@ -0,0 +1,11 @@
1
+ """
2
+ Scrapeer-py, a tiny Python library that lets you scrape
3
+ HTTP(S) and UDP trackers for torrent information.
4
+
5
+ Port of the original PHP Scrapeer library by TorrentPier.
6
+ """
7
+
8
+ from .scraper import Scraper
9
+
10
+ __version__ = '1.0.0'
11
+ __all__ = ['Scraper']
@@ -0,0 +1,227 @@
1
+ """
2
+ HTTP scraping functionality for Scrapeer.
3
+ """
4
+
5
+ import urllib.request
6
+ import urllib.parse
7
+ import re
8
+ import socket
9
+
10
+
11
def scrape_http(infohashes, protocol, host, port, passkey, announce, timeout):
    """
    Query one HTTP(S) tracker and parse its reply.

    Args:
        infohashes: List of hex infohashes to look up.
        protocol: URL scheme to use ('http' or 'https').
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.
        passkey: Passkey path segment, or an empty string.
        announce: When truthy, send an announce instead of a scrape.
        timeout: Maximum time for the tracker request in seconds.

    Returns:
        dict: Whatever http_data extracts from the tracker reply.
    """
    if not announce:
        # Scrape mode: build the URL first, then fetch it.
        url = http_query(infohashes, protocol, host, port, passkey)
        raw_reply = http_request(url, host, port, timeout)
    else:
        # Announce mode builds and sends its own request.
        raw_reply = http_announce(infohashes, protocol, host, port, passkey, timeout)

    return http_data(raw_reply, infohashes, host)
35
+
36
+
37
def http_query(infohashes, protocol, host, port, passkey):
    """
    Builds the HTTP(S) scrape URL

    The passkey (already prefixed with '/' by get_passkey) is placed
    *before* the endpoint, i.e. ``/<passkey>/scrape``, which mirrors the
    ``/<passkey>/announce`` layout of the tracker URLs it is extracted from.

    Args:
        infohashes: List of hex infohashes; each becomes one ``info_hash``
            query parameter holding the percent-encoded raw 20 bytes.
        protocol: Protocol to use for the scraping.
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.
        passkey: Optional. Passkey path segment ('' when there is none).

    Returns:
        str: Fully qualified URL. With no infohashes the bare endpoint URL
        (without a query string) is returned.

    Raises:
        ValueError: If an infohash is not valid hexadecimal.
    """
    endpoint = f"{protocol}://{host}:{port}{passkey}/scrape"

    if not infohashes:
        return endpoint

    params = "&".join(
        f"info_hash={urllib.parse.quote(bytes.fromhex(infohash))}"
        for infohash in infohashes
    )
    return f"{endpoint}?{params}"
66
+
67
+
68
def http_request(query, host, port, timeout):
    """
    Sends HTTP(S) request to the tracker

    The timeout is applied to this request only; the process-wide default
    socket timeout is left untouched so unrelated sockets are not affected.

    Args:
        query: URL to the tracker.
        host: Domain or IP address of the tracker (used in error messages).
        port: Port number of the tracker (used in error messages).
        timeout: Maximum time for the request in seconds.

    Returns:
        bytes: Raw response body from the tracker.

    Raises:
        Exception: On any failure to build, send or read the request; the
            original error is attached as ``__cause__``.
    """
    try:
        request = urllib.request.Request(
            query,
            headers={'User-Agent': 'Scrapeer-py/1.0.0'}
        )
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read()
    except Exception as e:
        raise Exception(f"Connection error: {host}:{port} - {str(e)}") from e
92
+
93
+
94
def http_announce(infohashes, protocol, host, port, passkey, timeout):
    """
    Announces to the tracker instead of scraping

    The passkey (already prefixed with '/') is placed before the endpoint,
    i.e. ``/<passkey>/announce``. The timeout is applied to this request
    only; the process-wide default socket timeout is not modified.

    Args:
        infohashes: List containing exactly one hex infohash.
        protocol: Protocol to use for the request.
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.
        passkey: Optional. Passkey path segment ('' when there is none).
        timeout: Maximum time for the request in seconds.

    Returns:
        bytes: Raw response body from the tracker.

    Raises:
        Exception: If zero or more than one infohash is given, or on any
            connection error (original error attached as ``__cause__``).
    """
    if len(infohashes) > 1:
        raise Exception(f"Too many hashes for HTTP announce ({len(infohashes)}).")
    if not infohashes:
        raise Exception("No hash given for HTTP announce.")

    # Fixed dummy peer parameters: we only want the swarm counters back.
    query = (
        f"{protocol}://{host}:{port}{passkey}/announce"
        f"?info_hash={urllib.parse.quote(bytes.fromhex(infohashes[0]))}"
        "&peer_id=test1234567891234567"
        "&port=6889"
        "&uploaded=0"
        "&downloaded=0"
        "&left=0"
        "&compact=1"
    )

    try:
        request = urllib.request.Request(
            query,
            headers={'User-Agent': 'Scrapeer-py/1.0.0'}
        )
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read()
    except Exception as e:
        raise Exception(f"Connection error: {host}:{port} - {str(e)}") from e
134
+
135
+
136
def http_data(response, infohashes, host):
    """
    Gets the data from HTTP(S) response

    Handles three reply shapes, tried in this order for every infohash:

    1. Text replies keyed by the *hex* infohash (``<hex>:d8:complete...``).
    2. Standard bencoded scrape replies (``d5:filesd20:<raw hash>d...e...ee``).
    3. Bencoded announce replies with top-level ``complete`` /
       ``incomplete`` counters (only when a single infohash was requested).

    Args:
        response: Response from the tracker (bytes, or already-decoded text).
        infohashes: List of hex infohashes that were requested.
        host: Domain or IP address of the tracker (used in error messages).

    Returns:
        dict: ``{infohash: {'seeders': int, 'completed': int, 'leechers': int}}``.

    Raises:
        Exception: If the reply has no recognisable structure, or if a
            requested infohash is missing from a scrape reply.
    """
    # latin-1 maps every byte to exactly one character, so the raw 20-byte
    # info-hash keys survive decoding and can be searched for as text.
    if isinstance(response, (bytes, bytearray)):
        data = bytes(response).decode('latin-1')
    else:
        data = str(response)

    results = {}
    pattern_all = r"d8:completei(\d+)e10:downloadedi(\d+)e10:incompletei(\d+)e"
    pattern_single = r"d8:completei(\d+)e10:incompletei(\d+)e"
    files_at = data.find('d5:filesd')

    for infohash in infohashes:
        hex_key = re.escape(infohash)

        matches = re.search(f"{hex_key}:{pattern_all}", data, re.IGNORECASE)
        if matches:
            results[infohash] = {
                'seeders': int(matches.group(1)),
                'completed': int(matches.group(2)),
                'leechers': int(matches.group(3)),
            }
            continue

        matches = re.search(f"{hex_key}:{pattern_single}", data, re.IGNORECASE)
        if matches:
            results[infohash] = {
                'seeders': int(matches.group(1)),
                'completed': 0,
                'leechers': int(matches.group(2)),
            }
            continue

        if files_at == -1:
            # No files dictionary: acceptable only as an announce reply,
            # which carries counters for exactly one torrent.
            stats = _http_announce_stats(data) if len(infohashes) == 1 else None
            if stats is None:
                raise Exception(f"Invalid scrape response from '{host}'.")
            results[infohash] = stats
            continue

        # Bencoded key: "20:" + raw hash, followed by the stats dict ("d").
        marker = f"20:{bytes.fromhex(infohash).decode('latin-1')}d"
        start = data.find(marker, files_at)
        if start == -1:
            raise Exception(f"Failed to parse torrent data from '{host}'.")

        stats = _http_torrent_stats(data, start + len(marker))
        results[infohash] = {
            'seeders': stats.get('complete', 0),
            'completed': stats.get('downloaded', 0),
            'leechers': stats.get('incomplete', 0),
        }

    return results


def _http_torrent_stats(data, pos):
    """Read the integer fields of one bencoded per-torrent dict starting at pos."""
    string_head = re.compile(r"(\d+):")
    integer = re.compile(r"i(-?\d+)e")
    stats = {}

    while pos < len(data) and data[pos] != 'e':
        key_head = string_head.match(data, pos)
        if not key_head:
            break
        key_end = key_head.end() + int(key_head.group(1))
        key = data[key_head.end():key_end]
        pos = key_end

        number = integer.match(data, pos)
        if number:
            stats[key] = int(number.group(1))
            pos = number.end()
            continue

        # Some trackers add string fields (e.g. a name); skip over them.
        text_head = string_head.match(data, pos)
        if not text_head:
            break  # nested list/dict value: nothing more we can safely read
        pos = text_head.end() + int(text_head.group(1))

    return stats


def _http_announce_stats(data):
    """Extract swarm counters from a bencoded announce reply, or None if absent."""
    seeders = re.search(r"8:completei(\d+)e", data)
    leechers = re.search(r"10:incompletei(\d+)e", data)
    if not seeders and not leechers:
        return None

    completed = re.search(r"10:downloadedi(\d+)e", data)
    return {
        'seeders': int(seeders.group(1)) if seeders else 0,
        'completed': int(completed.group(1)) if completed else 0,
        'leechers': int(leechers.group(1)) if leechers else 0,
    }
204
+
205
+
206
def get_information(data, start, end):
    """
    Return the text found between two delimiters.

    Args:
        data: Text to search in.
        start: Opening delimiter; the result begins right after it.
        end: Closing delimiter; searched for after the opening delimiter.

    Returns:
        str: The text between the first `start` and the next `end`, or
        None when either delimiter cannot be found.
    """
    begin = data.find(start)
    if begin == -1:
        return None

    begin += len(start)
    stop = data.find(end, begin)
    return None if stop == -1 else data[begin:stop]
@@ -0,0 +1,134 @@
1
+ """
2
+ Main Scraper class for Scrapeer.
3
+ """
4
+
5
+ import urllib.parse
6
+ from .http import scrape_http
7
+ from .udp import scrape_udp
8
+ from .utils import normalize_infohashes, get_passkey
9
+
10
+
11
class Scraper:
    """
    Scrapes HTTP(S) and UDP trackers for seeder/leecher/completed counts.

    Errors never propagate out of scrape(); they are collected as strings
    and exposed through has_errors() / get_errors().
    """

    VERSION = '1.0.0'  # Python port version

    def __init__(self):
        """
        Initialize the scraper.
        """
        self.errors = []      # human-readable error messages, in order
        self.infohashes = []  # infohashes still waiting for a result
        self.timeout = 2      # per-tracker timeout in seconds

    def scrape(self, hashes, trackers, max_trackers=None, timeout=2, announce=False):
        """
        Initiates the scraper

        Trackers are tried in order until one of them answers; a tracker
        that fails leaves the infohashes pending for the next one.

        Args:
            hashes: List (>1) or string of infohash(es).
            trackers: List (>1) or string of tracker(s).
            max_trackers: Optional. Maximum number of trackers to be scraped, Default all.
            timeout: Optional. Maximum time for each tracker scrape in seconds, Default 2.
            announce: Optional. Use announce instead of scrape, Default false.

        Returns:
            dict: Dictionary of results.
        """
        final_result = {}

        if not trackers:
            self.errors.append('No tracker specified, aborting.')
            return final_result
        if not isinstance(trackers, list):
            trackers = [trackers]

        if isinstance(timeout, int):
            self.timeout = timeout
        else:
            self.timeout = 2
            self.errors.append('Timeout must be an integer. Using default value.')

        try:
            self.infohashes = normalize_infohashes(hashes, self.errors)
        except ValueError as e:
            self.errors.append(str(e))
            return final_result

        max_iterations = max_trackers if isinstance(max_trackers, int) else len(trackers)
        for index, tracker in enumerate(trackers):
            # Stop once nothing is pending or the tracker budget is used up.
            if not self.infohashes or index >= max_iterations:
                break

            info = urllib.parse.urlparse(tracker)
            protocol = info.scheme
            host = info.netloc.split(':', 1)[0]

            if not protocol or not host:
                self.errors.append(f'Skipping invalid tracker ({tracker}).')
                continue

            try:
                # .port raises ValueError for non-numeric or out-of-range
                # ports; treat that as an invalid tracker, not a crash.
                port = info.port if info.port else None
            except ValueError:
                self.errors.append(f'Skipping invalid tracker ({tracker}).')
                continue

            path = info.path if info.path else None
            passkey = get_passkey(path)

            result = self.try_scrape(protocol, host, port, passkey, announce)
            final_result.update(result)

        return final_result

    def try_scrape(self, protocol, host, port, passkey, announce):
        """
        Tries to scrape with a single tracker.

        The pending infohashes are cleared up front and restored only when
        the tracker fails, so a failure lets the next tracker retry them.

        Args:
            protocol: Protocol of the tracker.
            host: Domain or address of the tracker.
            port: Optional. Port number of the tracker.
            passkey: Optional. Passkey provided in the scrape request.
            announce: Optional. Use announce instead of scrape, Default false.

        Returns:
            dict: Dictionary of results (empty when the tracker failed).
        """
        infohashes = self.infohashes.copy()
        self.infohashes = []
        results = {}

        try:
            if protocol == 'udp':
                port = port if port else 80
                results = scrape_udp(infohashes, host, port, announce, self.timeout)
            elif protocol == 'http':
                port = port if port else 80
                results = scrape_http(infohashes, protocol, host, port, passkey, announce, self.timeout)
            elif protocol == 'https':
                port = port if port else 443
                results = scrape_http(infohashes, protocol, host, port, passkey, announce, self.timeout)
            else:
                raise Exception(f'Unsupported protocol ({protocol}://{host}).')
        except Exception as e:
            # Tracker boundary: record the failure and put the hashes back.
            self.infohashes = infohashes
            self.errors.append(str(e))

        return results

    def has_errors(self):
        """
        Checks if there are any errors.

        Returns:
            bool: True if errors are present, False otherwise.
        """
        return len(self.errors) > 0

    def get_errors(self):
        """
        Returns all the errors that were logged.

        Returns:
            list: All the logged errors.
        """
        return self.errors
@@ -0,0 +1,327 @@
1
+ """
2
+ UDP scraping functionality for Scrapeer.
3
+ """
4
+
5
+ import socket
6
+ import struct
7
+ import random
8
+ from .utils import random_peer_id, collect_info_hash
9
+
10
+
11
def scrape_udp(infohashes, host, port, announce, timeout):
    """
    Initiates the UDP scraping

    Performs the connect handshake and then either an announce or a scrape
    over the same socket. The socket is always closed before returning,
    including when applying the timeout itself fails.

    Args:
        infohashes: List of hex infohashes.
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.
        announce: Optional. Use announce instead of scrape.
        timeout: Maximum time for each socket operation in seconds.

    Returns:
        dict: Dictionary of results.
    """
    socket_obj, _ip = prepare_udp(host, port)

    try:
        # Inside the try so an invalid timeout cannot leak the socket.
        socket_obj.settimeout(timeout)

        # Only the transaction id matters here; the real connection id
        # comes from the tracker's connect response.
        transaction_id, _ = udp_connection_request(socket_obj)
        connection_id = udp_connection_response(socket_obj, transaction_id, host, port)

        if announce:
            return udp_announce(socket_obj, infohashes, connection_id)
        return udp_scrape(socket_obj, infohashes, connection_id, transaction_id, host, port)
    finally:
        socket_obj.close()
38
+
39
+
40
def prepare_udp(host, port):
    """
    Prepares the UDP socket

    Args:
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.

    Returns:
        tuple: Tuple containing socket object and IP address.

    Raises:
        Exception: If the socket cannot be created or the host cannot be
            resolved. The socket is closed before a resolution failure is
            reported.
    """
    socket_obj = udp_create_connection(host, port)

    try:
        ip = socket.gethostbyname(host)
    except socket.gaierror as e:
        # Nobody else holds a reference yet, so release the socket here.
        socket_obj.close()
        raise Exception(f"Failed to resolve host '{host}'.") from e

    return socket_obj, ip
59
+
60
+
61
def udp_create_connection(host, port):
    """
    Creates a UDP connection

    Args:
        host: Domain or IP address of the tracker.
        port: Port number of the tracker.

    Returns:
        socket: IPv4 datagram socket connected to (host, port).

    Raises:
        Exception: If the socket cannot be created or connected. A socket
            that was created but failed to connect is closed first.
    """
    socket_obj = None
    try:
        socket_obj = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        socket_obj.connect((host, port))
        return socket_obj
    except socket.error as e:
        if socket_obj is not None:
            socket_obj.close()
        raise Exception(f"Failed to create socket for '{host}:{port}' - {str(e)}.") from e
78
+
79
+
80
def udp_connection_request(socket_obj):
    """
    Send the initial "connect" packet to the tracker.

    Args:
        socket_obj: Socket object to send on.

    Returns:
        tuple: (transaction_id, connection_id) that were put in the packet.
    """
    initial_connection_id = 0x41727101980  # fixed value used for connect packets
    connect_action = 0  # 0 = connection, 1 = announce, 2 = scrape
    txn = random.randint(0, 2147483647)

    # Big-endian: 64-bit connection id, 32-bit action, 32-bit transaction id.
    packet = struct.Struct(">QII").pack(initial_connection_id, connect_action, txn)

    try:
        socket_obj.send(packet)
    except socket.error as e:
        raise Exception(f"Failed to send connection request - {str(e)}.")

    return txn, initial_connection_id
102
+
103
+
104
def udp_connection_response(socket_obj, transaction_id, host, port):
    """
    Read and validate the tracker's reply to the connect packet.

    Args:
        socket_obj: Socket object to read from.
        transaction_id: Transaction ID sent in the connect packet.
        host: Domain or IP address of the tracker (used in error messages).
        port: Port number of the tracker (used in error messages).

    Returns:
        int: Connection ID taken from the reply.
    """
    expected_size = 16

    try:
        reply = socket_obj.recv(expected_size)
    except socket.error as e:
        raise Exception(f"Failed to receive connection response from '{host}:{port}' - {str(e)}.")

    if len(reply) != expected_size:
        raise Exception(f"Invalid response length from '{host}:{port}'.")

    # Big-endian: 32-bit action, 32-bit transaction id, 64-bit connection id.
    fields = struct.unpack(">IIQ", reply)
    reply_action = fields[0]
    reply_transaction = fields[1]

    if reply_transaction != transaction_id:
        raise Exception(f"Invalid transaction ID from '{host}:{port}'.")

    if reply_action != 0:
        raise Exception(f"Invalid action from '{host}:{port}'.")

    return fields[2]
134
+
135
+
136
def udp_scrape(socket_obj, hashes, connection_id, transaction_id, host, port):
    """
    Sends a scrape request and parses the reply

    Args:
        socket_obj: Connected UDP socket.
        hashes: List of hex infohashes.
        connection_id: Connection ID obtained from the connect handshake.
        transaction_id: Transaction ID to send and to expect back.
        host: Domain or IP address of the tracker (used in error messages).
        port: Port number of the tracker (used in error messages).

    Returns:
        dict: Dictionary of results.

    Raises:
        Exception: On socket errors, a malformed reply, a transaction ID
            mismatch, or when the tracker answers with an error packet.
    """
    action = 2  # Action (2 = scrape)
    error_action = 3  # Action used by trackers for error replies

    buffer = udp_scrape_request(socket_obj, hashes, connection_id, transaction_id)

    try:
        socket_obj.send(buffer)
        # A scrape reply is 8 + 12 bytes per hash, but an error reply
        # carries free text instead; leave room so it is not cut off.
        response = socket_obj.recv(max(8 + (12 * len(hashes)), 1024))
    except socket.error as e:
        raise Exception(f"Socket error from '{host}:{port}' - {str(e)}.") from e

    if len(response) < 8:
        raise Exception(f"Invalid scrape response from '{host}:{port}'.")

    return_action, return_transaction_id = struct.unpack(">II", response[:8])

    if transaction_id != return_transaction_id:
        raise Exception(f"Invalid transaction ID from '{host}:{port}'.")

    if return_action == error_action:
        # Everything after the 8-byte header is the tracker's message.
        err_msg = response[8:].decode('utf-8', errors='replace')
        raise Exception(f"Tracker error '{err_msg}' from '{host}:{port}'.")

    if return_action != action:
        raise Exception(f"Invalid action ({return_action}) from '{host}:{port}'.")

    # Results come back in request order, so the hashes double as keys.
    keys = list(hashes)
    return udp_scrape_data(response, hashes, host, keys, 8, len(response), 12)
187
+
188
+
189
def udp_scrape_request(socket_obj, hashes, connection_id, transaction_id):
    """
    Assemble the bytes of a scrape packet (nothing is sent here).

    Args:
        socket_obj: Socket object (not used by this function).
        hashes: List of hex infohashes to include.
        connection_id: Connection ID.
        transaction_id: Transaction ID.

    Returns:
        bytes: 16-byte header followed by each infohash in binary form.
    """
    scrape_action = 2

    # Big-endian: 64-bit connection id, 32-bit action, 32-bit transaction id.
    header = struct.pack(">QII", connection_id, scrape_action, transaction_id)
    payload = b"".join(collect_info_hash(item) for item in hashes)

    return header + payload
210
+
211
+
212
def udp_announce(socket_obj, hashes, connection_id):
    """
    Announce a single infohash over UDP and return the swarm counters.

    Args:
        socket_obj: Socket object.
        hashes: List holding at most one hex infohash.
        connection_id: Connection ID.

    Returns:
        dict: Dictionary of results keyed by the infohash.
    """
    hash_count = len(hashes)
    if hash_count > 1:
        raise Exception(f"Too many hashes for UDP announce ({hash_count}).")

    transaction_id = random.randint(0, 2147483647)
    target = hashes[0]

    # Field order follows the pack format below.
    fields = (
        connection_id,
        1,                          # action: announce
        transaction_id,
        collect_info_hash(target),
        random_peer_id(),
        0,                          # downloaded
        0,                          # left
        0,                          # uploaded
        0,                          # event
        0,                          # ip
        0,                          # key
        -1,                         # num_want
        6889,                       # port
    )
    packet = struct.pack(">QII20s20sQQQIIIiH", *fields)

    try:
        socket_obj.send(packet)
        stats = udp_verify_announce(socket_obj, transaction_id)
    except socket.error as e:
        raise Exception(f"Failed to send announce request - {str(e)}.")

    return {
        target: {
            'seeders': stats[0],
            'leechers': stats[1],
            'completed': stats[2],
        }
    }
258
+
259
+
260
def udp_verify_announce(socket_obj, transaction_id):
    """
    Read an announce reply and check its header.

    Args:
        socket_obj: Socket object to read from.
        transaction_id: Transaction ID sent in the announce packet.

    Returns:
        tuple: (seeders, leechers, 0); the third element is always 0.
    """
    header_size = 20

    try:
        reply = socket_obj.recv(header_size)
    except socket.error as e:
        raise Exception(f"Failed to receive announce response - {str(e)}.")

    received = len(reply)
    if received < header_size:
        raise Exception(f"Invalid announce response length ({received}).")

    # Five big-endian 32-bit fields: action, transaction id, interval,
    # leechers, seeders (the interval is not used).
    reply_action, reply_transaction, _interval, leechers, seeders = struct.unpack(">IIIII", reply)

    if reply_transaction != transaction_id:
        raise Exception(f"Invalid transaction ID ({reply_transaction} != {transaction_id}).")

    if reply_action != 1:
        raise Exception(f"Invalid action code ({reply_action}).")

    return (seeders, leechers, 0)
288
+
289
+
290
def udp_scrape_data(response, hashes, host, keys, start, end, offset):
    """
    Turn the body of a scrape reply into per-infohash counters.

    Args:
        response: Raw reply from the tracker.
        hashes: List of infohashes that were requested.
        host: Domain or IP address of the tracker (used in error messages).
        keys: Keys to use in the result, parallel to `hashes`.
        start: Position of the first record in the reply.
        end: Position just past the last usable byte of the reply.
        offset: Distance in bytes between consecutive records.

    Returns:
        dict: Dictionary of results.
    """
    # The reply must be long enough to hold one record per hash.
    if (end - start) < (len(hashes) * offset):
        raise Exception(f"Invalid scrape response from '{host}'.")

    parsed = {}
    for index in range(len(hashes)):
        record_at = start + (index * offset)

        if record_at + 12 > end:
            raise Exception(f"Invalid scrape response from '{host}'.")

        # Each record: three big-endian 32-bit counters.
        record = struct.unpack(">III", response[record_at:record_at+12])
        parsed[keys[index]] = {
            'seeders': record[0],
            'completed': record[1],
            'leechers': record[2],
        }

    return parsed
@@ -0,0 +1,76 @@
1
+ """
2
+ Utility functions for Scrapeer.
3
+ """
4
+
5
+ import re
6
+ import random
7
+ import binascii
8
+
9
+
10
def normalize_infohashes(infohashes, errors):
    """
    Normalizes the given hashes

    Valid entries are exactly 40 hexadecimal characters; they are returned
    lower-cased. Anything else (wrong length, stray whitespace or newline,
    non-string values) is skipped and reported in `errors`.

    Args:
        infohashes: A single infohash, or a list/tuple of infohashes.
        errors: List to append any errors to.

    Returns:
        list: Normalized infohash(es).

    Raises:
        ValueError: If fewer than 1 or more than 64 valid infohashes remain.
    """
    if not isinstance(infohashes, (list, tuple)):
        infohashes = [infohashes]

    normalized = []
    for infohash in infohashes:
        # Convert to lowercase for consistency; non-strings cannot be hashes.
        candidate = infohash.lower() if isinstance(infohash, str) else None

        # fullmatch, unlike match with '$', rejects a trailing newline that
        # would later break the hex-to-binary conversion.
        if candidate is not None and re.fullmatch(r'[a-f0-9]{40}', candidate):
            normalized.append(candidate)
        else:
            reported = infohash if candidate is None else candidate
            errors.append(f'Invalid info hash skipped ({reported}).')

    total_infohashes = len(normalized)
    if total_infohashes > 64 or total_infohashes < 1:
        raise ValueError(f'Invalid amount of valid infohashes ({total_infohashes}).')

    return normalized
38
+
39
+
40
def get_passkey(path):
    """
    Extract a passkey from a tracker URL path.

    The passkey is the first run of 32 letters/digits found in the path.

    Args:
        path: Path component of the tracker URL (may be None or empty).

    Returns:
        str: The passkey prefixed with '/', or an empty string if none.
    """
    if not path:
        return ''

    found = re.search(r'[a-z0-9]{32}', path, re.IGNORECASE)
    if found is None:
        return ''

    return f'/{found.group(0)}'
54
+
55
+
56
def random_peer_id():
    """
    Generate a random peer_id.

    The id is the fixed prefix '-PY0001-' followed by 12 random decimal
    digits, 20 bytes in total.

    Returns:
        bytes: A random 20-byte peer_id.
    """
    digits = ''.join(str(random.randint(0, 9)) for _ in range(12))
    # Encode the whole id: concatenating the str prefix with already-encoded
    # digits would raise TypeError.
    return f'-PY0001-{digits}'.encode()
64
+
65
+
66
def collect_info_hash(infohash):
    """
    Decode a hexadecimal infohash into its raw bytes.

    Args:
        infohash: Hexadecimal infohash.

    Returns:
        bytes: Binary form of the infohash (half as many bytes as hex digits).
    """
    raw = binascii.a2b_hex(infohash)
    return raw
@@ -0,0 +1,148 @@
1
+ Metadata-Version: 2.4
2
+ Name: scrapeer
3
+ Version: 1.0.0
4
+ Summary: Essential Python library that scrapes HTTP(S) and UDP trackers for torrent information.
5
+ Home-page: https://github.com/tboy1337/scrapeer-py
6
+ Download-URL: https://github.com/tboy1337/scrapeer-py/releases/latest
7
+ Author: tboy1337
8
+ Author-email: obywhuie@anonaddy.com
9
+ License: MIT
10
+ Keywords: torrent,torrents,scraper,scrapeer,torrent-scraper,torrent-scraping
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Requires-Python: >=3.6
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE.txt
17
+ Dynamic: author
18
+ Dynamic: author-email
19
+ Dynamic: classifier
20
+ Dynamic: description
21
+ Dynamic: description-content-type
22
+ Dynamic: download-url
23
+ Dynamic: home-page
24
+ Dynamic: keywords
25
+ Dynamic: license
26
+ Dynamic: license-file
27
+ Dynamic: requires-python
28
+ Dynamic: summary
29
+
30
+ # Scrapeer-py
31
+
32
+ A tiny Python library that lets you scrape HTTP(S) and UDP trackers for torrent information.
33
+
34
+ Scrapeer-py is a Python port of the original PHP [Scrapeer](https://github.com/torrentpier/scrapeer) library by [TorrentPier](https://github.com/torrentpier).
35
+
36
+ ## Overview
37
+
38
+ Scrapeer-py allows you to retrieve peer information from BitTorrent trackers using both HTTP(S) and UDP protocols. It can fetch seeders, leechers, and completed download counts for multiple torrents from multiple trackers simultaneously.
39
+
40
+ ## Features
41
+
42
+ - Support for both HTTP(S) and UDP tracker protocols
43
+ - Batch scraping of multiple infohashes at once (up to 64)
44
+ - Support for trackers with passkeys
45
+ - Optional announce mode for trackers that don't support scrape
46
+ - Configurable timeout settings
47
+ - Detailed error reporting
48
+ - Well-organized modular codebase
49
+
50
+ ## Installation
51
+
52
+ ```bash
53
+ pip install scrapeer
54
+ ```
55
+
56
+ ## Usage
57
+
58
+ ```python
59
+ from scrapeer import Scraper
60
+
61
+ # Initialize the scraper
62
+ scraper = Scraper()
63
+
64
+ # Define your infohashes and trackers
65
+ infohashes = [
66
+ "0123456789abcdef0123456789abcdef01234567",
67
+ "fedcba9876543210fedcba9876543210fedcba98"
68
+ ]
69
+
70
+ trackers = [
71
+ "udp://tracker.example.com:80",
72
+ "http://tracker.example.org:6969/announce",
73
+ "https://private-tracker.example.net:443/YOUR_PASSKEY/announce"
74
+ ]
75
+
76
+ # Get the results (timeout of 3 seconds per tracker)
77
+ results = scraper.scrape(
78
+ hashes=infohashes,
79
+ trackers=trackers,
80
+ timeout=3
81
+ )
82
+
83
+ # Print the results
84
+ for infohash, data in results.items():
85
+ print(f"Results for {infohash}:")
86
+ print(f" Seeders: {data['seeders']}")
87
+ print(f" Leechers: {data['leechers']}")
88
+ print(f" Completed: {data['completed']}")
89
+
90
+ # Check if there were any errors
91
+ if scraper.has_errors():
92
+ print("\nErrors:")
93
+ for error in scraper.get_errors():
94
+ print(f" {error}")
95
+ ```
96
+
97
+ ## Package Structure
98
+
99
+ Scrapeer-py is organized into the following modules:
100
+
101
+ - `scrapeer/` - Main package directory
102
+ - `__init__.py` - Package initialization that exports the Scraper class
103
+ - `scraper.py` - Main Scraper class implementation
104
+ - `http.py` - HTTP(S) protocol scraping functionality
105
+ - `udp.py` - UDP protocol scraping functionality
106
+ - `utils.py` - Utility functions used across the package
107
+
108
+ ## API Reference
109
+
110
+ ### `Scraper` class
111
+
112
+ #### `scrape(hashes, trackers, max_trackers=None, timeout=2, announce=False)`
113
+
114
+ Scrape trackers for torrent information.
115
+
116
+ - **Parameters**:
117
+ - `hashes`: A single infohash as a string, or a list of infohashes (1 to 64)
118
+ - `trackers`: A single tracker URL as a string, or a list of tracker URLs
119
+ - `max_trackers`: (Optional) Maximum number of trackers to be scraped, Default all
120
+ - `timeout`: (Optional) Maximum time for each tracker scrape in seconds, Default 2
121
+ - `announce`: (Optional) Use announce instead of scrape, Default False
122
+
123
+ - **Returns**:
124
+ - Dictionary of results with infohashes as keys and stats as values
125
+
126
+ #### `has_errors()`
127
+
128
+ Checks if there are any errors.
129
+
130
+ - **Returns**:
131
+ - `bool`: True if errors are present, False otherwise
132
+
133
+ #### `get_errors()`
134
+
135
+ Returns all the errors that were logged.
136
+
137
+ - **Returns**:
138
+ - `list`: All the logged errors
139
+
140
+ ## Limitations
141
+
142
+ - Maximum of 64 infohashes per request
143
+ - Minimum of 1 infohash per request
144
+ - Only supports BitTorrent trackers (HTTP(S) and UDP)
145
+
146
+ ## License
147
+
148
+ This project is licensed under the MIT License - see the [LICENSE.txt](LICENSE.txt) file for details.
@@ -0,0 +1,12 @@
1
+ LICENSE.txt
2
+ README.md
3
+ setup.py
4
+ scrapeer/__init__.py
5
+ scrapeer/http.py
6
+ scrapeer/scraper.py
7
+ scrapeer/udp.py
8
+ scrapeer/utils.py
9
+ scrapeer.egg-info/PKG-INFO
10
+ scrapeer.egg-info/SOURCES.txt
11
+ scrapeer.egg-info/dependency_links.txt
12
+ scrapeer.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ scrapeer
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,29 @@
1
from setuptools import setup, find_packages

# Read the long description inside a context manager so the file handle is
# closed, and with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="scrapeer",
    version="1.0.0",
    description="Essential Python library that scrapes HTTP(S) and UDP trackers for torrent information.",
    author="tboy1337",
    author_email="obywhuie@anonaddy.com",
    url="https://github.com/tboy1337/scrapeer-py",
    download_url="https://github.com/tboy1337/scrapeer-py/releases/latest",
    license="MIT",
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.6",
    keywords=[
        "torrent",
        "torrents",
        "scraper",
        "scrapeer",
        "torrent-scraper",
        "torrent-scraping",
    ],
    long_description=long_description,
    long_description_content_type="text/markdown",
)