NullGazeX 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nullgaze/__init__.py +168 -0
- nullgaze/downloader.py +697 -0
- nullgaze/exceptions.py +32 -0
- nullgaze/scraper.py +375 -0
- nullgaze/utils.py +138 -0
- nullgazex-2.1.0.dist-info/METADATA +325 -0
- nullgazex-2.1.0.dist-info/RECORD +10 -0
- nullgazex-2.1.0.dist-info/WHEEL +5 -0
- nullgazex-2.1.0.dist-info/licenses/LICENSE +21 -0
- nullgazex-2.1.0.dist-info/top_level.txt +1 -0
nullgaze/__init__.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NullGazeX — High-speed content retrieval with DPI bypass and TLS impersonation.
|
|
3
|
+
|
|
4
|
+
Provides two engines:
|
|
5
|
+
- ImageDownloader — download images through any firewall/protection
|
|
6
|
+
- PageScraper — scrape any page content undetected
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .downloader import ImageDownloader
|
|
10
|
+
from .scraper import PageScraper
|
|
11
|
+
from .exceptions import (
|
|
12
|
+
NullGazeError,
|
|
13
|
+
DownloadFailedError,
|
|
14
|
+
InvalidURLError,
|
|
15
|
+
ScrapeError,
|
|
16
|
+
BlockedError,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__version__ = "2.1.0"
|
|
20
|
+
|
|
21
|
+
# ------------------------------------------------------------------
|
|
22
|
+
# Image downloading (convenience)
|
|
23
|
+
# ------------------------------------------------------------------
|
|
24
|
+
def download_image(
    url: str,
    output_path: str,
    verbose: bool = False,
    headers: dict = None,
    race_timeout: float = 4.0,
) -> str:
    """
    Download one image through a freshly created ``ImageDownloader``.

    A new ``ImageDownloader(verbose=verbose)`` is built for this call and
    its ``download`` method receives ``url`` and ``output_path``
    positionally, with ``headers`` and ``race_timeout`` as keywords.

    The package describes the engine as racing its bypass strategies in
    parallel and keeping the fastest result; that logic lives in
    ``ImageDownloader``, not in this wrapper.

    Returns whatever ``ImageDownloader.download`` returns (annotated ``str``).
    """
    engine = ImageDownloader(verbose=verbose)
    request_options = {"headers": headers, "race_timeout": race_timeout}
    return engine.download(url, output_path, **request_options)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def download_images(
    targets: list,
    max_workers: int = 20,
    verbose: bool = False,
    headers: dict = None,
    race_timeout: float = 4.0,
    adaptive_delay: bool = True,
) -> list:
    """
    Download several images through a freshly created ``ImageDownloader``.

    ``targets`` is handed unchanged to ``ImageDownloader.download_bulk``;
    its element format is defined by that method, not here.  The remaining
    arguments (``max_workers``, ``headers``, ``race_timeout``,
    ``adaptive_delay``) are forwarded as keywords, while ``verbose`` only
    configures the engine instance.

    Returns whatever ``download_bulk`` returns (annotated ``list``).
    """
    bulk_options = {
        "max_workers": max_workers,
        "headers": headers,
        "race_timeout": race_timeout,
        "adaptive_delay": adaptive_delay,
    }
    return ImageDownloader(verbose=verbose).download_bulk(targets, **bulk_options)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ------------------------------------------------------------------
|
|
64
|
+
# Page scraping (convenience)
|
|
65
|
+
# ------------------------------------------------------------------
|
|
66
|
+
def scrape_page(
    url: str,
    headers: dict = None,
    verbose: bool = False,
    race_timeout: float = 5.0,
) -> str:
    """
    Fetch a page through a freshly created ``PageScraper``.

    Builds ``PageScraper(verbose=verbose)`` and delegates to its ``scrape``
    method, passing ``url`` positionally and ``headers`` / ``race_timeout``
    as keywords.  The package documents the result as the page's raw HTML;
    the retrieval itself is implemented by ``PageScraper``.

    Returns whatever ``PageScraper.scrape`` returns (annotated ``str``).
    """
    return PageScraper(verbose=verbose).scrape(
        url,
        headers=headers,
        race_timeout=race_timeout,
    )
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def scrape_text(
    url: str,
    headers: dict = None,
    verbose: bool = False,
    race_timeout: float = 5.0,
) -> str:
    """
    Fetch a page's text content through a freshly created ``PageScraper``.

    Delegates to ``PageScraper.scrape_text`` with ``url`` positionally and
    ``headers`` / ``race_timeout`` as keywords; the text extraction is done
    by that method.  Returns its result (annotated ``str``).
    """
    engine = PageScraper(verbose=verbose)
    request_options = {"headers": headers, "race_timeout": race_timeout}
    return engine.scrape_text(url, **request_options)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def scrape_title(
    url: str,
    headers: dict = None,
    verbose: bool = False,
    race_timeout: float = 5.0,
) -> str:
    """
    Fetch a page's ``<title>`` text through a freshly created ``PageScraper``.

    Delegates to ``PageScraper.scrape_title`` with ``url`` positionally and
    ``headers`` / ``race_timeout`` as keywords.  Returns that method's
    result (annotated ``str``).
    """
    return PageScraper(verbose=verbose).scrape_title(
        url,
        headers=headers,
        race_timeout=race_timeout,
    )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def scrape_json(
    url: str,
    headers: dict = None,
    verbose: bool = False,
    race_timeout: float = 5.0,
):
    """
    Fetch a JSON endpoint through a freshly created ``PageScraper``.

    Delegates to ``PageScraper.scrape_json`` with ``url`` positionally and
    ``headers`` / ``race_timeout`` as keywords.  The package documents the
    result as the parsed Python object; parsing happens inside
    ``PageScraper``, so the concrete return type is decided there.
    """
    engine = PageScraper(verbose=verbose)
    request_options = {"headers": headers, "race_timeout": race_timeout}
    return engine.scrape_json(url, **request_options)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def scrape_bulk(
    urls: list,
    max_workers: int = 15,
    headers: dict = None,
    verbose: bool = False,
    race_timeout: float = 5.0,
    adaptive_delay: bool = True,
) -> list:
    """
    Scrape several URLs through a freshly created ``PageScraper``.

    ``urls`` is handed unchanged to ``PageScraper.scrape_bulk``; the other
    arguments (``max_workers``, ``headers``, ``race_timeout``,
    ``adaptive_delay``) are forwarded as keywords, while ``verbose`` only
    configures the scraper instance.

    Returns whatever ``scrape_bulk`` returns (annotated ``list``).
    """
    bulk_options = {
        "max_workers": max_workers,
        "headers": headers,
        "race_timeout": race_timeout,
        "adaptive_delay": adaptive_delay,
    }
    return PageScraper(verbose=verbose).scrape_bulk(urls, **bulk_options)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# ------------------------------------------------------------------
|
|
135
|
+
# Engine pre-warming
|
|
136
|
+
# ------------------------------------------------------------------
|
|
137
|
+
def engine_prewarm():
    """
    Invoke ``prewarm()`` on the first direct base class of ``ImageDownloader``.

    The package describes this as pre-starting the shared DPI-bypass proxy
    so that the first real request is fast; the actual work is done by
    ``prewarm``, which is defined outside this module.
    """
    # NOTE(review): this relies on the engine base being listed first in
    # ImageDownloader's bases -- confirm if that class ever gains a mixin.
    ImageDownloader.__bases__[0].prewarm()
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# Public API surface
|
|
147
|
+
# Names exported by ``from nullgaze import *``, grouped by role.
__all__ = [
    "ImageDownloader", "PageScraper",                      # engine classes
    "download_image", "download_images",                   # image helpers
    "scrape_page", "scrape_text", "scrape_title",          # scraping helpers
    "scrape_json", "scrape_bulk",
    "engine_prewarm",                                      # utility
    "NullGazeError", "DownloadFailedError",                # exceptions
    "InvalidURLError", "ScrapeError", "BlockedError",
]
|