mdbq 4.0.66__py3-none-any.whl → 4.0.68__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/selenium/__init__.py +4 -0
- mdbq/selenium/get_driver.py +262 -0
- {mdbq-4.0.66.dist-info → mdbq-4.0.68.dist-info}/METADATA +1 -1
- {mdbq-4.0.66.dist-info → mdbq-4.0.68.dist-info}/RECORD +7 -5
- {mdbq-4.0.66.dist-info → mdbq-4.0.68.dist-info}/WHEEL +0 -0
- {mdbq-4.0.66.dist-info → mdbq-4.0.68.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '4.0.66'
|
1
|
+
VERSION = '4.0.68'
|
@@ -0,0 +1,262 @@
|
|
1
|
+
# -*- coding:utf-8 -*-
|
2
|
+
import os
|
3
|
+
import platform
|
4
|
+
import getpass
|
5
|
+
from selenium import webdriver
|
6
|
+
from selenium.webdriver.chrome.service import Service
|
7
|
+
import re
|
8
|
+
import socket
|
9
|
+
import tempfile
|
10
|
+
import shutil
|
11
|
+
import uuid
|
12
|
+
|
13
|
+
# Home directory of the current user ("~" expanded once at import time).
# Nothing in the visible part of this module reads it.
dir_path = os.path.expanduser('~')
|
14
|
+
|
15
|
+
|
16
|
+
class GetDriverException(Exception):
    """Custom exception raised for GetDriver-related errors."""
|
19
|
+
|
20
|
+
|
21
|
+
class GetDriver:
    """
    Selenium ChromeDriver manager.

    Builds a Chrome WebDriver with optional headless mode, proxy, custom
    User-Agent and download directory, and falls back to per-platform default
    locations for Chrome / chromedriver when none are given. Every driver is
    started with its own temporary user-data directory, which is removed again
    by close() / quit().

    Usable as a context manager: ``with GetDriver(...) as driver:`` yields the
    WebDriver and calls quit() on exit.
    """

    # Fallback User-Agent strings; one is picked at random when the caller
    # does not pass user_agent.
    _DEFAULT_USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
    )

    # Accepted proxy form: <socks5|http|https>://<dotted IPv4>:<port>.
    # The repeated octet group is non-capturing so groups() yields exactly
    # (scheme, ip, port).
    _PROXY_PATTERN = re.compile(r'^(socks5|http|https)://(\d{1,3}(?:\.\d{1,3}){3}):(\d+)$')

    # Scripts registered via CDP so they run on every new document, hiding
    # common Selenium/automation fingerprints.
    _JS_HIDE_FEATURES = (
        "Object.defineProperty(navigator, 'webdriver', {get: () => false});",
        "Object.defineProperty(navigator, 'plugins', {get: () => [1,2,3,4,5]});",
        "Object.defineProperty(navigator, 'languages', {get: () => ['zh-CN', 'zh', 'en']});",
        "window.chrome = {runtime: {}};",
        "delete window.navigator.__proto__.webdriver;",
        r"for (let key in window) {if (key.match(/^[\$\_]{3,}/)) {try {delete window[key];} catch(e){}}}",
    )

    def __init__(self, url=None, headless=False, proxy=None, user_agent=None, download_dir=None, chrome_path=None, chromedriver_path=None, maximize_window=True):
        """
        Initialise the manager; no browser is started here.

        :param url: origin passed to --unsafely-treat-insecure-origin-as-secure
        :param headless: run Chrome in headless mode
        :param proxy: proxy URL (http, https or socks5), e.g. socks5://127.0.0.1:1080
        :param user_agent: custom User-Agent; a built-in one is chosen at random if empty
        :param download_dir: download directory ("~" is expanded); defaults to ~/Downloads
        :param chrome_path: path to the Chrome binary
        :param chromedriver_path: path to the chromedriver binary
        :param maximize_window: maximise the browser window after start-up
        """
        self.url = url
        self.headless = headless
        self.proxy = proxy
        self.user_agent = user_agent
        self.download_dir = os.path.expanduser(download_dir or '~/Downloads')
        self.chrome_path = chrome_path
        self.chromedriver_path = chromedriver_path
        self.temp_dirs = []  # temporary user-data directories to remove on close()/quit()
        self.driver = None
        if not self.user_agent:
            # Imported locally, as in the original, so the module-level import
            # block does not need to change.
            import random
            self.user_agent = random.choice(self._DEFAULT_USER_AGENTS)
        self.maximize_window = maximize_window

    def __enter__(self):
        """
        Start the browser for use in a ``with`` statement.

        :return: selenium.webdriver.Chrome instance
        """
        self.driver = self.getdriver()
        return self.driver

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Quit the browser and clean up when leaving the ``with`` block.
        Exceptions raised inside the block are not suppressed.
        """
        self.quit()

    def close(self):
        """
        Close the current browser window and remove the temporary directories.
        """
        if self.driver is not None:
            try:
                self.driver.close()
            except Exception:
                # Best effort: the window/session may already be gone.
                pass
        self._cleanup_temp_dirs()

    def quit(self):
        """
        Quit the browser completely and remove the temporary directories.
        """
        if self.driver is not None:
            try:
                self.driver.quit()
            except Exception:
                # Best effort: the session may already be gone.
                pass
        self._cleanup_temp_dirs()

    def _cleanup_temp_dirs(self):
        """
        Remove every temporary directory created by this instance.
        Missing directories and removal errors are ignored.
        """
        for temp_dir in self.temp_dirs:
            shutil.rmtree(temp_dir, ignore_errors=True)
        self.temp_dirs = []

    def check_proxy(self):
        """
        Validate the proxy format and check that its TCP port is reachable.

        Only ``scheme://a.b.c.d:port`` with scheme http/https/socks5 is
        accepted. Reachability is a plain TCP connect with a 5 second timeout;
        the proxy protocol itself is not exercised.

        :return: True if no proxy is configured or the proxy is reachable,
                 False if the format is invalid or the connection fails
        """
        if not self.proxy:
            return True
        matched = self._PROXY_PATTERN.match(self.proxy)
        if matched is None:
            return False
        _scheme, ip, port = matched.groups()
        try:
            with socket.create_connection((ip, int(port)), timeout=5):
                return True
        except (OSError, OverflowError, ValueError):
            # OSError: refused / timed out / unresolvable address;
            # OverflowError / ValueError: port outside the valid range.
            return False

    def _build_options(self, temp_dir):
        """
        Build the ChromeOptions (command-line switches and automation-related
        experimental options) for a browser using temp_dir as its user-data
        directory.
        """
        option = webdriver.ChromeOptions()
        if self.headless:
            for headless_arg in (
                "--headless",
                "--window-size=1920,1080",
                "--disable-gpu",
                "--no-sandbox",
                "--disable-dev-shm-usage",
            ):
                option.add_argument(headless_arg)
        # Unique user-data directory so concurrent Chrome instances do not clash.
        option.add_argument(f'--user-data-dir={temp_dir}')
        for arg in (
            '--no-first-run',
            '--no-default-browser-check',
            '--disable-background-timer-throttling',
            '--disable-backgrounding-occluded-windows',
            '--disable-renderer-backgrounding',
            '--disable-features=TranslateUI',
            '--disable-ipc-flooding-protection',
            # Switches that relax safe-browsing / insecure-content checks.
            '--allow-insecure-localhost',
            '--allow-running-insecure-content',
            '--disable-features=BlockInsecurePrivateNetworkRequests,SafeBrowsing,DownloadBubble,SafeBrowsingEnhancedProtection,DownloadWarning',
            '--safebrowsing-disable-download-protection',
            '--disable-client-side-phishing-detection',
            '--disable-popup-blocking',
            '--ignore-certificate-errors',
        ):
            option.add_argument(arg)
        if self.url:
            option.add_argument(f"--unsafely-treat-insecure-origin-as-secure={self.url}")
        option.add_argument(f'--user-agent={self.user_agent}')
        option.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
        option.add_experimental_option("useAutomationExtension", False)
        if self.proxy:
            option.add_argument(f'--proxy-server={self.proxy}')
        return option

    def _build_prefs(self):
        """Return the Chrome profile preferences (download and pop-up behaviour)."""
        return {
            "download.default_directory": self.download_dir,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": False,
            "safebrowsing.disable_download_protection": True,
            "profile.content_settings.exceptions.automatic_downloads.*.setting": 1,
            "profile.default_content_settings.popups": 0,
            "profile.default_content_setting_values.automatic_downloads": 1,
            "profile.default_content_setting_values.notifications": 2,
            "credentials_enable_service": False,
            "profile.password_manager_enabled": False,
            "download_restrictions": 0,
        }

    def _build_service(self, option):
        """
        Resolve the Chrome / chromedriver paths for the current platform, set
        option.binary_location where applicable and return the chromedriver
        Service.

        :raises: GetDriverException for an unsupported platform, missing
                 binaries on macOS, or any other error during configuration
        """
        sys_platform = platform.system().lower()
        chrome_path = self.chrome_path
        chromedriver_path = self.chromedriver_path
        try:
            if sys_platform == 'windows':
                if not chrome_path:
                    chrome_path = os.path.join(f'C:\\Users\\{getpass.getuser()}', 'chrome\\chrome_win64\\chrome.exe')
                if not chromedriver_path:
                    chromedriver_path = os.path.join(f'C:\\Users\\{getpass.getuser()}', 'chrome\\chromedriver.exe')
                option.binary_location = chrome_path
                return Service(chromedriver_path)
            if sys_platform == 'linux':
                option.binary_location = chrome_path or '/usr/bin/google-chrome'
                return Service(chromedriver_path or '/usr/local/bin/chromedriver')
            if sys_platform == 'darwin':
                if not chrome_path:
                    # First existing candidate wins; the custom install
                    # location is tried before the system-wide ones.
                    chrome_path_candidates = [
                        '/usr/local/chrome/Google Chrome for Testing.app/Contents/MacOS/Google Chrome',
                        '/usr/local/chrome/Google Chrome for Testing.app',
                        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
                        '/Applications/Google Chrome for Testing.app/Contents/MacOS/Google Chrome',
                    ]
                    chrome_path = next((p for p in chrome_path_candidates if os.path.exists(p)), None)
                if not chromedriver_path:
                    chromedriver_path_candidates = [
                        '/usr/local/chrome/chromedriver',
                        '/usr/local/bin/chromedriver',
                        '/opt/homebrew/bin/chromedriver',
                    ]
                    chromedriver_path = next((p for p in chromedriver_path_candidates if os.path.exists(p)), None)
                if not chrome_path or not chromedriver_path:
                    raise GetDriverException("未找到Chrome或Chromedriver,请手动指定chrome_path和chromedriver_path")
                # binary_location is deliberately not set on macOS: the
                # original author noted that setting it raises an error there.
                return Service(chromedriver_path)
            raise GetDriverException(f"不支持的平台: {sys_platform}")
        except Exception as e:
            # Every configuration failure, including the GetDriverException
            # raised above, is reported under the same message prefix.
            raise GetDriverException(f"浏览器路径配置异常: {e}") from e

    def getdriver(self):
        """
        Create and return a Chrome WebDriver, with the anti-detection scripts
        registered to run on every new document.

        On any failure the temporary user-data directory is removed and a
        browser that was already started is quit before the error propagates.

        :return: selenium.webdriver.Chrome instance
        :raises: GetDriverException
        """
        if not self.check_proxy():
            raise GetDriverException(f"代理不可用或格式错误: {self.proxy}")
        temp_dir = tempfile.mkdtemp(prefix=f'chrome_automation_{uuid.uuid4().hex[:8]}_')
        driver = None
        try:
            option = self._build_options(temp_dir)
            service = self._build_service(option)
            option.add_experimental_option("prefs", self._build_prefs())
            try:
                driver = webdriver.Chrome(service=service, options=option)
                if self.maximize_window:
                    driver.maximize_window()
                for js in self._JS_HIDE_FEATURES:
                    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
            except Exception as exc:
                raise GetDriverException(f"启动ChromeDriver失败: {exc}") from exc
        except BaseException:
            # Do not leave a half-started browser or its profile directory behind.
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    pass
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise
        self.temp_dirs.append(temp_dir)
        self.driver = driver
        return driver
|
250
|
+
|
251
|
+
|
252
|
+
if __name__ == '__main__':
    # Manual smoke test: start a headless browser, open a page and print its
    # title. The browser is quit automatically when the with-block exits.
    demo_manager = GetDriver(
        headless=True,
        proxy=None,  # proxy URL, e.g. 'socks5://127.0.0.1:1080'
        user_agent=None,
        download_dir=None,
        chrome_path=None,
        chromedriver_path=None,
    )
    with demo_manager as driver:
        driver.get('https://www.baidu.com')
        print(driver.title)
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=B4RJ6rNe1H9OhX5hZ0vZAUpRVdYgKa7ZoYjTltxb7fw,18
|
3
3
|
mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
4
4
|
mdbq/log/mylogger.py,sha256=kPe3wsQNaB1slfX-Z7VMqzZoMoqPfc7ylYXZDBeFzzI,24945
|
5
5
|
mdbq/myconf/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
@@ -21,8 +21,10 @@ mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,239
|
|
21
21
|
mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
|
22
22
|
mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
23
23
|
mdbq/redis/getredis.py,sha256=vpBuNc22uj9Vr-_Dh25_wpwWM1e-072EAAIBdB_IpL0,23494
|
24
|
+
mdbq/selenium/__init__.py,sha256=AKzeEceqZyvqn2dEDoJSzDQnbuENkJSHAlbHAD0u0ZI,10
|
25
|
+
mdbq/selenium/get_driver.py,sha256=fX0AMIpJZIqQFJvSjn334mo1XZShbXRIvU7GfbCn2g4,11424
|
24
26
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
25
|
-
mdbq-4.0.
|
26
|
-
mdbq-4.0.
|
27
|
-
mdbq-4.0.
|
28
|
-
mdbq-4.0.
|
27
|
+
mdbq-4.0.68.dist-info/METADATA,sha256=43vKa0YoA1c5aD-rX6V68bFPQqscOAuQXxM4zIyArEI,364
|
28
|
+
mdbq-4.0.68.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
29
|
+
mdbq-4.0.68.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
30
|
+
mdbq-4.0.68.dist-info/RECORD,,
|
File without changes
|
File without changes
|