ODtools 2.1.21.tar.gz → 2.2.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/monitor_tools.py +101 -58
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/monitor_tools_new.py +104 -127
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools.egg-info/PKG-INFO +27 -6
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools.egg-info/requires.txt +5 -3
- {ODtools-2.1.21 → odtools-2.2.1}/PKG-INFO +27 -6
- {ODtools-2.1.21 → odtools-2.2.1}/setup.py +10 -4
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/__init__.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/bloom_filter_tools.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/excel_tools.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/fastdfs_tools.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/fdfs_client/__init__.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/fdfs_client/client.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/fdfs_client/connection.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/fdfs_client/exceptions.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/fdfs_client/fdfs_protol.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/fdfs_client/fdfs_test.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/fdfs_client/storage_client.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/fdfs_client/tracker_client.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/fdfs_client/utils.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/hbase_client/THBaseService.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/hbase_client/__init__.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/hbase_client/constants.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/hbase_client/ttypes.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/hbase_tools.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/kafka_tools.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/log_tools.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/redis_tools.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/request_headers.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/save_data_class.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/scrapy_redis/__init__.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/scrapy_redis/connection.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/scrapy_redis/defaults.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/scrapy_redis/dupefilter.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/scrapy_redis/picklecompat.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/scrapy_redis/pipelines.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/scrapy_redis/queue.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/scrapy_redis/scheduler.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/scrapy_redis/spiders.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/scrapy_redis/utils.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/singleton_tools.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/time_counter.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools/timeit_counter.py +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools.egg-info/SOURCES.txt +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools.egg-info/dependency_links.txt +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools.egg-info/top_level.txt +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/ODtools.egg-info/zip-safe +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/README.md +0 -0
- {ODtools-2.1.21 → odtools-2.2.1}/setup.cfg +0 -0

{ODtools-2.1.21 → odtools-2.2.1}/ODtools/monitor_tools.py +101 -58

(In the reconstructed hunks below, […] marks text that was truncated or elided in the source diff view.)

@@ -4,17 +4,26 @@
 # @File : test_monitor_class.py


-import asyncio
 import time
 from datetime import datetime
 from enum import Enum

 import requests
-from aiohttp import ClientSession
-from aiosocksy.connector import ProxyClientRequest, ProxyConnector
 from apscheduler.schedulers.blocking import BlockingScheduler
 from redis import Redis
 from rediscluster import RedisCluster
+import asyncio
+from urllib.parse import urlparse
+from aiohttp import ClientSession, ClientResponse
+from typing import List, Dict, Any, Tuple, Optional
+
+# Only import SOCKS support when it is needed (avoids depending on aiohttp-socks when no proxy is used)
+try:
+    from aiohttp_socks import ProxyConnector
+
+    HAS_SOCKS = True
+except ImportError:
+    HAS_SOCKS = False


 # Timeout setting

@@ -101,7 +110,7 @@ class Task(object):
     """
     Task scheduling and distribution class: cron-style, cyclic, and one-off distribution
     """
-
+
     def __init__(self, db_client=None, *, db_host: str = None, db_port: int = 6379, db_cluster: bool = False,
                  db_password: str = None):
         """

@@ -117,7 +126,7 @@ class Task(object):
         else:
             redis_type = RedisCluster if db_cluster else Redis
             self.db_client = redis_type(host=db_host, port=db_port, password=db_password)
-
+
     def cron_job(self, que_name: str, task_data: list, *, hour: str = None, minute: str = None, second: str = None):
         """
         Distribute tasks on a schedule; the task queue must be a list

@@ -137,7 +146,7 @@ class Task(object):
             args=[que_name, task_data]
         )
         scheduler.start()
-
+
     def cyclic_job(self, que_name: str, task_data: list, second: int):
         """
         Distribute tasks in a loop; the task queue must be a list

@@ -150,7 +159,7 @@ class Task(object):
             self.db_client.rpush(que_name, *task_data)
             print('task distribution complete...')
             time.sleep(second)
-
+
     def single_job(self, que_name: str, task_data: list, que_type: str = 'list', sort: str = 'r'):
         """
         Distribute tasks once; the queue can be a list or a set

@@ -169,7 +178,7 @@ class Mrequest(object):
     """
     Request-sending class, with synchronous and asynchronous modes
     """
-
+
     def __init__(self, db_client=None, statistic=False, *, db_host: str = None, db_port: int = 6379,
                  db_cluster: bool = False,
                  db_password: str = None):

@@ -188,7 +197,7 @@ class Mrequest(object):
         else:
             redis_type = RedisCluster if db_cluster else Redis
             self.db_client = redis_type(host=db_host, port=db_port, password=db_password)
-
+
     def mrequest(self, urls: str, method: str = 'get', record_dict: dict = dict, step: int = 1, **kwargs):
         """
         Send a synchronous request via requests; the parameters are the target URL, HTTP method, counter dict, and counter step

@@ -216,58 +225,92 @@ class Mrequest(object):
                 spider_record(self.db_client, record_dict, step, False)
                 print('失败计数加一')
             return Response
-
-    def aio_request(
-            […]
+
+    def aio_request(
+            self,
+            urls: List[str],
+            record_dict: Dict[str, Any] = None,
+            response_model: str = 'html',
+            step: int = 1,
+            cookies: Optional[Dict] = None,
+            proxy_url: Optional[str] = None,  # supported: http://, https://, socks4://, socks5://
+            **kwargs
+    ) -> List[Tuple[Any, str, Optional[ClientResponse]]]:
         """
-        […]
+        Send requests asynchronously, adaptively supporting HTTP/HTTPS and SOCKS4/SOCKS5 proxies.
+        - HTTP/HTTPS proxies: aiohttp's native proxy parameter
+        - SOCKS proxies: aiohttp-socks (must be installed)
         """
-        if […]
-        […]
-        async def async_request(url, cookies=None, **kwargs):
-            […]
+        if record_dict is None:
+            record_dict = {}
+
+        async def async_request(url: str, cookies: Optional[Dict] = None, **kwargs):
+            # Work out the proxy type
+            use_socks = False
+            if proxy_url:
+                parsed = urlparse(proxy_url)
+                if parsed.scheme in ('socks4', 'socks5'):
+                    use_socks = True
+                elif parsed.scheme not in ('http', 'https'):
+                    raise ValueError(f"Unsupported proxy scheme: {parsed.scheme}. Use http, https, socks4, or socks5.")
+
+            try:
+                if use_socks:
+                    # Go through a SOCKS proxy
+                    if not HAS_SOCKS:
+                        raise RuntimeError(
+                            "SOCKS proxy requires 'aiohttp-socks' package. Install with: pip install aiohttp-socks")
+                    connector = ProxyConnector.from_url(proxy_url)
+                    async with ClientSession(connector=connector, cookies=cookies) as session:
+                        async with session.get(url, **kwargs) as response:
+                            return await _process_response(response, url)
+                else:
+                    # HTTP/HTTPS proxy, or no proxy at all
+                    async with ClientSession(cookies=cookies) as session:
+                        async with session.get(url, proxy=proxy_url, **kwargs) as response:
+                            return await _process_response(response, url)
+
+            except Exception as e:
+                if self.statistic and 'component_name' in record_dict:
+                    component_name = record_dict['component_name']
+                    record_dict['record_info'] = f'{component_name}_request_fail'
+                    spider_record(self.db_client, record_dict, step, False)
+                import traceback
+                print(f"Error fetching {url} via {proxy_url or 'direct'}: {e}\n{traceback.format_exc()}")
+                return '', url, None
+
+        async def _process_response(response: ClientResponse, url: str):
+            """Handle the response content uniformly"""
+            if self.statistic and 'component_name' in record_dict:
+                component_name = record_dict['component_name']
+                if response.status == 200:
+                    record_dict['record_info'] = f'{component_name}_request_success'
+                else:
+                    record_dict['record_info'] = f'{component_name}_request_fail'
+                spider_record(self.db_client, record_dict, step, False)
+
+            if response_model == 'json':
+                content = await response.json()
+            elif response_model == 'bytes':
+                content = await response.read()
+            elif response_model == 'html':
+                content = await response.text()
             else:
-                    […]
-                    if self.statistic: spider_record(self.db_client, record_dict, step, False)
-                    if response_model == 'json':
-                        return await response.json(), str(response.url), response
-                    elif response_model == 'bytes':
-                        return await response.read(), str(response.url), response
-                    elif response_model == 'html':
-                        return await response.text(), str(response.url), response
-                    else:
-                        return response, str(response.url), response
-            except Exception as e:
-                if self.statistic:
-                    record_dict['record_info'] = '{}_request_fail'.format(component_name)
-                    spider_record(self.db_client, record_dict, step, False)
-                import traceback
-                print(traceback.format_exc(), url)
-                return '', url, Response
-
-        tasks = [asyncio.ensure_future(async_request(i, cookies=cookies, **kwargs)) for i in urls]
-        loop = asyncio.get_event_loop()
-        loop.run_until_complete(asyncio.wait(tasks))
-        del loop
-        return [i.result() for i in tasks]
+                content = response
+
+            return content, str(response.url), response
+
+        # Run all the tasks
+        tasks = [async_request(url, cookies=cookies, **kwargs) for url in urls]
+        results = asyncio.run(asyncio.gather(*tasks, return_exceptions=False))
+        return results


 class Parse(object):
     """
     Source-data parsing class; subclass it and override the analysis_data method
     """
-
+
     def __init__(self, db_client=None, statistic=False, *, db_host: str = None, db_port: int = 6379,
                  db_cluster: bool = False,
                  db_password: str = None):

@@ -277,7 +320,7 @@ class Parse(object):
         else:
             redis_type = RedisCluster if db_cluster else Redis
             self.db_client = redis_type(host=db_host, port=db_port, password=db_password)
-
+
     # @time_counter
     def analysis_data(self, source_code, record_dict: dict = dict, step: int = 1, *args, **kwargs):
         """

@@ -293,7 +336,7 @@ class Parse(object):


 class Monitor(object):
-
+
     def __init__(self, rds=None, statistic=False, *, redis_host: str = None, redis_port: int = 6379,
                  redis_cluster: bool = False,
                  redis_password: str = None):

@@ -303,11 +346,11 @@ class Monitor(object):
         else:
             redis_type = RedisCluster if redis_cluster else Redis
             self.rds = redis_type(host=redis_host, port=redis_port, password=redis_password)
-
+
     def log(self):
         """Log statistics"""
         ...
-
+
     # Prefix collected big keys with the date, then delete the collected keys
     def statistics_count(self, summary_keys: dict, *, hour: str = None, minute: str = None, second: str = None):
         """

@@ -318,7 +361,7 @@ class Monitor(object):
         :param second:
         :return:
         """
-
+
         def summary_count(summary_keys: dict):
             today_date = get_today_str()
             for summary_key, big_keys in summary_keys.items():

@@ -328,7 +371,7 @@ class Monitor(object):
                 if today_date not in summary_key:
                     summary_key = today_date + '_' + summary_key
                 self.rds.hset(summary_key, k, v)
-
+
         scheduler = BlockingScheduler(timezone="Asia/Shanghai")
         scheduler.add_job(
             summary_count, 'cron',
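
The rewritten aio_request dispatches on the proxy URL scheme, so one call site covers direct, HTTP(S)-proxied, and SOCKS-proxied fetches. A minimal usage sketch (the Redis host and proxy addresses below are placeholders; the SOCKS call assumes aiohttp-socks is installed):

```python
# Hypothetical usage of the new proxy-adaptive aio_request; host and proxy values are placeholders.
from ODtools.monitor_tools import Mrequest

req = Mrequest(db_host='127.0.0.1', db_port=6379)

# Direct fetches (no proxy); returns a list of (content, url, response) tuples.
pages = req.aio_request(['https://example.com/a', 'https://example.com/b'])

# Through an HTTP proxy: handled by aiohttp's native proxy= parameter.
pages = req.aio_request(['https://example.com/a'], proxy_url='http://127.0.0.1:8080')

# Through a SOCKS5 proxy: routed via aiohttp_socks.ProxyConnector.from_url().
pages = req.aio_request(['https://example.com/api'],
                        proxy_url='socks5://127.0.0.1:1080',
                        response_model='json')
```

Note that the method drives the event loop itself via asyncio.run, so it is meant to be called from synchronous code, not from inside an already-running event loop.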

{ODtools-2.1.21 → odtools-2.2.1}/ODtools/monitor_tools_new.py +104 -127

@@ -4,18 +4,26 @@
 # @File : test_monitor_class.py


-import asyncio
-import json
 import time
 from datetime import datetime
 from enum import Enum

 import requests
-from aiohttp import ClientSession
-from aiosocksy.connector import ProxyClientRequest, ProxyConnector
 from apscheduler.schedulers.blocking import BlockingScheduler
 from redis import Redis
 from rediscluster import RedisCluster
+import asyncio
+from urllib.parse import urlparse
+from aiohttp import ClientSession, ClientResponse
+from typing import List, Dict, Any, Tuple, Optional
+
+# Only import SOCKS support when it is needed (avoids depending on aiohttp-socks when no proxy is used)
+try:
+    from aiohttp_socks import ProxyConnector
+
+    HAS_SOCKS = True
+except ImportError:
+    HAS_SOCKS = False


 # Timeout setting

@@ -54,28 +62,6 @@ def spider_record(client, record_dict: dict = dict, step: int = 1, del_record: b
     if not client.ttl(record_info): client.expireat(record_info, get_tomorrow_timestamp())


-# Mutating the redis hash counter
-def spider_proxy_record(client, record_dict: dict = dict, del_record: bool = False):
-    """
-    Increase and decrease the redis counters to control how many redis entries are used
-    {"count":0,“bandwidth”:1}
-    :param client: database connection object
-    :param record_dict: shaped like {'general_info': xxx,"project_name":1,"source_name":"Weibo", 'proxy_info': {"111.114.114.114":1"}}; the dict keys must be general_info and record_info
-    :return:
-    """
-    source_name = record_dict['source_name']
-    proxy_info = record_dict['proxy_info']
-    if not isinstance(source_name, Source): raise TypeError('error data type source_name')
-    proxy_key = '{}:Proxy'.format(source_name.value)
-    for k, v in proxy_info.items():
-        key_value = client.hget(proxy_key, k)
-        if not key_value:
-            key_value = """{"count":"0"}"""
-        key_value = json.loads(key_value)
-        key_value.update({"count": int(key_value.get("count")) + v, })
-        client.hset(proxy_key, k, json.dumps(key_value, ensure_ascii=False))
-
-
 def get_tomorrow_timestamp() -> int:
     """
     Get the timestamp of midnight the next day

@@ -108,27 +94,23 @@ class Source(Enum):
     WECHAT = 'Wechat'
     DOUYIN = 'Douyin'
     NEWS = 'News'
-    SD = 'SD'
     TIEBA = 'Tieba'
     APP = 'App'
     HOTLIST = 'HotList'
     TOUTIAO = 'Toutiao'
-    BAIDU_BAIJIAHAO = 'baidu_baijihao'
     OTHER = 'OtherSource'


 class Response(Enum):
     """Placeholder for a failed request"""
     status_code = 500
-    status = 500
-    headers = {}


 class Task(object):
     """
     Task scheduling and distribution class: cron-style, cyclic, and one-off distribution
     """
-
+
     def __init__(self, db_client=None, *, db_host: str = None, db_port: int = 6379, db_cluster: bool = False,
                  db_password: str = None):
         """

@@ -144,7 +126,7 @@ class Task(object):
         else:
             redis_type = RedisCluster if db_cluster else Redis
             self.db_client = redis_type(host=db_host, port=db_port, password=db_password)
-
+
     def cron_job(self, que_name: str, task_data: list, *, hour: str = None, minute: str = None, second: str = None):
         """
         Distribute tasks on a schedule; the task queue must be a list

@@ -164,7 +146,7 @@ class Task(object):
             args=[que_name, task_data]
         )
         scheduler.start()
-
+
     def cyclic_job(self, que_name: str, task_data: list, second: int):
         """
         Distribute tasks in a loop; the task queue must be a list

@@ -177,7 +159,7 @@ class Task(object):
             self.db_client.rpush(que_name, *task_data)
             print('task distribution complete...')
             time.sleep(second)
-
+
     def single_job(self, que_name: str, task_data: list, que_type: str = 'list', sort: str = 'r'):
         """
         Distribute tasks once; the queue can be a list or a set

@@ -196,7 +178,7 @@ class Mrequest(object):
     """
     Request-sending class, with synchronous and asynchronous modes
     """
-
+
     def __init__(self, db_client=None, statistic=False, *, db_host: str = None, db_port: int = 6379,
                  db_cluster: bool = False,
                  db_password: str = None):

@@ -215,7 +197,7 @@ class Mrequest(object):
         else:
             redis_type = RedisCluster if db_cluster else Redis
             self.db_client = redis_type(host=db_host, port=db_port, password=db_password)
-
+
     def mrequest(self, urls: str, method: str = 'get', record_dict: dict = dict, step: int = 1, **kwargs):
         """
         Send a synchronous request via requests; the parameters are the target URL, HTTP method, counter dict, and counter step

@@ -228,112 +210,107 @@ class Mrequest(object):
             request_method = getattr(requests, method.lower())
         except AttributeError:
             raise AttributeError('no method named {}'.format(method))
-        """Count each request once; record all requests"""
-        if self.statistic: record_dict['record_info'] = '{}_request_all'.format(component_name)
-        if self.statistic: spider_record(self.db_client, record_dict, step, False)
         try:
             response = request_method(urls, **kwargs)
-            if "proxies" in kwargs.keys():
-                """Apply the proxy in-use limit"""
-                proxy_ip = kwargs.get("proxies").get("http")
-                if self.statistic: record_dict['proxy_info'] = {proxy_ip.split("//")[-1]: 1} if proxy_ip else {"": 1}
-                if self.statistic: spider_proxy_record(self.db_client, record_dict, False)
-
-                if self.statistic: record_dict['record_info'] = '{}_request_proxy'.format(component_name)
-                if self.statistic: spider_record(self.db_client, record_dict, step, False)
-
             if response.status_code == 200:
                 if self.statistic: record_dict['record_info'] = '{}_request_success'.format(component_name)
-                if self.statistic: spider_record(self.db_client, record_dict, step, False)
-                """Record the response payload size, counted in bytes"""
-                if self.statistic: record_dict['record_info'] = '{}_request_content_length'.format(component_name)
-                content_length = eval(response.headers.get("Content-Length", "0"))
-                if self.statistic: spider_record(self.db_client, record_dict,
-                                                 content_length if content_length else len(response.content),
-                                                 False)
             else:
                 if self.statistic: record_dict['record_info'] = '{}_request_fail'.format(component_name)
-
+            if self.statistic: spider_record(self.db_client, record_dict, step, False)
             return response
         except Exception as e:
-
-            print("请求出错,原因 {}".format(traceback.format_exc()))
+            print(e)
             if self.statistic:
                 record_dict['record_info'] = '{}_request_fail'.format(component_name)
                 spider_record(self.db_client, record_dict, step, False)
-                print('失败计数加一')
+                print('失败计数加一')
             return Response
-
-    […]
+
+    def aio_request(
+            self,
+            urls: List[str],
+            record_dict: Dict[str, Any] = None,
+            response_model: str = 'html',
+            step: int = 1,
+            cookies: Optional[Dict] = None,
+            proxy_url: Optional[str] = None,  # supported: http://, https://, socks4://, socks5://
+            **kwargs
+    ) -> List[Tuple[Any, str, Optional[ClientResponse]]]:
         """
-        […]
+        Send requests asynchronously, adaptively supporting HTTP/HTTPS and SOCKS4/SOCKS5 proxies.
+        - HTTP/HTTPS proxies: aiohttp's native proxy parameter
+        - SOCKS proxies: aiohttp-socks (must be installed)
         """
-        if […]
-        […]
-        async def async_request(url, cookies=None, **kwargs):
-            […]
+        if record_dict is None:
+            record_dict = {}
+
+        async def async_request(url: str, cookies: Optional[Dict] = None, **kwargs):
+            # Work out the proxy type
+            use_socks = False
+            if proxy_url:
+                parsed = urlparse(proxy_url)
+                if parsed.scheme in ('socks4', 'socks5'):
+                    use_socks = True
+                elif parsed.scheme not in ('http', 'https'):
+                    raise ValueError(f"Unsupported proxy scheme: {parsed.scheme}. Use http, https, socks4, or socks5.")
+
+            try:
+                if use_socks:
+                    # Go through a SOCKS proxy
+                    if not HAS_SOCKS:
+                        raise RuntimeError(
+                            "SOCKS proxy requires 'aiohttp-socks' package. Install with: pip install aiohttp-socks")
+                    connector = ProxyConnector.from_url(proxy_url)
+                    async with ClientSession(connector=connector, cookies=cookies) as session:
+                        async with session.get(url, **kwargs) as response:
+                            return await _process_response(response, url)
+                else:
+                    # HTTP/HTTPS proxy, or no proxy at all
+                    async with ClientSession(cookies=cookies) as session:
+                        async with session.get(url, proxy=proxy_url, **kwargs) as response:
+                            return await _process_response(response, url)
+
+            except Exception as e:
+                if self.statistic and 'component_name' in record_dict:
+                    component_name = record_dict['component_name']
+                    record_dict['record_info'] = f'{component_name}_request_fail'
+                    spider_record(self.db_client, record_dict, step, False)
+                import traceback
+                print(f"Error fetching {url} via {proxy_url or 'direct'}: {e}\n{traceback.format_exc()}")
+                return '', url, None
+
+        async def _process_response(response: ClientResponse, url: str):
+            """Handle the response content uniformly"""
+            if self.statistic and 'component_name' in record_dict:
+                component_name = record_dict['component_name']
+                if response.status == 200:
+                    record_dict['record_info'] = f'{component_name}_request_success'
+                else:
+                    record_dict['record_info'] = f'{component_name}_request_fail'
+                spider_record(self.db_client, record_dict, step, False)

+            if response_model == 'json':
+                content = await response.json()
+            elif response_model == 'bytes':
+                content = await response.read()
+            elif response_model == 'html':
+                content = await response.text()
             else:
-                    […]
-                    if self.statistic: spider_record(self.db_client, record_dict, step, False)
-                    if self.statistic: record_dict['record_info'] = '{}_request_content_length'.format(
-                        component_name)
-                    content_length = eval(response.headers.get("Content-Length", "0"))
-                    res_content = await response.read()
-                    if self.statistic: spider_record(self.db_client, record_dict,
-                                                     content_length if content_length else len(res_content),
-                                                     False)
-                    if response_model == 'json':
-                        return await response.json(), str(response.url), response
-                    elif response_model == 'bytes':
-                        return await response.read(), str(response.url), response
-                    elif response_model == 'html':
-                        return await response.text(), str(response.url), response
-                    else:
-                        return await response, str(response.url), response
-            except Exception as e:
-                if self.statistic:
-                    record_dict['record_info'] = '{}_request_fail'.format(component_name)
-                    spider_record(self.db_client, record_dict, step, False)
-                import traceback
-                print(traceback.format_exc(), url)
-                if self.statistic: record_dict['record_info'] = '{}_request_content_length'.format(component_name)
-                content_length = 0
-                res_content = ""
-                if self.statistic: spider_record(self.db_client, record_dict,
-                                                 content_length if content_length else len(res_content),
-                                                 False)
-                return '', url, Response
-
-        tasks = [asyncio.ensure_future(async_request(i, cookies=cookies, **kwargs)) for i in urls]
-        loop = asyncio.get_event_loop()
-        loop.run_until_complete(asyncio.wait(tasks))
-        del loop
-        return [i.result() for i in tasks]
+                content = response
+
+            return content, str(response.url), response
+
+        # Run all the tasks
+        tasks = [async_request(url, cookies=cookies, **kwargs) for url in urls]
+        results = asyncio.run(asyncio.gather(*tasks, return_exceptions=False))
+        return results


 class Parse(object):
     """
     Source-data parsing class; subclass it and override the analysis_data method
     """
-
+
     def __init__(self, db_client=None, statistic=False, *, db_host: str = None, db_port: int = 6379,
                  db_cluster: bool = False,
                  db_password: str = None):

@@ -343,7 +320,7 @@ class Parse(object):
         else:
             redis_type = RedisCluster if db_cluster else Redis
             self.db_client = redis_type(host=db_host, port=db_port, password=db_password)
-
+
     # @time_counter
     def analysis_data(self, source_code, record_dict: dict = dict, step: int = 1, *args, **kwargs):
         """

@@ -359,7 +336,7 @@ class Parse(object):


 class Monitor(object):
-
+
     def __init__(self, rds=None, statistic=False, *, redis_host: str = None, redis_port: int = 6379,
                  redis_cluster: bool = False,
                  redis_password: str = None):

@@ -369,11 +346,11 @@ class Monitor(object):
         else:
             redis_type = RedisCluster if redis_cluster else Redis
             self.rds = redis_type(host=redis_host, port=redis_port, password=redis_password)
-
+
     def log(self):
         """Log statistics"""
         ...
-
+
     # Prefix collected big keys with the date, then delete the collected keys
     def statistics_count(self, summary_keys: dict, *, hour: str = None, minute: str = None, second: str = None):
         """

@@ -384,7 +361,7 @@ class Monitor(object):
         :param second:
         :return:
         """
-
+
         def summary_count(summary_keys: dict):
             today_date = get_today_str()
             for summary_key, big_keys in summary_keys.items():

@@ -394,7 +371,7 @@ class Monitor(object):
                 if today_date not in summary_key:
                     summary_key = today_date + '_' + summary_key
                 self.rds.hset(summary_key, k, v)
-
+
         scheduler = BlockingScheduler(timezone="Asia/Shanghai")
         scheduler.add_job(
             summary_count, 'cron',
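
Both modules share the same aiosocksy → aiohttp-socks migration, which boils down to building the session's connector from the proxy URL. A standalone sketch of that pattern, with a placeholder proxy address:

```python
# Standalone sketch of the SOCKS pattern this release adopts; the proxy address is a placeholder.
import asyncio

from aiohttp import ClientSession
from aiohttp_socks import ProxyConnector  # replacement for the removed aiosocksy dependency


async def fetch_via_socks(url: str, proxy_url: str = 'socks5://127.0.0.1:1080') -> str:
    # ProxyConnector.from_url accepts socks4://, socks5://, and http:// proxy URLs.
    connector = ProxyConnector.from_url(proxy_url)
    async with ClientSession(connector=connector) as session:
        async with session.get(url) as response:
            return await response.text()


if __name__ == '__main__':
    print(asyncio.run(fetch_via_socks('https://example.com')))
```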

{ODtools-2.1.21 → odtools-2.2.1}/ODtools.egg-info/PKG-INFO +27 -6

@@ -1,12 +1,11 @@
-Metadata-Version: 2.[…]
+Metadata-Version: 2.4
 Name: ODtools
-Version: 2.1.21
+Version: 2.2.1
 Summary: zkrTools
 Home-page: https://github.com/zkr-origin-data-dpt/ODtools
 Author: zkrPython
 Author-email: 178031608@qq.com
 License: Apache License
-Platform: UNKNOWN
 Classifier: Environment :: Web Environment
 Classifier: Intended Audience :: Developers
 Classifier: Operating System :: OS Independent

@@ -23,6 +22,28 @@ Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: 3.7
 Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
-[…]
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: xlrd
+Requires-Dist: xlwt
+Requires-Dist: elasticsearch==8.2.0
+Requires-Dist: thrift
+Requires-Dist: kafka-python
+Requires-Dist: redis-py-cluster
+Requires-Dist: pymysql
+Requires-Dist: loguru
+Requires-Dist: colorlog
+Requires-Dist: aiohttp-socks
+Requires-Dist: apscheduler
+Requires-Dist: wheel
+Requires-Dist: twine
+Requires-Dist: PySocks
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: home-page
+Dynamic: license
+Dynamic: requires-dist
+Dynamic: summary

{ODtools-2.1.21 → odtools-2.2.1}/PKG-INFO +27 -6

@@ -1,12 +1,11 @@
-Metadata-Version: 2.[…]
+Metadata-Version: 2.4
 Name: ODtools
-Version: 2.1.21
+Version: 2.2.1
 Summary: zkrTools
 Home-page: https://github.com/zkr-origin-data-dpt/ODtools
 Author: zkrPython
 Author-email: 178031608@qq.com
 License: Apache License
-Platform: UNKNOWN
 Classifier: Environment :: Web Environment
 Classifier: Intended Audience :: Developers
 Classifier: Operating System :: OS Independent

@@ -23,6 +22,28 @@ Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: 3.7
 Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
-[…]
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: xlrd
+Requires-Dist: xlwt
+Requires-Dist: elasticsearch==8.2.0
+Requires-Dist: thrift
+Requires-Dist: kafka-python
+Requires-Dist: redis-py-cluster
+Requires-Dist: pymysql
+Requires-Dist: loguru
+Requires-Dist: colorlog
+Requires-Dist: aiohttp-socks
+Requires-Dist: apscheduler
+Requires-Dist: wheel
+Requires-Dist: twine
+Requires-Dist: PySocks
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: home-page
+Dynamic: license
+Dynamic: requires-dist
+Dynamic: summary
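
The new Requires-Dist entries expose the dependency set to standard metadata tooling. A quick way to confirm them once ODtools 2.2.1 is installed (a sketch using only the standard library):

```python
# Inspect the installed package's metadata; requires ODtools 2.2.1 to be installed.
from importlib.metadata import requires, version

print(version('ODtools'))        # expected: 2.2.1
for req in requires('ODtools'):  # mirrors the Requires-Dist lines above
    print(req)
```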

{ODtools-2.1.21 → odtools-2.2.1}/setup.py +10 -4

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages

 setup(
     name="ODtools",
-    version="2.1.21",
+    version="2.2.1",
     author="zkrPython",
     author_email="178031608@qq.com",
     description="zkrTools",

@@ -15,15 +15,17 @@ setup(
     install_requires=[
         "xlrd",
         "xlwt",
-        "elasticsearch==[…]
+        "elasticsearch==8.2.0",
         "thrift",
         "kafka-python",
-        "redis-py-cluster[…]
+        "redis-py-cluster",
         "pymysql",
         "loguru",
         "colorlog",
-        "[…]
+        "aiohttp-socks",
         "apscheduler",
+        "wheel",
+        "twine",
         "PySocks",  # added dependency: SOCKS proxy support for requests
     ],
     classifiers=[

@@ -43,5 +45,9 @@ setup(
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
+        'Programming Language :: Python :: 3.13',
     ],
 )

42 files without changes.
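
Because SOCKS support now sits behind an import guard, a small post-install check can confirm whether the optional path is active (a sketch; assumes the upgraded package imports cleanly):

```python
# Post-upgrade sanity check: HAS_SOCKS is the module-level flag set by the new import guard.
from ODtools import monitor_tools

print(monitor_tools.HAS_SOCKS)  # True once aiohttp-socks is importable
```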