ODtools 2.2.0__tar.gz → 2.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {odtools-2.2.0 → odtools-2.2.1}/ODtools/monitor_tools.py +101 -58
  2. {odtools-2.2.0 → odtools-2.2.1}/ODtools/monitor_tools_new.py +104 -127
  3. {odtools-2.2.0 → odtools-2.2.1}/ODtools.egg-info/PKG-INFO +3 -1
  4. {odtools-2.2.0 → odtools-2.2.1}/ODtools.egg-info/requires.txt +2 -0
  5. {odtools-2.2.0 → odtools-2.2.1}/PKG-INFO +3 -1
  6. {odtools-2.2.0 → odtools-2.2.1}/setup.py +3 -1
  7. {odtools-2.2.0 → odtools-2.2.1}/ODtools/__init__.py +0 -0
  8. {odtools-2.2.0 → odtools-2.2.1}/ODtools/bloom_filter_tools.py +0 -0
  9. {odtools-2.2.0 → odtools-2.2.1}/ODtools/excel_tools.py +0 -0
  10. {odtools-2.2.0 → odtools-2.2.1}/ODtools/fastdfs_tools.py +0 -0
  11. {odtools-2.2.0 → odtools-2.2.1}/ODtools/fdfs_client/__init__.py +0 -0
  12. {odtools-2.2.0 → odtools-2.2.1}/ODtools/fdfs_client/client.py +0 -0
  13. {odtools-2.2.0 → odtools-2.2.1}/ODtools/fdfs_client/connection.py +0 -0
  14. {odtools-2.2.0 → odtools-2.2.1}/ODtools/fdfs_client/exceptions.py +0 -0
  15. {odtools-2.2.0 → odtools-2.2.1}/ODtools/fdfs_client/fdfs_protol.py +0 -0
  16. {odtools-2.2.0 → odtools-2.2.1}/ODtools/fdfs_client/fdfs_test.py +0 -0
  17. {odtools-2.2.0 → odtools-2.2.1}/ODtools/fdfs_client/storage_client.py +0 -0
  18. {odtools-2.2.0 → odtools-2.2.1}/ODtools/fdfs_client/tracker_client.py +0 -0
  19. {odtools-2.2.0 → odtools-2.2.1}/ODtools/fdfs_client/utils.py +0 -0
  20. {odtools-2.2.0 → odtools-2.2.1}/ODtools/hbase_client/THBaseService.py +0 -0
  21. {odtools-2.2.0 → odtools-2.2.1}/ODtools/hbase_client/__init__.py +0 -0
  22. {odtools-2.2.0 → odtools-2.2.1}/ODtools/hbase_client/constants.py +0 -0
  23. {odtools-2.2.0 → odtools-2.2.1}/ODtools/hbase_client/ttypes.py +0 -0
  24. {odtools-2.2.0 → odtools-2.2.1}/ODtools/hbase_tools.py +0 -0
  25. {odtools-2.2.0 → odtools-2.2.1}/ODtools/kafka_tools.py +0 -0
  26. {odtools-2.2.0 → odtools-2.2.1}/ODtools/log_tools.py +0 -0
  27. {odtools-2.2.0 → odtools-2.2.1}/ODtools/redis_tools.py +0 -0
  28. {odtools-2.2.0 → odtools-2.2.1}/ODtools/request_headers.py +0 -0
  29. {odtools-2.2.0 → odtools-2.2.1}/ODtools/save_data_class.py +0 -0
  30. {odtools-2.2.0 → odtools-2.2.1}/ODtools/scrapy_redis/__init__.py +0 -0
  31. {odtools-2.2.0 → odtools-2.2.1}/ODtools/scrapy_redis/connection.py +0 -0
  32. {odtools-2.2.0 → odtools-2.2.1}/ODtools/scrapy_redis/defaults.py +0 -0
  33. {odtools-2.2.0 → odtools-2.2.1}/ODtools/scrapy_redis/dupefilter.py +0 -0
  34. {odtools-2.2.0 → odtools-2.2.1}/ODtools/scrapy_redis/picklecompat.py +0 -0
  35. {odtools-2.2.0 → odtools-2.2.1}/ODtools/scrapy_redis/pipelines.py +0 -0
  36. {odtools-2.2.0 → odtools-2.2.1}/ODtools/scrapy_redis/queue.py +0 -0
  37. {odtools-2.2.0 → odtools-2.2.1}/ODtools/scrapy_redis/scheduler.py +0 -0
  38. {odtools-2.2.0 → odtools-2.2.1}/ODtools/scrapy_redis/spiders.py +0 -0
  39. {odtools-2.2.0 → odtools-2.2.1}/ODtools/scrapy_redis/utils.py +0 -0
  40. {odtools-2.2.0 → odtools-2.2.1}/ODtools/singleton_tools.py +0 -0
  41. {odtools-2.2.0 → odtools-2.2.1}/ODtools/time_counter.py +0 -0
  42. {odtools-2.2.0 → odtools-2.2.1}/ODtools/timeit_counter.py +0 -0
  43. {odtools-2.2.0 → odtools-2.2.1}/ODtools.egg-info/SOURCES.txt +0 -0
  44. {odtools-2.2.0 → odtools-2.2.1}/ODtools.egg-info/dependency_links.txt +0 -0
  45. {odtools-2.2.0 → odtools-2.2.1}/ODtools.egg-info/top_level.txt +0 -0
  46. {odtools-2.2.0 → odtools-2.2.1}/ODtools.egg-info/zip-safe +0 -0
  47. {odtools-2.2.0 → odtools-2.2.1}/README.md +0 -0
  48. {odtools-2.2.0 → odtools-2.2.1}/setup.cfg +0 -0
{odtools-2.2.0 → odtools-2.2.1}/ODtools/monitor_tools.py

@@ -4,17 +4,26 @@
 # @File : test_monitor_class.py


-import asyncio
 import time
 from datetime import datetime
 from enum import Enum

 import requests
-from aiohttp import ClientSession
-from aiosocksy.connector import ProxyClientRequest, ProxyConnector
 from apscheduler.schedulers.blocking import BlockingScheduler
 from redis import Redis
 from rediscluster import RedisCluster
+import asyncio
+from urllib.parse import urlparse
+from aiohttp import ClientSession, ClientResponse
+from typing import List, Dict, Any, Tuple, Optional
+
+# Only import SOCKS support when it is needed (avoids requiring aiohttp-socks when no proxy is used)
+try:
+    from aiohttp_socks import ProxyConnector
+
+    HAS_SOCKS = True
+except ImportError:
+    HAS_SOCKS = False


 # Timeout settings
@@ -101,7 +110,7 @@ class Task(object):
     """
     Task scheduling and distribution class: supports scheduled (cron), cyclic, and one-off task distribution
     """
-
+
    def __init__(self, db_client=None, *, db_host: str = None, db_port: int = 6379, db_cluster: bool = False,
                 db_password: str = None):
        """
@@ -117,7 +126,7 @@ class Task(object):
         else:
             redis_type = RedisCluster if db_cluster else Redis
             self.db_client = redis_type(host=db_host, port=db_port, password=db_password)
-
+
     def cron_job(self, que_name: str, task_data: list, *, hour: str = None, minute: str = None, second: str = None):
         """
         Scheduled (cron) task distribution; the task queue must be a Redis list
@@ -137,7 +146,7 @@ class Task(object):
             args=[que_name, task_data]
         )
         scheduler.start()
-
+
     def cyclic_job(self, que_name: str, task_data: list, second: int):
         """
         Cyclic task distribution; the task queue must be a Redis list
@@ -150,7 +159,7 @@ class Task(object):
             self.db_client.rpush(que_name, *task_data)
             print('task distribution complete...')
             time.sleep(second)
-
+
     def single_job(self, que_name: str, task_data: list, que_type: str = 'list', sort: str = 'r'):
         """
         One-off task distribution; the queue can be a list or a set
@@ -169,7 +178,7 @@ class Mrequest(object):
     """
     Request-sending class, with synchronous and asynchronous variants
     """
-
+
     def __init__(self, db_client=None, statistic=False, *, db_host: str = None, db_port: int = 6379,
                  db_cluster: bool = False,
                  db_password: str = None):
@@ -188,7 +197,7 @@ class Mrequest(object):
         else:
             redis_type = RedisCluster if db_cluster else Redis
             self.db_client = redis_type(host=db_host, port=db_port, password=db_password)
-
+
     def mrequest(self, urls: str, method: str = 'get', record_dict: dict = dict, step: int = 1, **kwargs):
         """
         Send a synchronous request via requests; the parameters are the target URL, the HTTP method, the counter dict, and the counter step
@@ -216,58 +225,92 @@ class Mrequest(object):
                 spider_record(self.db_client, record_dict, step, False)
                 print('failure count +1')
             return Response
-
-    def aio_request(self, urls: list, record_dict: dict = dict, response_model: str = 'html', step: int = 1,
-                    cookies=None, **kwargs):
+
+    def aio_request(
+            self,
+            urls: List[str],
+            record_dict: Dict[str, Any] = None,
+            response_model: str = 'html',
+            step: int = 1,
+            cookies: Optional[Dict] = None,
+            proxy_url: Optional[str] = None,  # supports: http://, https://, socks4://, socks5://
+            **kwargs
+    ) -> List[Tuple[Any, str, Optional[ClientResponse]]]:
         """
-        Send requests asynchronously with aiohttp; the parameters are the target URL list, the counter dict, the response format, and the counter step
-        :return:
+        Send requests asynchronously, with adaptive support for HTTP/HTTPS and SOCKS4/SOCKS5 proxies.
+        - HTTP/HTTPS proxies: uses aiohttp's native proxy parameter
+        - SOCKS proxies: uses aiohttp-socks (must be installed)
         """
-        if self.statistic:
-            component_name = record_dict['component_name']
-
-        async def async_request(url, cookies=None, **kwargs):
-            if cookies:
-                clientSessionObj = ClientSession(connector=ProxyConnector(), request_class=ProxyClientRequest,
-                                                 cookies=cookies)
+        if record_dict is None:
+            record_dict = {}
+
+        async def async_request(url: str, cookies: Optional[Dict] = None, **kwargs):
+            # Determine the proxy type
+            use_socks = False
+            if proxy_url:
+                parsed = urlparse(proxy_url)
+                if parsed.scheme in ('socks4', 'socks5'):
+                    use_socks = True
+                elif parsed.scheme not in ('http', 'https'):
+                    raise ValueError(f"Unsupported proxy scheme: {parsed.scheme}. Use http, https, socks4, or socks5.")
+
+            try:
+                if use_socks:
+                    # Use a SOCKS proxy
+                    if not HAS_SOCKS:
+                        raise RuntimeError(
+                            "SOCKS proxy requires 'aiohttp-socks' package. Install with: pip install aiohttp-socks")
+                    connector = ProxyConnector.from_url(proxy_url)
+                    async with ClientSession(connector=connector, cookies=cookies) as session:
+                        async with session.get(url, **kwargs) as response:
+                            return await _process_response(response, url)
+                else:
+                    # Use an HTTP/HTTPS proxy, or no proxy at all
+                    async with ClientSession(cookies=cookies) as session:
+                        async with session.get(url, proxy=proxy_url, **kwargs) as response:
+                            return await _process_response(response, url)
+
+            except Exception as e:
+                if self.statistic and 'component_name' in record_dict:
+                    component_name = record_dict['component_name']
+                    record_dict['record_info'] = f'{component_name}_request_fail'
+                    spider_record(self.db_client, record_dict, step, False)
+                import traceback
+                print(f"Error fetching {url} via {proxy_url or 'direct'}: {e}\n{traceback.format_exc()}")
+                return '', url, None
+
+        async def _process_response(response: ClientResponse, url: str):
+            """Handle the response content uniformly"""
+            if self.statistic and 'component_name' in record_dict:
+                component_name = record_dict['component_name']
+                if response.status == 200:
+                    record_dict['record_info'] = f'{component_name}_request_success'
+                else:
+                    record_dict['record_info'] = f'{component_name}_request_fail'
+                spider_record(self.db_client, record_dict, step, False)
+
+            if response_model == 'json':
+                content = await response.json()
+            elif response_model == 'bytes':
+                content = await response.read()
+            elif response_model == 'html':
+                content = await response.text()
             else:
-                clientSessionObj = ClientSession(connector=ProxyConnector(), request_class=ProxyClientRequest)
-            async with clientSessionObj as session:
-                try:
-                    async with session.get(url, **kwargs) as response:
-                        if response.status == 200:
-                            if self.statistic: record_dict['record_info'] = '{}_request_success'.format(component_name)
-                        else:
-                            if self.statistic: record_dict['record_info'] = '{}_request_fail'.format(component_name)
-                        if self.statistic: spider_record(self.db_client, record_dict, step, False)
-                        if response_model == 'json':
-                            return await response.json(), str(response.url), response
-                        elif response_model == 'bytes':
-                            return await response.read(), str(response.url), response
-                        elif response_model == 'html':
-                            return await response.text(), str(response.url), response
-                        else:
-                            return response, str(response.url), response
-                except Exception as e:
-                    if self.statistic:
-                        record_dict['record_info'] = '{}_request_fail'.format(component_name)
-                        spider_record(self.db_client, record_dict, step, False)
-                    import traceback
-                    print(traceback.format_exc(), url)
-                    return '', url, Response
-
-        tasks = [asyncio.ensure_future(async_request(i, cookies=cookies, **kwargs)) for i in urls]
-        loop = asyncio.get_event_loop()
-        loop.run_until_complete(asyncio.wait(tasks))
-        del loop
-        return [i.result() for i in tasks]
+                content = response
+
+            return content, str(response.url), response
+
+        # Run all the tasks
+        tasks = [async_request(url, cookies=cookies, **kwargs) for url in urls]
+        results = asyncio.run(asyncio.gather(*tasks, return_exceptions=False))
+        return results


 class Parse(object):
     """
     Source-data parsing class; the intended usage is to subclass it and override analysis_data
     """
-
+
     def __init__(self, db_client=None, statistic=False, *, db_host: str = None, db_port: int = 6379,
                  db_cluster: bool = False,
                  db_password: str = None):
@@ -277,7 +320,7 @@ class Parse(object):
         else:
             redis_type = RedisCluster if db_cluster else Redis
             self.db_client = redis_type(host=db_host, port=db_port, password=db_password)
-
+
     # @time_counter
     def analysis_data(self, source_code, record_dict: dict = dict, step: int = 1, *args, **kwargs):
         """
@@ -293,7 +336,7 @@ class Parse(object):


 class Monitor(object):
-
+
     def __init__(self, rds=None, statistic=False, *, redis_host: str = None, redis_port: int = 6379,
                  redis_cluster: bool = False,
                  redis_password: str = None):
@@ -303,11 +346,11 @@ class Monitor(object):
         else:
             redis_type = RedisCluster if redis_cluster else Redis
             self.rds = redis_type(host=redis_host, port=redis_port, password=redis_password)
-
+
     def log(self):
         """Log statistics"""
         ...
-
+
     # Prefix the collected big keys with the date, then delete the collected big keys
     def statistics_count(self, summary_keys: dict, *, hour: str = None, minute: str = None, second: str = None):
         """
@@ -318,7 +361,7 @@ class Monitor(object):
         :param second:
         :return:
         """
-
+
         def summary_count(summary_keys: dict):
             today_date = get_today_str()
             for summary_key, big_keys in summary_keys.items():
@@ -328,7 +371,7 @@ class Monitor(object):
                 if today_date not in summary_key:
                     summary_key = today_date + '_' + summary_key
                 self.rds.hset(summary_key, k, v)
-
+
         scheduler = BlockingScheduler(timezone="Asia/Shanghai")
         scheduler.add_job(
             summary_count, 'cron',
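
The substantive change in this file is the rewritten aio_request: SOCKS proxy URLs are routed through an aiohttp-socks ProxyConnector, while HTTP/HTTPS proxies (or no proxy) use aiohttp's native proxy= argument; monitor_tools_new.py below receives the same rewrite. A standalone sketch of that dispatch logic, runnable outside the package (requires aiohttp, plus aiohttp-socks for the SOCKS branch; the SOCKS address in the final comment is a placeholder, not a real endpoint):

import asyncio
from urllib.parse import urlparse

from aiohttp import ClientSession

try:
    from aiohttp_socks import ProxyConnector
    HAS_SOCKS = True
except ImportError:
    HAS_SOCKS = False


async def fetch(url: str, proxy_url: str = None) -> str:
    # Same scheme check as the new aio_request
    scheme = urlparse(proxy_url).scheme if proxy_url else None
    if scheme in ('socks4', 'socks5'):
        if not HAS_SOCKS:
            raise RuntimeError("SOCKS proxies need aiohttp-socks: pip install aiohttp-socks")
        # SOCKS proxies require a dedicated connector
        async with ClientSession(connector=ProxyConnector.from_url(proxy_url)) as session:
            async with session.get(url) as response:
                return await response.text()
    # HTTP/HTTPS proxies (or no proxy) use aiohttp's built-in proxy= argument
    async with ClientSession() as session:
        async with session.get(url, proxy=proxy_url) as response:
            return await response.text()


# Direct request; swap in proxy_url='socks5://127.0.0.1:1080' to exercise the SOCKS branch
print(asyncio.run(fetch('https://example.com'))[:80])

The try/except import gate mirrors the diff: aiohttp-socks only becomes a hard requirement when a socks4:// or socks5:// URL is actually passed.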
{odtools-2.2.0 → odtools-2.2.1}/ODtools/monitor_tools_new.py

@@ -4,18 +4,26 @@
 # @File : test_monitor_class.py


-import asyncio
-import json
 import time
 from datetime import datetime
 from enum import Enum

 import requests
-from aiohttp import ClientSession
-from aiosocksy.connector import ProxyClientRequest, ProxyConnector
 from apscheduler.schedulers.blocking import BlockingScheduler
 from redis import Redis
 from rediscluster import RedisCluster
+import asyncio
+from urllib.parse import urlparse
+from aiohttp import ClientSession, ClientResponse
+from typing import List, Dict, Any, Tuple, Optional
+
+# Only import SOCKS support when it is needed (avoids requiring aiohttp-socks when no proxy is used)
+try:
+    from aiohttp_socks import ProxyConnector
+
+    HAS_SOCKS = True
+except ImportError:
+    HAS_SOCKS = False


 # Timeout settings
@@ -54,28 +62,6 @@ def spider_record(client, record_dict: dict = dict, step: int = 1, del_record: b
         if not client.ttl(record_info): client.expireat(record_info, get_tomorrow_timestamp())


-# Update the redis hash counter
-def spider_proxy_record(client, record_dict: dict = dict, del_record: bool = False):
-    """
-    Increment and decrement the redis counters, limiting how many proxies are in use
-    {"count":0,"bandwidth":1}
-    :param client: database connection object
-    :param record_dict: of the form {'general_info': xxx,"project_name":1,"source_name":"Weibo", 'proxy_info': {"111.114.114.114":1"}}; the dict keys must be general_info and record_info
-    :return:
-    """
-    source_name = record_dict['source_name']
-    proxy_info = record_dict['proxy_info']
-    if not isinstance(source_name, Source): raise TypeError('error data type source_name')
-    proxy_key = '{}:Proxy'.format(source_name.value)
-    for k, v in proxy_info.items():
-        key_value = client.hget(proxy_key, k)
-        if not key_value:
-            key_value = """{"count":"0"}"""
-        key_value = json.loads(key_value)
-        key_value.update({"count": int(key_value.get("count")) + v, })
-        client.hset(proxy_key, k, json.dumps(key_value, ensure_ascii=False))
-
-
 def get_tomorrow_timestamp() -> int:
     """
     Get the timestamp of midnight on the following day
@@ -108,27 +94,23 @@ class Source(Enum):
     WECHAT = 'Wechat'
     DOUYIN = 'Douyin'
     NEWS = 'News'
-    SD = 'SD'
     TIEBA = 'Tieba'
     APP = 'App'
     HOTLIST = 'HotList'
     TOUTIAO = 'Toutiao'
-    BAIDU_BAIJIAHAO = 'baidu_baijihao'
     OTHER = 'OtherSource'


 class Response(Enum):
     """Placeholder response for failed requests"""
     status_code = 500
-    status = 500
-    headers = {}


 class Task(object):
     """
     Task scheduling and distribution class: supports scheduled (cron), cyclic, and one-off task distribution
     """
-
+
     def __init__(self, db_client=None, *, db_host: str = None, db_port: int = 6379, db_cluster: bool = False,
                  db_password: str = None):
         """
@@ -144,7 +126,7 @@ class Task(object):
         else:
             redis_type = RedisCluster if db_cluster else Redis
             self.db_client = redis_type(host=db_host, port=db_port, password=db_password)
-
+
     def cron_job(self, que_name: str, task_data: list, *, hour: str = None, minute: str = None, second: str = None):
         """
         Scheduled (cron) task distribution; the task queue must be a Redis list
@@ -164,7 +146,7 @@ class Task(object):
             args=[que_name, task_data]
         )
         scheduler.start()
-
+
     def cyclic_job(self, que_name: str, task_data: list, second: int):
         """
         Cyclic task distribution; the task queue must be a Redis list
@@ -177,7 +159,7 @@ class Task(object):
             self.db_client.rpush(que_name, *task_data)
             print('task distribution complete...')
             time.sleep(second)
-
+
     def single_job(self, que_name: str, task_data: list, que_type: str = 'list', sort: str = 'r'):
         """
         One-off task distribution; the queue can be a list or a set
@@ -196,7 +178,7 @@ class Mrequest(object):
     """
     Request-sending class, with synchronous and asynchronous variants
     """
-
+
     def __init__(self, db_client=None, statistic=False, *, db_host: str = None, db_port: int = 6379,
                  db_cluster: bool = False,
                  db_password: str = None):
@@ -215,7 +197,7 @@ class Mrequest(object):
         else:
             redis_type = RedisCluster if db_cluster else Redis
             self.db_client = redis_type(host=db_host, port=db_port, password=db_password)
-
+
     def mrequest(self, urls: str, method: str = 'get', record_dict: dict = dict, step: int = 1, **kwargs):
         """
         Send a synchronous request via requests; the parameters are the target URL, the HTTP method, the counter dict, and the counter step
@@ -228,112 +210,107 @@ class Mrequest(object):
             request_method = getattr(requests, method.lower())
         except AttributeError:
             raise AttributeError('no method named {}'.format(method))
-        """Increment the per-request counter, recording every request"""
-        if self.statistic: record_dict['record_info'] = '{}_request_all'.format(component_name)
-        if self.statistic: spider_record(self.db_client, record_dict, step, False)
         try:
             response = request_method(urls, **kwargs)
-            if "proxies" in kwargs.keys():
-                """Mark the proxy as in use"""
-                proxy_ip = kwargs.get("proxies").get("http")
-                if self.statistic: record_dict['proxy_info'] = {proxy_ip.split("//")[-1]: 1} if proxy_ip else {"": 1}
-                if self.statistic: spider_proxy_record(self.db_client, record_dict, False)
-
-                if self.statistic: record_dict['record_info'] = '{}_request_proxy'.format(component_name)
-                if self.statistic: spider_record(self.db_client, record_dict, step, False)
-
             if response.status_code == 200:
                 if self.statistic: record_dict['record_info'] = '{}_request_success'.format(component_name)
-                if self.statistic: spider_record(self.db_client, record_dict, step, False)
-                """Record the response size, counted in bytes"""
-                if self.statistic: record_dict['record_info'] = '{}_request_content_length'.format(component_name)
-                content_length = eval(response.headers.get("Content-Length", "0"))
-                if self.statistic: spider_record(self.db_client, record_dict,
-                                                 content_length if content_length else len(response.content),
-                                                 False)
             else:
                 if self.statistic: record_dict['record_info'] = '{}_request_fail'.format(component_name)
-                if self.statistic: spider_record(self.db_client, record_dict, step, False)
+            if self.statistic: spider_record(self.db_client, record_dict, step, False)
             return response
         except Exception as e:
-            import traceback
-            print("Request failed, reason: {}".format(traceback.format_exc()))
+            print(e)
             if self.statistic:
                 record_dict['record_info'] = '{}_request_fail'.format(component_name)
                 spider_record(self.db_client, record_dict, step, False)
-            print('Request failed, failure count +1', record_dict)
+                print('failure count +1')
             return Response
-        finally:
-            if "proxies" in kwargs.keys():
-                """Mark the proxy as no longer in use"""
-                proxy_ip = kwargs.get("proxies").get("http")
-                if self.statistic: record_dict['proxy_info'] = {proxy_ip.split("//")[-1]: -1} if proxy_ip else {"": 1}
-                if self.statistic: spider_proxy_record(self.db_client, record_dict, False)
-
-    def aio_request(self, urls: list, record_dict: dict = dict, response_model: str = 'html', step: int = 1,
-                    cookies=None, **kwargs):
+
+    def aio_request(
+            self,
+            urls: List[str],
+            record_dict: Dict[str, Any] = None,
+            response_model: str = 'html',
+            step: int = 1,
+            cookies: Optional[Dict] = None,
+            proxy_url: Optional[str] = None,  # supports: http://, https://, socks4://, socks5://
+            **kwargs
+    ) -> List[Tuple[Any, str, Optional[ClientResponse]]]:
         """
-        Send requests asynchronously with aiohttp; the parameters are the target URL list, the counter dict, the response format, and the counter step
-        :return:
+        Send requests asynchronously, with adaptive support for HTTP/HTTPS and SOCKS4/SOCKS5 proxies.
+        - HTTP/HTTPS proxies: uses aiohttp's native proxy parameter
+        - SOCKS proxies: uses aiohttp-socks (must be installed)
        """
-        if self.statistic:
-            component_name = record_dict['component_name']
-
-        async def async_request(url, cookies=None, **kwargs):
-            if cookies:
-                clientSessionObj = ClientSession(connector=ProxyConnector(), request_class=ProxyClientRequest,
-                                                 cookies=cookies)
+        if record_dict is None:
+            record_dict = {}
+
+        async def async_request(url: str, cookies: Optional[Dict] = None, **kwargs):
+            # Determine the proxy type
+            use_socks = False
+            if proxy_url:
+                parsed = urlparse(proxy_url)
+                if parsed.scheme in ('socks4', 'socks5'):
+                    use_socks = True
+                elif parsed.scheme not in ('http', 'https'):
+                    raise ValueError(f"Unsupported proxy scheme: {parsed.scheme}. Use http, https, socks4, or socks5.")
+
+            try:
+                if use_socks:
+                    # Use a SOCKS proxy
+                    if not HAS_SOCKS:
+                        raise RuntimeError(
+                            "SOCKS proxy requires 'aiohttp-socks' package. Install with: pip install aiohttp-socks")
+                    connector = ProxyConnector.from_url(proxy_url)
+                    async with ClientSession(connector=connector, cookies=cookies) as session:
+                        async with session.get(url, **kwargs) as response:
+                            return await _process_response(response, url)
+                else:
+                    # Use an HTTP/HTTPS proxy, or no proxy at all
+                    async with ClientSession(cookies=cookies) as session:
+                        async with session.get(url, proxy=proxy_url, **kwargs) as response:
+                            return await _process_response(response, url)
+
+            except Exception as e:
+                if self.statistic and 'component_name' in record_dict:
+                    component_name = record_dict['component_name']
+                    record_dict['record_info'] = f'{component_name}_request_fail'
+                    spider_record(self.db_client, record_dict, step, False)
+                import traceback
+                print(f"Error fetching {url} via {proxy_url or 'direct'}: {e}\n{traceback.format_exc()}")
+                return '', url, None
+
+        async def _process_response(response: ClientResponse, url: str):
+            """Handle the response content uniformly"""
+            if self.statistic and 'component_name' in record_dict:
+                component_name = record_dict['component_name']
+                if response.status == 200:
+                    record_dict['record_info'] = f'{component_name}_request_success'
+                else:
+                    record_dict['record_info'] = f'{component_name}_request_fail'
+                spider_record(self.db_client, record_dict, step, False)
+
+            if response_model == 'json':
+                content = await response.json()
+            elif response_model == 'bytes':
+                content = await response.read()
+            elif response_model == 'html':
+                content = await response.text()
             else:
-                clientSessionObj = ClientSession(connector=ProxyConnector(), request_class=ProxyClientRequest)
-            async with clientSessionObj as session:
-                try:
-                    async with session.get(url, **kwargs) as response:
-                        if response.status == 200:
-                            if self.statistic: record_dict['record_info'] = '{}_request_success'.format(component_name)
-                        else:
-                            if self.statistic: record_dict['record_info'] = '{}_request_fail'.format(component_name)
-                        if self.statistic: spider_record(self.db_client, record_dict, step, False)
-                        if self.statistic: record_dict['record_info'] = '{}_request_content_length'.format(
-                            component_name)
-                        content_length = eval(response.headers.get("Content-Length", "0"))
-                        res_content = await response.read()
-                        if self.statistic: spider_record(self.db_client, record_dict,
-                                                         content_length if content_length else len(res_content),
-                                                         False)
-                        if response_model == 'json':
-                            return await response.json(), str(response.url), response
-                        elif response_model == 'bytes':
-                            return await response.read(), str(response.url), response
-                        elif response_model == 'html':
-                            return await response.text(), str(response.url), response
-                        else:
-                            return await response, str(response.url), response
-                except Exception as e:
-                    if self.statistic:
-                        record_dict['record_info'] = '{}_request_fail'.format(component_name)
-                        spider_record(self.db_client, record_dict, step, False)
-                    import traceback
-                    print(traceback.format_exc(), url)
-                    if self.statistic: record_dict['record_info'] = '{}_request_content_length'.format(component_name)
-                    content_length = 0
-                    res_content = ""
-                    if self.statistic: spider_record(self.db_client, record_dict,
-                                                     content_length if content_length else len(res_content),
-                                                     False)
-                    return '', url, Response
-
-        tasks = [asyncio.ensure_future(async_request(i, cookies=cookies, **kwargs)) for i in urls]
-        loop = asyncio.get_event_loop()
-        loop.run_until_complete(asyncio.wait(tasks))
-        del loop
-        return [i.result() for i in tasks]
+                content = response
+
+            return content, str(response.url), response
+
+        # Run all the tasks
+        tasks = [async_request(url, cookies=cookies, **kwargs) for url in urls]
+        results = asyncio.run(asyncio.gather(*tasks, return_exceptions=False))
+        return results


 class Parse(object):
     """
     Source-data parsing class; the intended usage is to subclass it and override analysis_data
     """
-
+
     def __init__(self, db_client=None, statistic=False, *, db_host: str = None, db_port: int = 6379,
                  db_cluster: bool = False,
                  db_password: str = None):
@@ -343,7 +320,7 @@ class Parse(object):
         else:
             redis_type = RedisCluster if db_cluster else Redis
             self.db_client = redis_type(host=db_host, port=db_port, password=db_password)
-
+
     # @time_counter
     def analysis_data(self, source_code, record_dict: dict = dict, step: int = 1, *args, **kwargs):
         """
@@ -359,7 +336,7 @@ class Parse(object):


 class Monitor(object):
-
+
     def __init__(self, rds=None, statistic=False, *, redis_host: str = None, redis_port: int = 6379,
                  redis_cluster: bool = False,
                  redis_password: str = None):
@@ -369,11 +346,11 @@ class Monitor(object):
         else:
             redis_type = RedisCluster if redis_cluster else Redis
             self.rds = redis_type(host=redis_host, port=redis_port, password=redis_password)
-
+
     def log(self):
         """Log statistics"""
         ...
-
+
     # Prefix the collected big keys with the date, then delete the collected big keys
     def statistics_count(self, summary_keys: dict, *, hour: str = None, minute: str = None, second: str = None):
         """
@@ -384,7 +361,7 @@ class Monitor(object):
         :param second:
         :return:
         """
-
+
         def summary_count(summary_keys: dict):
             today_date = get_today_str()
             for summary_key, big_keys in summary_keys.items():
@@ -394,7 +371,7 @@ class Monitor(object):
                 if today_date not in summary_key:
                     summary_key = today_date + '_' + summary_key
                 self.rds.hset(summary_key, k, v)
-
+
         scheduler = BlockingScheduler(timezone="Asia/Shanghai")
         scheduler.add_job(
             summary_count, 'cron',
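
One caveat worth flagging in both rewrites: the new dispatch line asyncio.run(asyncio.gather(*tasks, return_exceptions=False)) will not run as written, because asyncio.run() accepts only a coroutine while asyncio.gather() returns a Future (and gather() itself expects a running event loop). A minimal sketch of the conventional pattern, with a hypothetical stand-in for the diff's async_request helper:

import asyncio


async def _gather_all(coros):
    # gather() is created while the event loop is running, as asyncio expects
    return await asyncio.gather(*coros, return_exceptions=False)


async def fake_request(url):
    # Stand-in for async_request in the diff; returns the (content, url, response) shape
    return '', url, None


urls = ['https://example.com', 'https://example.org']
results = asyncio.run(_gather_all([fake_request(u) for u in urls]))
print(results)

Wrapping the gather in a coroutine keeps the single asyncio.run() entry point the new code is aiming for.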
{odtools-2.2.0 → odtools-2.2.1}/ODtools.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ODtools
-Version: 2.2.0
+Version: 2.2.1
 Summary: zkrTools
 Home-page: https://github.com/zkr-origin-data-dpt/ODtools
 Author: zkrPython
@@ -37,6 +37,8 @@ Requires-Dist: loguru
 Requires-Dist: colorlog
 Requires-Dist: aiohttp-socks
 Requires-Dist: apscheduler
+Requires-Dist: wheel
+Requires-Dist: twine
 Requires-Dist: PySocks
 Dynamic: author
 Dynamic: author-email
{odtools-2.2.0 → odtools-2.2.1}/ODtools.egg-info/requires.txt

@@ -9,4 +9,6 @@ loguru
 colorlog
 aiohttp-socks
 apscheduler
+wheel
+twine
 PySocks
{odtools-2.2.0 → odtools-2.2.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ODtools
-Version: 2.2.0
+Version: 2.2.1
 Summary: zkrTools
 Home-page: https://github.com/zkr-origin-data-dpt/ODtools
 Author: zkrPython
@@ -37,6 +37,8 @@ Requires-Dist: loguru
 Requires-Dist: colorlog
 Requires-Dist: aiohttp-socks
 Requires-Dist: apscheduler
+Requires-Dist: wheel
+Requires-Dist: twine
 Requires-Dist: PySocks
 Dynamic: author
 Dynamic: author-email
{odtools-2.2.0 → odtools-2.2.1}/setup.py

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages

 setup(
     name="ODtools",
-    version="2.2.0",
+    version="2.2.1",
     author="zkrPython",
     author_email="178031608@qq.com",
     description="zkrTools",
@@ -24,6 +24,8 @@ setup(
         "colorlog",
         "aiohttp-socks",
         "apscheduler",
+        "wheel",
+        "twine",
         "PySocks",  # added dependency: SOCKS proxy support for requests
     ],
     classifiers=[
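
Beyond the version bump, setup.py now pulls wheel and twine into install_requires, and the comment on PySocks points at its role: it is what lets the synchronous requests path in mrequest talk to SOCKS proxies. A brief illustration of that PySocks-backed usage (the proxy address is a placeholder, not a real endpoint):

import requests

# With PySocks installed, requests routes through a SOCKS proxy via the
# standard proxies mapping; 127.0.0.1:1080 is a placeholder address.
proxies = {
    'http': 'socks5://127.0.0.1:1080',
    'https': 'socks5://127.0.0.1:1080',
}
response = requests.get('https://example.com', proxies=proxies, timeout=10)
print(response.status_code)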