cobweb-launcher 1.3.15__tar.gz → 3.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {cobweb-launcher-1.3.15/cobweb_launcher.egg-info → cobweb-launcher-3.1.1}/PKG-INFO +1 -1
  2. cobweb-launcher-3.1.1/cobweb/__init__.py +2 -0
  3. cobweb-launcher-3.1.1/cobweb/base/__init__.py +9 -0
  4. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/base/common_queue.py +0 -13
  5. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/base/request.py +2 -14
  6. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/base/seed.py +16 -12
  7. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/constant.py +0 -16
  8. cobweb-launcher-3.1.1/cobweb/crawlers/crawler.py +28 -0
  9. cobweb-launcher-3.1.1/cobweb/db/redis_db.py +215 -0
  10. cobweb-launcher-3.1.1/cobweb/launchers/__init__.py +9 -0
  11. cobweb-launcher-3.1.1/cobweb/launchers/distributor.py +171 -0
  12. cobweb-launcher-3.1.1/cobweb/launchers/launcher.py +167 -0
  13. cobweb-launcher-3.1.1/cobweb/launchers/uploader.py +65 -0
  14. cobweb-launcher-3.1.1/cobweb/pipelines/pipeline.py +15 -0
  15. cobweb-launcher-3.1.1/cobweb/schedulers/__init__.py +1 -0
  16. cobweb-launcher-3.1.1/cobweb/schedulers/launcher_air.py +93 -0
  17. cobweb-launcher-3.1.1/cobweb/schedulers/launcher_api.py +225 -0
  18. cobweb-launcher-3.1.1/cobweb/schedulers/scheduler.py +85 -0
  19. cobweb-launcher-3.1.1/cobweb/schedulers/scheduler_with_redis.py +177 -0
  20. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/setting.py +15 -32
  21. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/utils/__init__.py +2 -1
  22. cobweb-launcher-3.1.1/cobweb/utils/decorators.py +43 -0
  23. cobweb-launcher-3.1.1/cobweb/utils/dotting.py +55 -0
  24. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/utils/oss.py +28 -9
  25. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1/cobweb_launcher.egg-info}/PKG-INFO +1 -1
  26. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb_launcher.egg-info/SOURCES.txt +9 -9
  27. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/setup.py +1 -1
  28. cobweb-launcher-1.3.15/cobweb/__init__.py +0 -2
  29. cobweb-launcher-1.3.15/cobweb/base/__init__.py +0 -154
  30. cobweb-launcher-1.3.15/cobweb/base/basic.py +0 -297
  31. cobweb-launcher-1.3.15/cobweb/base/dotting.py +0 -35
  32. cobweb-launcher-1.3.15/cobweb/crawlers/crawler.py +0 -110
  33. cobweb-launcher-1.3.15/cobweb/db/redis_db.py +0 -158
  34. cobweb-launcher-1.3.15/cobweb/launchers/__init__.py +0 -3
  35. cobweb-launcher-1.3.15/cobweb/launchers/launcher.py +0 -211
  36. cobweb-launcher-1.3.15/cobweb/launchers/launcher_air.py +0 -88
  37. cobweb-launcher-1.3.15/cobweb/launchers/launcher_api.py +0 -89
  38. cobweb-launcher-1.3.15/cobweb/launchers/launcher_pro.py +0 -88
  39. cobweb-launcher-1.3.15/cobweb/pipelines/pipeline.py +0 -48
  40. cobweb-launcher-1.3.15/cobweb/schedulers/__init__.py +0 -3
  41. cobweb-launcher-1.3.15/cobweb/schedulers/scheduler_api.py +0 -72
  42. cobweb-launcher-1.3.15/cobweb/schedulers/scheduler_redis.py +0 -72
  43. cobweb-launcher-1.3.15/test/test.py +0 -29
  44. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/LICENSE +0 -0
  45. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/README.md +0 -0
  46. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/base/item.py +0 -0
  47. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/base/log.py +0 -0
  48. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/base/response.py +0 -0
  49. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/crawlers/__init__.py +0 -0
  50. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/db/__init__.py +0 -0
  51. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/db/api_db.py +0 -0
  52. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/exceptions/__init__.py +0 -0
  53. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/exceptions/oss_db_exception.py +0 -0
  54. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/pipelines/__init__.py +0 -0
  55. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/pipelines/pipeline_console.py +0 -0
  56. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/pipelines/pipeline_loghub.py +0 -0
  57. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/utils/bloom.py +0 -0
  58. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb/utils/tools.py +0 -0
  59. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  60. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb_launcher.egg-info/requires.txt +0 -0
  61. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/cobweb_launcher.egg-info/top_level.txt +0 -0
  62. {cobweb-launcher-1.3.15 → cobweb-launcher-3.1.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.3.15
3
+ Version: 3.1.1
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -0,0 +1,2 @@
1
+ from .launchers import Launcher
2
+ from .constant import CrawlerModel
@@ -0,0 +1,9 @@
1
+ from .common_queue import Queue
2
+ from .response import Response
3
+ from .request import Request
4
+ from .item import BaseItem, ConsoleItem
5
+ from .seed import Seed
6
+
7
+ from .log import logger
8
+ # from .decorators import decorator_oss_db
9
+
@@ -1,4 +1,3 @@
1
- import time
2
1
  from collections import deque
3
2
 
4
3
 
@@ -29,15 +28,3 @@ class Queue:
29
28
  return None
30
29
  except AttributeError:
31
30
  return None
32
-
33
- def clear(self):
34
- self._queue.clear()
35
-
36
- def get(self):
37
- try:
38
- yield self._queue.popleft()
39
- except IndexError:
40
- time.sleep(1)
41
- yield None
42
- except AttributeError:
43
- yield None
@@ -1,4 +1,3 @@
1
- import json
2
1
  import random
3
2
  import requests
4
3
 
@@ -31,6 +30,7 @@ class Request:
31
30
  **kwargs
32
31
  ):
33
32
  self.url = url
33
+ self.seed = seed
34
34
  self.check_status_code = check_status_code
35
35
  self.request_setting = {}
36
36
 
@@ -46,12 +46,6 @@ class Request:
46
46
  if random_ua:
47
47
  self._build_header()
48
48
 
49
- if isinstance(seed, Seed):
50
- self.seed = seed.to_string
51
- else:
52
- kwargs.update(**seed.to_dict)
53
- self.seed = self.to_string
54
-
55
49
  @property
56
50
  def _random_ua(self) -> str:
57
51
  v1 = random.randint(4, 15)
@@ -79,16 +73,10 @@ class Request:
79
73
  @property
80
74
  def to_dict(self):
81
75
  _dict = self.__dict__.copy()
76
+ _dict.pop('url')
82
77
  _dict.pop('seed')
83
78
  _dict.pop('check_status_code')
84
79
  _dict.pop('request_setting')
85
80
  return _dict
86
81
 
87
- @property
88
- def to_string(self) -> str:
89
- return json.dumps(
90
- self.to_dict,
91
- ensure_ascii=False,
92
- separators=(",", ":")
93
- )
94
82
 
@@ -5,11 +5,13 @@ import hashlib
5
5
 
6
6
  class SeedParams:
7
7
 
8
- def __init__(self, retry, priority, seed_version, seed_status=None):
8
+ def __init__(self, retry, priority, seed_version, seed_status=None, proxy_type=None, proxy=None):
9
9
  self.retry = retry or 0
10
10
  self.priority = priority or 300
11
11
  self.seed_version = seed_version or int(time.time())
12
12
  self.seed_status = seed_status
13
+ self.proxy_type = proxy_type
14
+ self.proxy = proxy
13
15
 
14
16
 
15
17
  class Seed:
@@ -18,7 +20,9 @@ class Seed:
18
20
  "retry",
19
21
  "priority",
20
22
  "seed_version",
21
- "seed_status"
23
+ "seed_status",
24
+ "proxy_type",
25
+ "proxy"
22
26
  ]
23
27
 
24
28
  def __init__(
@@ -29,6 +33,8 @@ class Seed:
29
33
  priority=None,
30
34
  seed_version=None,
31
35
  seed_status=None,
36
+ proxy_type=None,
37
+ proxy=None,
32
38
  **kwargs
33
39
  ):
34
40
  if any(isinstance(seed, t) for t in (str, bytes)):
@@ -51,6 +57,8 @@ class Seed:
51
57
  "priority": priority,
52
58
  "seed_version": seed_version,
53
59
  "seed_status": seed_status,
60
+ "proxy_type": proxy_type,
61
+ "proxy": proxy
54
62
  }
55
63
 
56
64
  if kwargs:
@@ -104,15 +112,11 @@ class Seed:
104
112
  separators=(",", ":")
105
113
  )
106
114
 
107
- # @property
108
- # def get_all(self):
109
- # return json.dumps(
110
- # self.__dict__,
111
- # ensure_ascii=False,
112
- # separators=(",", ":")
113
- # )
114
-
115
115
  @property
116
- def seed(self):
117
- return self.to_string
116
+ def get_all(self):
117
+ return json.dumps(
118
+ self.__dict__,
119
+ ensure_ascii=False,
120
+ separators=(",", ":")
121
+ )
118
122
 
@@ -37,22 +37,6 @@ class LogTemplate:
37
37
  ----------------------- end - console pipeline ------------------
38
38
  """
39
39
 
40
- launcher_polling = """
41
- ----------------------- start - 轮训日志: {task} -----------------
42
- 正在运行任务
43
- 构造请求任务数: {memory_todo_count}
44
- 正在下载任务数: {memory_download_count}
45
- 任务内存队列
46
- 待构造请求队列: {todo_queue_len}
47
- 待删除请求队列: {delete_queue_len}
48
- 待进行下载队列: {request_queue_len}
49
- 待解析响应队列: {response_queue_len}
50
- 待删除下载队列: {done_queue_len}
51
- 存储队列
52
- 待上传数据队列: {upload_queue_len}
53
- ----------------------- end - 轮训日志: {task} ------------------
54
- """
55
-
56
40
  launcher_air_polling = """
57
41
  ----------------------- start - 轮训日志: {task} -----------------
58
42
  内存队列
@@ -0,0 +1,28 @@
1
+ import json
2
+ from typing import Union
3
+ from cobweb.base import (
4
+ Seed,
5
+ BaseItem,
6
+ Request,
7
+ Response,
8
+ ConsoleItem,
9
+ )
10
+
11
+
12
+ class Crawler:
13
+
14
+ @staticmethod
15
+ def request(seed: Seed) -> Union[Request, BaseItem]:
16
+ yield Request(seed.url, seed, timeout=5)
17
+
18
+ @staticmethod
19
+ def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
20
+ response = item.download()
21
+ yield Response(item.seed, response, **item.to_dict)
22
+
23
+ @staticmethod
24
+ def parse(item: Response) -> BaseItem:
25
+ upload_item = item.to_dict
26
+ upload_item["text"] = item.response.text
27
+ yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
28
+
@@ -0,0 +1,215 @@
1
+ import redis
2
+ import time
3
+ from cobweb import setting
4
+ from redis.exceptions import ConnectionError, TimeoutError
5
+
6
+
7
+ class RedisDB:
8
+ def __init__(self, **kwargs):
9
+ redis_config = kwargs or setting.REDIS_CONFIG
10
+ self.host = redis_config['host']
11
+ self.password = redis_config['password']
12
+ self.port = redis_config['port']
13
+ self.db = redis_config['db']
14
+
15
+ self.max_retries = 5
16
+ self.retry_delay = 5
17
+ self.client = None
18
+ self.connect()
19
+
20
+ def connect(self):
21
+ """尝试连接 Redis"""
22
+ retries = 0
23
+ while retries < self.max_retries:
24
+ try:
25
+ self.client = redis.Redis(
26
+ host=self.host,
27
+ port=self.port,
28
+ password=self.password,
29
+ db=self.db,
30
+ socket_timeout=5, # 设置连接超时时间
31
+ socket_connect_timeout=5 # 设置连接超时时间
32
+ )
33
+ # 测试连接是否成功
34
+ self.client.ping()
35
+ return
36
+ except (ConnectionError, TimeoutError) as e:
37
+ retries += 1
38
+ if retries < self.max_retries:
39
+ time.sleep(self.retry_delay)
40
+ else:
41
+ raise Exception("达到最大重试次数,无法连接 Redis")
42
+
43
+ def is_connected(self):
44
+ try:
45
+ self.client.ping()
46
+ return True
47
+ except (ConnectionError, TimeoutError):
48
+ return False
49
+
50
+ def reconnect(self):
51
+ self.connect()
52
+
53
+ def execute_command(self, command, *args, **kwargs):
54
+ retries = 0
55
+ while retries < self.max_retries:
56
+ try:
57
+ if not self.is_connected():
58
+ self.reconnect()
59
+ return getattr(self.client, command)(*args, **kwargs)
60
+ except (ConnectionError, TimeoutError) as e:
61
+ retries += 1
62
+ if retries < self.max_retries:
63
+ time.sleep(self.retry_delay)
64
+ else:
65
+ raise Exception("达到最大重试次数,无法执行命令")
66
+
67
+ def get(self, name):
68
+ # with self.get_connection() as client:
69
+ # return client.get(name)
70
+ return self.execute_command("get", name)
71
+
72
+ def incrby(self, name, value):
73
+ # with self.get_connection() as client:
74
+ # client.incrby(name, value)
75
+ self.execute_command("incrby", name, value)
76
+
77
+ def setnx(self, name, value=""):
78
+ # with self.get_connection() as client:
79
+ # client.setnx(name, value)
80
+ self.execute_command("setnx", name, value)
81
+
82
+ def setex(self, name, t, value=""):
83
+ # with self.get_connection() as client:
84
+ # client.setex(name, t, value)
85
+ self.execute_command("setex", name, t, value)
86
+
87
+ def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
88
+ # with self.get_connection() as client:
89
+ # client.expire(name, t, nx, xx, gt, lt)
90
+ self.execute_command("expire", name, t, nx, xx, gt, lt)
91
+
92
+ def ttl(self, name):
93
+ # with self.get_connection() as client:
94
+ # return client.ttl(name)
95
+ return self.execute_command("ttl", name)
96
+
97
+ def delete(self, name):
98
+ # with self.get_connection() as client:
99
+ # return client.delete(name)
100
+ return self.execute_command("delete", name)
101
+
102
+ def exists(self, *name) -> bool:
103
+ # with self.get_connection() as client:
104
+ # return client.exists(*name)
105
+ return self.execute_command("exists", *name)
106
+
107
+ def sadd(self, name, value):
108
+ # with self.get_connection() as client:
109
+ # return client.sadd(name, value)
110
+ return self.execute_command("sadd", name, value)
111
+
112
+ def zcard(self, name) -> bool:
113
+ # with self.get_connection() as client:
114
+ # return client.zcard(name)
115
+ return self.execute_command("zcard", name)
116
+
117
+ def zadd(self, name, item: dict, **kwargs):
118
+ # with self.get_connection() as client:
119
+ # return client.zadd(name, item, **kwargs)
120
+ return self.execute_command("zadd", name, item, **kwargs)
121
+
122
+ def zrem(self, name, *value):
123
+ # with self.get_connection() as client:
124
+ # return client.zrem(name, *value)
125
+ return self.execute_command("zrem", name, *value)
126
+
127
+ def zcount(self, name, _min, _max):
128
+ # with self.get_connection() as client:
129
+ # return client.zcount(name, _min, _max)
130
+ return self.execute_command("zcount", name, _min, _max)
131
+
132
+ # def zrangebyscore(self, name, _min, _max, start, num, withscores: bool = False, *args):
133
+ # with self.get_connection() as client:
134
+ # return client.zrangebyscore(name, _min, _max, start, num, withscores, *args)
135
+
136
+ def lua(self, script: str, keys: list = None, args: list = None):
137
+ keys = keys or []
138
+ args = args or []
139
+ keys_count = len(keys)
140
+ return self.execute_command("eval", script, keys_count, *keys, *args)
141
+
142
+ def lua_sha(self, sha1: str, keys: list = None, args: list = None):
143
+ keys = keys or []
144
+ args = args or []
145
+ keys_count = len(keys)
146
+ return self.execute_command("evalsha", sha1, keys_count, *keys, *args)
147
+
148
+ def execute_lua(self, lua_script: str, keys: list, *args):
149
+ execute = self.execute_command("register_script", lua_script)
150
+ return execute(keys=keys, args=args)
151
+
152
+ def lock(self, key, t=15) -> bool:
153
+ lua_script = """
154
+ local status = redis.call('setnx', KEYS[1], 1)
155
+ if ( status == 1 ) then
156
+ redis.call('expire', KEYS[1], ARGV[1])
157
+ end
158
+ return status
159
+ """
160
+ status = self.execute_lua(lua_script, [key], t)
161
+ return bool(status)
162
+
163
+ def members(self, key, score, start=0, count=1000, _min="-inf", _max="+inf") -> list:
164
+ lua_script = """
165
+ local min = ARGV[1]
166
+ local max = ARGV[2]
167
+ local start = ARGV[3]
168
+ local count = ARGV[4]
169
+ local score = ARGV[5]
170
+ local members = nil
171
+
172
+ if ( type(count) == string ) then
173
+ members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES')
174
+ else
175
+ members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)
176
+ end
177
+
178
+ local result = {}
179
+
180
+ for i = 1, #members, 2 do
181
+ local priority = nil
182
+ local member = members[i]
183
+ local originPriority = nil
184
+ if ( members[i+1] + 0 < 0 ) then
185
+ originPriority = math.ceil(members[i+1]) * 1000 - members[i+1] * 1000
186
+ else
187
+ originPriority = math.floor(members[i+1])
188
+ end
189
+
190
+ if ( score + 0 >= 1000 ) then
191
+ priority = -score - originPriority / 1000
192
+ elseif ( score + 0 == 0 ) then
193
+ priority = originPriority
194
+ else
195
+ originPriority = score
196
+ priority = score
197
+ end
198
+ redis.call('zadd', KEYS[1], priority, member)
199
+ table.insert(result, member)
200
+ table.insert(result, originPriority)
201
+ end
202
+
203
+ return result
204
+ """
205
+ members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
206
+ return [(members[i].decode(), int(members[i + 1])) for i in range(0, len(members), 2)]
207
+
208
+ def done(self, keys: list, *args) -> list:
209
+ lua_script = """
210
+ for i, member in ipairs(ARGV) do
211
+ redis.call("zrem", KEYS[1], member)
212
+ redis.call("sadd", KEYS[2], member)
213
+ end
214
+ """
215
+ self.execute_lua(lua_script, keys, *args)
@@ -0,0 +1,9 @@
1
+ # from .launcher_air import LauncherAir
2
+ # from .launcher_pro import LauncherPro
3
+ # from .launcher_api import LauncherApi
4
+
5
+
6
+ from .launcher import Launcher
7
+ from .uploader import Uploader
8
+ from .distributor import Distributor
9
+
@@ -0,0 +1,171 @@
1
+ import time
2
+ import threading
3
+ import traceback
4
+ from inspect import isgenerator
5
+ from typing import Callable
6
+ from urllib.parse import urlparse
7
+ from requests import Response as Res
8
+
9
+ from cobweb import setting
10
+ from cobweb.constant import DealModel, LogTemplate
11
+ from cobweb.base import (
12
+ Seed,
13
+ Queue,
14
+ BaseItem,
15
+ Request,
16
+ Response,
17
+ logger
18
+ )
19
+ from cobweb.utils import LoghubDot, check_pause
20
+
21
+
22
+ class Distributor(threading.Thread):
23
+
24
+ def __init__(
25
+ self,
26
+ task: str,
27
+ project: str,
28
+ new: Queue,
29
+ todo: Queue,
30
+ done: Queue,
31
+ upload: Queue,
32
+ register: Callable,
33
+ stop: threading.Event,
34
+ pause: threading.Event,
35
+ SpiderCrawler
36
+ ):
37
+ super().__init__()
38
+ self.task = task
39
+ self.project = project
40
+ self.stop = stop
41
+ self.pause = pause
42
+
43
+ self.new = new
44
+ self.todo = todo
45
+ self.done = done
46
+ self.upload = upload
47
+ self.register = register
48
+
49
+ self.time_sleep = setting.SPIDER_TIME_SLEEP
50
+ self.thread_num = setting.SPIDER_THREAD_NUM
51
+ self.max_retries = setting.SPIDER_MAX_RETRIES
52
+ self.record_failed = setting.RECORD_FAILED_SPIDER
53
+ self.loghub_dot = LoghubDot() # todo: 解偶
54
+
55
+ self.Crawler = SpiderCrawler
56
+
57
+ logger.debug(f"Distribute instance attrs: {self.__dict__}")
58
+
59
+ def distribute(self, item, seed, _id: int):
60
+ if isinstance(item, Request):
61
+ seed.params.start_time = time.time()
62
+ self.process(item=seed, seed=seed, callback=self.Crawler.download, _id=1)
63
+ elif isinstance(item, Response):
64
+ if _id == 2:
65
+ raise TypeError("parse function can't yield a Response instance")
66
+ dot = isinstance(item.response, Res)
67
+ self.spider_logging(seed, item, dot=dot)
68
+ self.process(item=seed, seed=seed, callback=self.Crawler.parse, _id=2)
69
+ elif isinstance(item, BaseItem):
70
+ self.upload.push(item)
71
+ elif isinstance(item, Seed):
72
+ self.new.push((seed, item), direct_insertion=True)
73
+ elif isinstance(item, str) and item == DealModel.poll:
74
+ self.todo.push(seed)
75
+ elif isinstance(item, str) and item == DealModel.done:
76
+ self.done.push(seed)
77
+ elif isinstance(item, str) and item == DealModel.fail:
78
+ seed.params.retry += 1
79
+ if seed.params.retry < self.max_retries:
80
+ self.todo.push(seed)
81
+ else:
82
+ if record_failed := self.record_failed:
83
+ try:
84
+ response = Response(seed, "failed", max_retries=True)
85
+ self.process(response, seed, self.Crawler.parse, _id=2)
86
+ except:
87
+ record_failed = False
88
+ if not record_failed:
89
+ self.done.push(seed)
90
+ else:
91
+ raise TypeError("yield value type error!")
92
+
93
+ def process(self, item, seed, callback, _id: int):
94
+ result_iterators = callback(item)
95
+ if not isgenerator(result_iterators):
96
+ raise TypeError(f"{callback.__name__} function isn't a generator!")
97
+ for result_item in result_iterators:
98
+ self.distribute(result_item, seed, _id)
99
+
100
+ @check_pause
101
+ def spider(self):
102
+ if seed := self.todo.pop():
103
+ try:
104
+ self.process(item=seed, seed=seed, callback=self.Crawler.request, _id=0)
105
+ except Exception as e:
106
+ url, status = seed.url, e.__class__.__name__
107
+ msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
108
+ if getattr(e, "response", None) and isinstance(e.response, Res):
109
+ url = e.response.request.url
110
+ status = e.response.status_code
111
+ self.spider_logging(seed, None, error=True, url=url, status=status, msg=msg)
112
+ self.distribute(DealModel.fail, seed, _id=-1)
113
+
114
+ def spider_logging(
115
+ self, seed,
116
+ item: Response = None,
117
+ error: bool = False,
118
+ dot: bool = True,
119
+ **kwargs
120
+ ):
121
+ detail_log_info = LogTemplate.log_info(seed.to_dict)
122
+ if error:
123
+ url = kwargs.get("url")
124
+ msg = kwargs.get("msg")
125
+ status = kwargs.get("status")
126
+ if dot:
127
+ self.loghub_dot.build(
128
+ topic=urlparse(url).netloc,
129
+ data_size=-1, cost_time=-1,
130
+ status=status, url=url,
131
+ seed=seed.to_string,
132
+ proxy_type=seed.params.proxy_type,
133
+ proxy=seed.params.proxy,
134
+ project=self.project,
135
+ task=self.task, msg=msg,
136
+ )
137
+ logger.info(LogTemplate.download_exception.format(
138
+ detail=detail_log_info,
139
+ retry=seed.params.retry,
140
+ priority=seed.params.priority,
141
+ seed_version=seed.params.seed_version,
142
+ identifier=seed.identifier or "",
143
+ exception=msg
144
+ ))
145
+ else:
146
+ logger.info(LogTemplate.download_info.format(
147
+ detail=detail_log_info,
148
+ retry=seed.params.retry,
149
+ priority=seed.params.priority,
150
+ seed_version=seed.params.seed_version,
151
+ identifier=seed.identifier or "",
152
+ status=item.response,
153
+ response=LogTemplate.log_info(item.to_dict)
154
+ ))
155
+ if dot:
156
+ end_time = time.time()
157
+ stime = seed.params.start_time
158
+ cost_time = end_time - stime if stime else -1
159
+ topic = urlparse(item.response.request.url).netloc
160
+ data_size = int(item.response.headers.get("content-length", 0))
161
+ self.loghub_dot.build(
162
+ topic=topic, data_size=data_size, cost_time=cost_time,
163
+ status=200, seed=seed.to_string, url=item.response.url,
164
+ proxy=seed.params.proxy, proxy_type=seed.params.proxy_type,
165
+ project=self.project, task=self.task,
166
+ )
167
+
168
+ def run(self):
169
+ self.register(self.loghub_dot.build_run, tag="LoghubDot")
170
+ for _ in range(self.thread_num):
171
+ self.register(self.spider, tag="Distributor")