cobweb-launcher 1.3.14__tar.gz → 3.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cobweb-launcher-1.3.14/cobweb_launcher.egg-info → cobweb-launcher-3.1.0}/PKG-INFO +1 -1
- cobweb-launcher-3.1.0/cobweb/__init__.py +2 -0
- cobweb-launcher-3.1.0/cobweb/base/__init__.py +9 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/base/common_queue.py +0 -13
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/base/request.py +2 -14
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/base/seed.py +16 -12
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/constant.py +0 -16
- cobweb-launcher-3.1.0/cobweb/crawlers/crawler.py +28 -0
- cobweb-launcher-3.1.0/cobweb/db/redis_db.py +215 -0
- cobweb-launcher-3.1.0/cobweb/launchers/__init__.py +9 -0
- cobweb-launcher-3.1.0/cobweb/launchers/distributor.py +171 -0
- cobweb-launcher-3.1.0/cobweb/launchers/launcher.py +167 -0
- cobweb-launcher-3.1.0/cobweb/launchers/uploader.py +65 -0
- cobweb-launcher-3.1.0/cobweb/pipelines/pipeline.py +15 -0
- cobweb-launcher-3.1.0/cobweb/schedulers/__init__.py +1 -0
- cobweb-launcher-3.1.0/cobweb/schedulers/launcher_air.py +93 -0
- cobweb-launcher-3.1.0/cobweb/schedulers/launcher_api.py +225 -0
- cobweb-launcher-3.1.0/cobweb/schedulers/scheduler.py +85 -0
- cobweb-launcher-3.1.0/cobweb/schedulers/scheduler_with_redis.py +177 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/setting.py +15 -32
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/utils/__init__.py +2 -1
- cobweb-launcher-3.1.0/cobweb/utils/decorators.py +43 -0
- cobweb-launcher-3.1.0/cobweb/utils/dotting.py +55 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/utils/oss.py +28 -9
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0/cobweb_launcher.egg-info}/PKG-INFO +1 -1
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb_launcher.egg-info/SOURCES.txt +9 -9
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/setup.py +1 -1
- cobweb-launcher-1.3.14/cobweb/__init__.py +0 -2
- cobweb-launcher-1.3.14/cobweb/base/__init__.py +0 -154
- cobweb-launcher-1.3.14/cobweb/base/basic.py +0 -295
- cobweb-launcher-1.3.14/cobweb/base/dotting.py +0 -35
- cobweb-launcher-1.3.14/cobweb/crawlers/crawler.py +0 -110
- cobweb-launcher-1.3.14/cobweb/db/redis_db.py +0 -158
- cobweb-launcher-1.3.14/cobweb/launchers/__init__.py +0 -3
- cobweb-launcher-1.3.14/cobweb/launchers/launcher.py +0 -211
- cobweb-launcher-1.3.14/cobweb/launchers/launcher_air.py +0 -88
- cobweb-launcher-1.3.14/cobweb/launchers/launcher_api.py +0 -88
- cobweb-launcher-1.3.14/cobweb/launchers/launcher_pro.py +0 -88
- cobweb-launcher-1.3.14/cobweb/pipelines/pipeline.py +0 -48
- cobweb-launcher-1.3.14/cobweb/schedulers/__init__.py +0 -3
- cobweb-launcher-1.3.14/cobweb/schedulers/scheduler_api.py +0 -72
- cobweb-launcher-1.3.14/cobweb/schedulers/scheduler_redis.py +0 -72
- cobweb-launcher-1.3.14/test/test.py +0 -29
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/LICENSE +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/README.md +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/base/item.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/base/log.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/base/response.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/crawlers/__init__.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/db/api_db.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/pipelines/__init__.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/pipelines/pipeline_console.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/pipelines/pipeline_loghub.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/utils/bloom.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb/utils/tools.py +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-1.3.14 → cobweb-launcher-3.1.0}/setup.cfg +0 -0
@@ -1,4 +1,3 @@
|
|
1
|
-
import time
|
2
1
|
from collections import deque
|
3
2
|
|
4
3
|
|
@@ -29,15 +28,3 @@ class Queue:
|
|
29
28
|
return None
|
30
29
|
except AttributeError:
|
31
30
|
return None
|
32
|
-
|
33
|
-
def clear(self):
|
34
|
-
self._queue.clear()
|
35
|
-
|
36
|
-
def get(self):
|
37
|
-
try:
|
38
|
-
yield self._queue.popleft()
|
39
|
-
except IndexError:
|
40
|
-
time.sleep(1)
|
41
|
-
yield None
|
42
|
-
except AttributeError:
|
43
|
-
yield None
|
@@ -1,4 +1,3 @@
|
|
1
|
-
import json
|
2
1
|
import random
|
3
2
|
import requests
|
4
3
|
|
@@ -31,6 +30,7 @@ class Request:
|
|
31
30
|
**kwargs
|
32
31
|
):
|
33
32
|
self.url = url
|
33
|
+
self.seed = seed
|
34
34
|
self.check_status_code = check_status_code
|
35
35
|
self.request_setting = {}
|
36
36
|
|
@@ -46,12 +46,6 @@ class Request:
|
|
46
46
|
if random_ua:
|
47
47
|
self._build_header()
|
48
48
|
|
49
|
-
if isinstance(seed, Seed):
|
50
|
-
self.seed = seed.to_string
|
51
|
-
else:
|
52
|
-
kwargs.update(**seed.to_dict)
|
53
|
-
self.seed = self.to_string
|
54
|
-
|
55
49
|
@property
|
56
50
|
def _random_ua(self) -> str:
|
57
51
|
v1 = random.randint(4, 15)
|
@@ -79,16 +73,10 @@ class Request:
|
|
79
73
|
@property
|
80
74
|
def to_dict(self):
|
81
75
|
_dict = self.__dict__.copy()
|
76
|
+
_dict.pop('url')
|
82
77
|
_dict.pop('seed')
|
83
78
|
_dict.pop('check_status_code')
|
84
79
|
_dict.pop('request_setting')
|
85
80
|
return _dict
|
86
81
|
|
87
|
-
@property
|
88
|
-
def to_string(self) -> str:
|
89
|
-
return json.dumps(
|
90
|
-
self.to_dict,
|
91
|
-
ensure_ascii=False,
|
92
|
-
separators=(",", ":")
|
93
|
-
)
|
94
82
|
|
@@ -5,11 +5,13 @@ import hashlib
|
|
5
5
|
|
6
6
|
class SeedParams:
|
7
7
|
|
8
|
-
def __init__(self, retry, priority, seed_version, seed_status=None):
|
8
|
+
def __init__(self, retry, priority, seed_version, seed_status=None, proxy_type=None, proxy=None):
|
9
9
|
self.retry = retry or 0
|
10
10
|
self.priority = priority or 300
|
11
11
|
self.seed_version = seed_version or int(time.time())
|
12
12
|
self.seed_status = seed_status
|
13
|
+
self.proxy_type = proxy_type
|
14
|
+
self.proxy = proxy
|
13
15
|
|
14
16
|
|
15
17
|
class Seed:
|
@@ -18,7 +20,9 @@ class Seed:
|
|
18
20
|
"retry",
|
19
21
|
"priority",
|
20
22
|
"seed_version",
|
21
|
-
"seed_status"
|
23
|
+
"seed_status",
|
24
|
+
"proxy_type",
|
25
|
+
"proxy"
|
22
26
|
]
|
23
27
|
|
24
28
|
def __init__(
|
@@ -29,6 +33,8 @@ class Seed:
|
|
29
33
|
priority=None,
|
30
34
|
seed_version=None,
|
31
35
|
seed_status=None,
|
36
|
+
proxy_type=None,
|
37
|
+
proxy=None,
|
32
38
|
**kwargs
|
33
39
|
):
|
34
40
|
if any(isinstance(seed, t) for t in (str, bytes)):
|
@@ -51,6 +57,8 @@ class Seed:
|
|
51
57
|
"priority": priority,
|
52
58
|
"seed_version": seed_version,
|
53
59
|
"seed_status": seed_status,
|
60
|
+
"proxy_type": proxy_type,
|
61
|
+
"proxy": proxy
|
54
62
|
}
|
55
63
|
|
56
64
|
if kwargs:
|
@@ -104,15 +112,11 @@ class Seed:
|
|
104
112
|
separators=(",", ":")
|
105
113
|
)
|
106
114
|
|
107
|
-
# @property
|
108
|
-
# def get_all(self):
|
109
|
-
# return json.dumps(
|
110
|
-
# self.__dict__,
|
111
|
-
# ensure_ascii=False,
|
112
|
-
# separators=(",", ":")
|
113
|
-
# )
|
114
|
-
|
115
115
|
@property
|
116
|
-
def
|
117
|
-
return
|
116
|
+
def get_all(self):
|
117
|
+
return json.dumps(
|
118
|
+
self.__dict__,
|
119
|
+
ensure_ascii=False,
|
120
|
+
separators=(",", ":")
|
121
|
+
)
|
118
122
|
|
@@ -37,22 +37,6 @@ class LogTemplate:
|
|
37
37
|
----------------------- end - console pipeline ------------------
|
38
38
|
"""
|
39
39
|
|
40
|
-
launcher_polling = """
|
41
|
-
----------------------- start - 轮训日志: {task} -----------------
|
42
|
-
正在运行任务
|
43
|
-
构造请求任务数: {memory_todo_count}
|
44
|
-
正在下载任务数: {memory_download_count}
|
45
|
-
任务内存队列
|
46
|
-
待构造请求队列: {todo_queue_len}
|
47
|
-
待删除请求队列: {delete_queue_len}
|
48
|
-
待进行下载队列: {request_queue_len}
|
49
|
-
待解析响应队列: {response_queue_len}
|
50
|
-
待删除下载队列: {done_queue_len}
|
51
|
-
存储队列
|
52
|
-
待上传数据队列: {upload_queue_len}
|
53
|
-
----------------------- end - 轮训日志: {task} ------------------
|
54
|
-
"""
|
55
|
-
|
56
40
|
launcher_air_polling = """
|
57
41
|
----------------------- start - 轮训日志: {task} -----------------
|
58
42
|
内存队列
|
@@ -0,0 +1,28 @@
|
|
1
|
+
import json
|
2
|
+
from typing import Union
|
3
|
+
from cobweb.base import (
|
4
|
+
Seed,
|
5
|
+
BaseItem,
|
6
|
+
Request,
|
7
|
+
Response,
|
8
|
+
ConsoleItem,
|
9
|
+
)
|
10
|
+
|
11
|
+
|
12
|
+
class Crawler:
    """Default three-stage crawler: request -> download -> parse.

    Each stage is a static generator method; whatever a stage yields is
    handed to the caller that drives the generators.
    """

    @staticmethod
    def request(seed: Seed) -> Union[Request, BaseItem]:
        """Yield one ``Request`` for ``seed.url`` with a 5 second timeout."""
        target_url = seed.url
        yield Request(target_url, seed, timeout=5)

    @staticmethod
    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
        """Run the request and yield the raw result wrapped in a ``Response``.

        The request's ``to_dict`` entries are forwarded to ``Response`` as
        keyword arguments.
        """
        raw_response = item.download()
        origin_seed = item.seed
        extra_fields = item.to_dict
        yield Response(origin_seed, raw_response, **extra_fields)

    @staticmethod
    def parse(item: Response) -> BaseItem:
        """Yield a ``ConsoleItem`` carrying the response as a JSON string.

        The payload is ``item.to_dict`` plus the body text under ``"text"``.
        """
        payload = item.to_dict
        payload["text"] = item.response.text
        serialized = json.dumps(payload, ensure_ascii=False)
        yield ConsoleItem(item.seed, data=serialized)
|
28
|
+
|
@@ -0,0 +1,215 @@
|
|
1
|
+
import redis
|
2
|
+
import time
|
3
|
+
from cobweb import setting
|
4
|
+
from redis.exceptions import ConnectionError, TimeoutError
|
5
|
+
|
6
|
+
|
7
|
+
class RedisDB:
    """Wrapper around a ``redis.Redis`` client that retries on connection loss.

    Connection settings come from the keyword arguments when any are given,
    otherwise from ``setting.REDIS_CONFIG``; both must provide the keys
    ``host``, ``password``, ``port`` and ``db``.

    Plain commands go through :meth:`execute_command`, which pings the server,
    reconnects if needed and retries up to ``max_retries`` times with
    ``retry_delay`` seconds between attempts.
    """

    def __init__(self, **kwargs):
        redis_config = kwargs or setting.REDIS_CONFIG
        self.host = redis_config['host']
        self.password = redis_config['password']
        self.port = redis_config['port']
        self.db = redis_config['db']

        self.max_retries = 5  # attempts per connect / per command
        self.retry_delay = 5  # seconds slept between attempts
        self.client = None
        self.connect()

    def connect(self):
        """Create the client and verify it with ``PING``, retrying on failure.

        Raises:
            Exception: when every attempt failed; the last connection or
                timeout error is attached as ``__cause__``.
        """
        retries = 0
        while retries < self.max_retries:
            try:
                self.client = redis.Redis(
                    host=self.host,
                    port=self.port,
                    password=self.password,
                    db=self.db,
                    socket_timeout=5,  # socket read/write timeout (seconds)
                    socket_connect_timeout=5  # connect timeout (seconds)
                )
                # Verify that the connection actually works.
                self.client.ping()
                return
            except (ConnectionError, TimeoutError) as e:
                retries += 1
                if retries < self.max_retries:
                    time.sleep(self.retry_delay)
                else:
                    raise Exception("达到最大重试次数,无法连接 Redis") from e

    def is_connected(self):
        """Return True when the server answers ``PING``, False otherwise."""
        if self.client is None:
            # No client has been created yet; report "not connected" instead
            # of failing with AttributeError on ``None.ping``.
            return False
        try:
            self.client.ping()
            return True
        except (ConnectionError, TimeoutError):
            return False

    def reconnect(self):
        """Rebuild the client by running :meth:`connect` again."""
        self.connect()

    def execute_command(self, command, *args, **kwargs):
        """Call ``self.client.<command>(*args, **kwargs)`` with retries.

        Args:
            command: name of the client method to invoke.

        Returns:
            Whatever the client method returns.

        Raises:
            Exception: when every attempt hit a connection/timeout error (the
                last one is attached as ``__cause__``), or when
                :meth:`reconnect` itself gives up.
        """
        retries = 0
        while retries < self.max_retries:
            try:
                if not self.is_connected():
                    self.reconnect()
                return getattr(self.client, command)(*args, **kwargs)
            except (ConnectionError, TimeoutError) as e:
                retries += 1
                if retries < self.max_retries:
                    time.sleep(self.retry_delay)
                else:
                    raise Exception("达到最大重试次数,无法执行命令") from e

    def get(self, name):
        """Return the value stored at ``name``."""
        return self.execute_command("get", name)

    def incrby(self, name, value):
        """Increment ``name`` by ``value`` and return the client's reply."""
        return self.execute_command("incrby", name, value)

    def setnx(self, name, value=""):
        """Set ``name`` only if it does not exist; return the client's reply."""
        return self.execute_command("setnx", name, value)

    def setex(self, name, t, value=""):
        """Set ``name`` to ``value`` with expiry ``t``; return the reply."""
        return self.execute_command("setex", name, t, value)

    def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
        """Set an expiry of ``t`` on ``name``; return the client's reply.

        The ``nx``/``xx``/``gt``/``lt`` flags are forwarded positionally to
        the client's ``expire`` method.
        """
        return self.execute_command("expire", name, t, nx, xx, gt, lt)

    def ttl(self, name):
        """Return the remaining time to live of ``name``."""
        return self.execute_command("ttl", name)

    def delete(self, name):
        """Delete ``name`` and return the client's reply."""
        return self.execute_command("delete", name)

    def exists(self, *name) -> int:
        """Return the client's ``exists`` reply for the given key names."""
        return self.execute_command("exists", *name)

    def sadd(self, name, value):
        """Add ``value`` to the set ``name``."""
        return self.execute_command("sadd", name, value)

    def zcard(self, name) -> int:
        """Return the client's ``zcard`` reply for the sorted set ``name``."""
        return self.execute_command("zcard", name)

    def zadd(self, name, item: dict, **kwargs):
        """Add the ``item`` mapping to the sorted set ``name``."""
        return self.execute_command("zadd", name, item, **kwargs)

    def zrem(self, name, *value):
        """Remove the given members from the sorted set ``name``."""
        return self.execute_command("zrem", name, *value)

    def zcount(self, name, _min, _max):
        """Count members of ``name`` with a score between ``_min`` and ``_max``."""
        return self.execute_command("zcount", name, _min, _max)

    def lua(self, script: str, keys: list = None, args: list = None):
        """Run ``script`` through the client's ``eval`` with ``keys`` and ``args``."""
        keys = keys or []
        args = args or []
        keys_count = len(keys)
        return self.execute_command("eval", script, keys_count, *keys, *args)

    def lua_sha(self, sha1: str, keys: list = None, args: list = None):
        """Run a cached script through the client's ``evalsha``."""
        keys = keys or []
        args = args or []
        keys_count = len(keys)
        return self.execute_command("evalsha", sha1, keys_count, *keys, *args)

    def execute_lua(self, lua_script: str, keys: list, *args):
        """Register ``lua_script`` on the client and invoke it once.

        Only the ``register_script`` call goes through
        :meth:`execute_command`; the invocation of the returned script object
        happens outside the retry loop.
        """
        execute = self.execute_command("register_script", lua_script)
        return execute(keys=keys, args=args)

    def lock(self, key, t=15) -> bool:
        """Try to take a lock on ``key``.

        The script runs ``setnx key 1`` and, only when the key was created,
        sets its expiry to ``t``.

        Returns:
            The script's status converted to ``bool``.
        """
        lua_script = """
            local status = redis.call('setnx', KEYS[1], 1)
            if ( status == 1 ) then
                redis.call('expire', KEYS[1], ARGV[1])
            end
            return status
        """
        status = self.execute_lua(lua_script, [key], t)
        return bool(status)

    def members(self, key, score, start=0, count=1000, _min="-inf", _max="+inf") -> list:
        """Fetch members of sorted set ``key`` and rewrite their scores.

        For every member whose score lies in ``[_min, _max]`` the script
        derives ``originPriority`` from the current score
        (``ceil(s) * 1000 - s * 1000`` for negative scores, ``floor(s)``
        otherwise), computes a new score from ``score``:

        * ``score >= 1000``: ``-score - originPriority / 1000``
        * ``score == 0``: ``originPriority``
        * otherwise: ``score`` (and ``originPriority`` becomes ``score`` too)

        and writes it back with ``zadd``.

        Returns:
            A list of ``(member, origin_priority)`` tuples; members are
            decoded from bytes and priorities converted with ``int``.
        """
        # NOTE(review): `type(count) == string` compares against the Lua
        # `string` library table, not the literal 'string', so the first
        # branch never runs and the 'limit' query is always used. The script
        # is kept as-is to preserve behavior; confirm the intent before
        # changing it.
        lua_script = """
            local min = ARGV[1]
            local max = ARGV[2]
            local start = ARGV[3]
            local count = ARGV[4]
            local score = ARGV[5]
            local members = nil

            if ( type(count) == string ) then
                members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES')
            else
                members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)
            end

            local result = {}

            for i = 1, #members, 2 do
                local priority = nil
                local member = members[i]
                local originPriority = nil
                if ( members[i+1] + 0 < 0 ) then
                    originPriority = math.ceil(members[i+1]) * 1000 - members[i+1] * 1000
                else
                    originPriority = math.floor(members[i+1])
                end

                if ( score + 0 >= 1000 ) then
                    priority = -score - originPriority / 1000
                elseif ( score + 0 == 0 ) then
                    priority = originPriority
                else
                    originPriority = score
                    priority = score
                end
                redis.call('zadd', KEYS[1], priority, member)
                table.insert(result, member)
                table.insert(result, originPriority)
            end

            return result
        """
        members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
        # The reply is a flat [member, priority, member, priority, ...] list.
        return [(members[i].decode(), int(members[i + 1])) for i in range(0, len(members), 2)]

    def done(self, keys: list, *args) -> None:
        """Move members from a sorted set to a set.

        Every value in ``args`` is removed from the sorted set ``keys[0]``
        (``zrem``) and added to the set ``keys[1]`` (``sadd``).
        """
        lua_script = """
            for i, member in ipairs(ARGV) do
                redis.call("zrem", KEYS[1], member)
                redis.call("sadd", KEYS[2], member)
            end
        """
        self.execute_lua(lua_script, keys, *args)
|
@@ -0,0 +1,171 @@
|
|
1
|
+
import time
|
2
|
+
import threading
|
3
|
+
import traceback
|
4
|
+
from inspect import isgenerator
|
5
|
+
from typing import Callable
|
6
|
+
from urllib.parse import urlparse
|
7
|
+
from requests import Response as Res
|
8
|
+
|
9
|
+
from cobweb import setting
|
10
|
+
from cobweb.constant import DealModel, LogTemplate
|
11
|
+
from cobweb.base import (
|
12
|
+
Seed,
|
13
|
+
Queue,
|
14
|
+
BaseItem,
|
15
|
+
Request,
|
16
|
+
Response,
|
17
|
+
logger
|
18
|
+
)
|
19
|
+
from cobweb.utils import LoghubDot, check_pause
|
20
|
+
|
21
|
+
|
22
|
+
class Distributor(threading.Thread):
    """Thread that drives seeds through the crawler's generator stages.

    A seed popped from ``todo`` is passed to ``Crawler.request``; every value
    a stage yields is routed by :meth:`distribute` to the next stage or to
    one of the queues (``new``, ``todo``, ``done``, ``upload``).
    """

    def __init__(
            self,
            task: str,
            project: str,
            new: Queue,
            todo: Queue,
            done: Queue,
            upload: Queue,
            register: Callable,
            stop: threading.Event,
            pause: threading.Event,
            SpiderCrawler
    ):
        super().__init__()
        self.task = task
        self.project = project
        self.stop = stop
        self.pause = pause

        self.new = new
        self.todo = todo
        self.done = done
        self.upload = upload
        self.register = register

        self.time_sleep = setting.SPIDER_TIME_SLEEP
        self.thread_num = setting.SPIDER_THREAD_NUM
        self.max_retries = setting.SPIDER_MAX_RETRIES
        self.record_failed = setting.RECORD_FAILED_SPIDER
        self.loghub_dot = LoghubDot()  # TODO: decouple

        self.Crawler = SpiderCrawler

        logger.debug(f"Distribute instance attrs: {self.__dict__}")

    def distribute(self, item, seed, _id: int):
        """Route one value yielded by a crawler stage.

        Args:
            item: the yielded value (``Request``, ``Response``, ``BaseItem``,
                ``Seed`` or one of the ``DealModel`` strings).
            seed: the seed currently being processed.
            _id: stage that produced ``item``: 0 request, 1 download,
                2 parse, -1 failure reported by :meth:`spider`.

        Raises:
            TypeError: for an unsupported ``item`` type, or when the parse
                stage yields a ``Response``.
        """
        if isinstance(item, Request):
            seed.params.start_time = time.time()
            # Hand the yielded Request (not the seed) to the download stage.
            self.process(item=item, seed=seed, callback=self.Crawler.download, _id=1)
        elif isinstance(item, Response):
            if _id == 2:
                raise TypeError("parse function can't yield a Response instance")
            # Only real HTTP responses are reported to the dotting backend.
            dot = isinstance(item.response, Res)
            self.spider_logging(seed, item, dot=dot)
            # Hand the yielded Response (not the seed) to the parse stage.
            self.process(item=item, seed=seed, callback=self.Crawler.parse, _id=2)
        elif isinstance(item, BaseItem):
            self.upload.push(item)
        elif isinstance(item, Seed):
            self.new.push((seed, item), direct_insertion=True)
        elif isinstance(item, str) and item == DealModel.poll:
            self.todo.push(seed)
        elif isinstance(item, str) and item == DealModel.done:
            self.done.push(seed)
        elif isinstance(item, str) and item == DealModel.fail:
            seed.params.retry += 1
            if seed.params.retry < self.max_retries:
                self.todo.push(seed)
            else:
                if record_failed := self.record_failed:
                    try:
                        # Let the parse stage record the exhausted seed.
                        response = Response(seed, "failed", max_retries=True)
                        self.process(response, seed, self.Crawler.parse, _id=2)
                    except Exception:
                        # Recording failed; fall back to marking the seed done.
                        record_failed = False
                if not record_failed:
                    self.done.push(seed)
        else:
            raise TypeError("yield value type error!")

    def process(self, item, seed, callback, _id: int):
        """Run ``callback(item)`` and distribute everything it yields.

        Raises:
            TypeError: when ``callback`` does not return a generator.
        """
        result_iterators = callback(item)
        if not isgenerator(result_iterators):
            raise TypeError(f"{callback.__name__} function isn't a generator!")
        for result_item in result_iterators:
            self.distribute(result_item, seed, _id)

    @check_pause
    def spider(self):
        """Pop one seed from ``todo`` and run it through the pipeline.

        Any exception is logged via :meth:`spider_logging` and the seed is
        then routed through the ``DealModel.fail`` branch.
        """
        if seed := self.todo.pop():
            try:
                self.process(item=seed, seed=seed, callback=self.Crawler.request, _id=0)
            except Exception as e:
                url, status = seed.url, e.__class__.__name__
                msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
                # Test the type, not truthiness: a requests Response is falsy
                # for 4xx/5xx status codes, which are the cases of interest.
                response = getattr(e, "response", None)
                if isinstance(response, Res):
                    url = response.request.url
                    status = response.status_code
                self.spider_logging(seed, None, error=True, url=url, status=status, msg=msg)
                self.distribute(DealModel.fail, seed, _id=-1)

    def spider_logging(
            self, seed,
            item: Response = None,
            error: bool = False,
            dot: bool = True,
            **kwargs
    ):
        """Log a download result and, when ``dot`` is set, report it.

        Args:
            seed: seed the log entry belongs to.
            item: the ``Response`` on success; unused when ``error`` is True.
            error: log a download exception instead of a success entry.
            dot: also send a record through ``self.loghub_dot.build``.
            **kwargs: ``url``, ``msg`` and ``status`` for the error case.
        """
        detail_log_info = LogTemplate.log_info(seed.to_dict)
        if error:
            url = kwargs.get("url")
            msg = kwargs.get("msg")
            status = kwargs.get("status")
            if dot:
                self.loghub_dot.build(
                    topic=urlparse(url).netloc,
                    data_size=-1, cost_time=-1,
                    status=status, url=url,
                    seed=seed.to_string,
                    proxy_type=seed.params.proxy_type,
                    proxy=seed.params.proxy,
                    project=self.project,
                    task=self.task, msg=msg,
                )
            logger.info(LogTemplate.download_exception.format(
                detail=detail_log_info,
                retry=seed.params.retry,
                priority=seed.params.priority,
                seed_version=seed.params.seed_version,
                identifier=seed.identifier or "",
                exception=msg
            ))
        else:
            logger.info(LogTemplate.download_info.format(
                detail=detail_log_info,
                retry=seed.params.retry,
                priority=seed.params.priority,
                seed_version=seed.params.seed_version,
                identifier=seed.identifier or "",
                status=item.response,
                response=LogTemplate.log_info(item.to_dict)
            ))
            if dot:
                end_time = time.time()
                # start_time is only assigned when a Request was distributed,
                # so tolerate seeds that never went through that branch.
                stime = getattr(seed.params, "start_time", None)
                cost_time = end_time - stime if stime else -1
                topic = urlparse(item.response.request.url).netloc
                data_size = int(item.response.headers.get("content-length", 0))
                self.loghub_dot.build(
                    topic=topic, data_size=data_size, cost_time=cost_time,
                    status=200, seed=seed.to_string, url=item.response.url,
                    proxy=seed.params.proxy, proxy_type=seed.params.proxy_type,
                    project=self.project, task=self.task,
                )

    def run(self):
        """Register the dotting worker and ``thread_num`` spider workers."""
        self.register(self.loghub_dot.build_run, tag="LoghubDot")
        for _ in range(self.thread_num):
            self.register(self.spider, tag="Distributor")
|