cobweb-launcher 1.3.5__py3-none-any.whl → 1.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cobweb_launcher-1.3.5.dist-info → cobweb_launcher-1.3.7.dist-info}/METADATA +1 -1
- cobweb_launcher-1.3.7.dist-info/RECORD +40 -0
- cobweb_launcher-1.3.7.dist-info/top_level.txt +1 -0
- cobweb/base/decorators.py +0 -40
- cobweb/crawlers/base_crawler.py +0 -144
- cobweb/crawlers/file_crawler.py +0 -98
- cobweb/pipelines/base_pipeline.py +0 -54
- cobweb/pipelines/loghub_pipeline.py +0 -34
- cobweb/utils/dotting.py +0 -32
- cobweb_/__init__.py +0 -2
- cobweb_/base/__init__.py +0 -9
- cobweb_/base/common_queue.py +0 -30
- cobweb_/base/decorators.py +0 -40
- cobweb_/base/item.py +0 -46
- cobweb_/base/log.py +0 -94
- cobweb_/base/request.py +0 -82
- cobweb_/base/response.py +0 -23
- cobweb_/base/seed.py +0 -114
- cobweb_/constant.py +0 -94
- cobweb_/crawlers/__init__.py +0 -1
- cobweb_/crawlers/crawler.py +0 -184
- cobweb_/db/__init__.py +0 -2
- cobweb_/db/api_db.py +0 -82
- cobweb_/db/redis_db.py +0 -130
- cobweb_/exceptions/__init__.py +0 -1
- cobweb_/exceptions/oss_db_exception.py +0 -28
- cobweb_/launchers/__init__.py +0 -3
- cobweb_/launchers/launcher.py +0 -235
- cobweb_/launchers/launcher_air.py +0 -88
- cobweb_/launchers/launcher_api.py +0 -221
- cobweb_/launchers/launcher_pro.py +0 -222
- cobweb_/pipelines/__init__.py +0 -3
- cobweb_/pipelines/pipeline.py +0 -69
- cobweb_/pipelines/pipeline_console.py +0 -22
- cobweb_/pipelines/pipeline_loghub.py +0 -34
- cobweb_/setting.py +0 -74
- cobweb_/utils/__init__.py +0 -5
- cobweb_/utils/bloom.py +0 -58
- cobweb_/utils/dotting.py +0 -32
- cobweb_/utils/oss.py +0 -94
- cobweb_/utils/tools.py +0 -42
- cobweb_launcher-1.3.5.dist-info/RECORD +0 -111
- cobweb_launcher-1.3.5.dist-info/top_level.txt +0 -2
- cobweb_new/__init__.py +0 -2
- cobweb_new/base/__init__.py +0 -72
- cobweb_new/base/common_queue.py +0 -53
- cobweb_new/base/decorators.py +0 -72
- cobweb_new/base/item.py +0 -46
- cobweb_new/base/log.py +0 -94
- cobweb_new/base/request.py +0 -82
- cobweb_new/base/response.py +0 -23
- cobweb_new/base/seed.py +0 -118
- cobweb_new/constant.py +0 -105
- cobweb_new/crawlers/__init__.py +0 -1
- cobweb_new/crawlers/crawler-new.py +0 -85
- cobweb_new/crawlers/crawler.py +0 -170
- cobweb_new/db/__init__.py +0 -2
- cobweb_new/db/api_db.py +0 -82
- cobweb_new/db/redis_db.py +0 -158
- cobweb_new/exceptions/__init__.py +0 -1
- cobweb_new/exceptions/oss_db_exception.py +0 -28
- cobweb_new/launchers/__init__.py +0 -3
- cobweb_new/launchers/launcher.py +0 -237
- cobweb_new/launchers/launcher_air.py +0 -88
- cobweb_new/launchers/launcher_api.py +0 -161
- cobweb_new/launchers/launcher_pro.py +0 -96
- cobweb_new/launchers/tesss.py +0 -47
- cobweb_new/pipelines/__init__.py +0 -3
- cobweb_new/pipelines/pipeline.py +0 -68
- cobweb_new/pipelines/pipeline_console.py +0 -22
- cobweb_new/pipelines/pipeline_loghub.py +0 -34
- cobweb_new/setting.py +0 -95
- cobweb_new/utils/__init__.py +0 -5
- cobweb_new/utils/bloom.py +0 -58
- cobweb_new/utils/oss.py +0 -94
- cobweb_new/utils/tools.py +0 -42
- {cobweb_launcher-1.3.5.dist-info → cobweb_launcher-1.3.7.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.3.5.dist-info → cobweb_launcher-1.3.7.dist-info}/WHEEL +0 -0
cobweb_new/db/api_db.py
DELETED
@@ -1,82 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
import json
|
3
|
-
import requests
|
4
|
-
|
5
|
-
|
6
|
-
class ApiDB:
    """Thin HTTP client for a service that exposes Redis-style commands.

    Each public method maps to one endpoint below ``host``.  The return value
    is the ``"data"`` field of the JSON reply, or ``None`` when the request,
    the JSON decoding, or the ``"data"`` lookup fails (best-effort semantics).
    """

    def __init__(self, host=None, **kwargs):
        """Create a client.

        Args:
            host: Base URL of the service.  Falls back to the
                ``REDIS_API_HOST`` environment variable, then to
                ``http://127.0.0.1:4396``.
            **kwargs: ``timeout`` (seconds, optional) is forwarded to every
                request; other keys are accepted and ignored.
        """
        self.host = host or os.getenv("REDIS_API_HOST", "http://127.0.0.1:4396")
        # None keeps the previous behaviour (no timeout).
        self.timeout = kwargs.get("timeout")

    def _request(self, method, api, params=None, data=None):
        """Send one request and return the reply's ``"data"`` field, or None on failure."""
        try:
            url = self.host + api
            if method == "POST":
                response = requests.post(
                    url,
                    headers={"Content-Type": "application/json"},
                    params=params,
                    data=json.dumps(data),
                    timeout=self.timeout,
                )
            else:
                response = requests.get(url, params=params, timeout=self.timeout)
            try:
                return response.json()["data"]
            finally:
                # Release the connection even when decoding the body fails.
                response.close()
        except Exception:
            # Deliberately best-effort: callers treat None as "no answer".
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return None

    def _get_response(self, api, params: dict = None):
        """GET ``api`` with query ``params``; return the ``"data"`` field or None."""
        return self._request("GET", api, params=params)

    def _post_response(self, api, params: dict = None, data: dict = None):
        """POST ``data`` as JSON to ``api``; return the ``"data"`` field or None."""
        return self._request("POST", api, params=params, data=data)

    def get(self, name):
        """Call ``/get`` for ``name``."""
        return self._get_response(api="/get", params=dict(name=name))

    def setnx(self, name, value=""):
        """Call ``/setnx`` with ``name`` and ``value``."""
        return self._get_response(api="/setnx", params=dict(name=name, value=value))

    def setex(self, name, t, value=""):
        """Call ``/setex`` with ``name``, ``value`` and ``t``."""
        return self._get_response(api="/setex", params=dict(name=name, value=value, t=t))

    def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
        """Call ``/expire`` with ``name``, ``t`` and the ``nx``/``xx``/``gt``/``lt`` flags."""
        return self._get_response(api="/expire", params=dict(name=name, t=t, nx=nx, xx=xx, gt=gt, lt=lt))

    def ttl(self, name):
        """Call ``/ttl`` for ``name``."""
        return self._get_response(api="/ttl", params=dict(name=name))

    def delete(self, name):
        """Call ``/delete`` for ``name``."""
        return self._get_response(api="/delete", params=dict(name=name))

    def exists(self, name):
        """Call ``/exists`` for ``name``."""
        return self._get_response(api="/exists", params=dict(name=name))

    def incrby(self, name, value):
        """Call ``/incrby`` with ``name`` and ``value``."""
        return self._get_response(api="/incrby", params=dict(name=name, value=value))

    def zcard(self, name):
        """Call ``/zcard`` for ``name``; returns whatever the service reports, or None."""
        return self._get_response(api="/zcard", params=dict(name=name))

    def zadd(self, name, item: dict, **kwargs):
        """POST to ``/zadd`` with ``item`` as the ``mapping`` plus any extra ``kwargs``."""
        return self._post_response(api="/zadd", data=dict(name=name, mapping=item, **kwargs))

    def zrem(self, name, *values):
        """POST to ``/zrem`` to remove ``values`` from ``name``."""
        return self._post_response(api="/zrem", data=dict(name=name, values=values))

    def zcount(self, name, _min, _max):
        """Call ``/zcount`` for ``name`` between ``_min`` and ``_max``."""
        return self._get_response(api="/zcount", params=dict(name=name, min=_min, max=_max))

    def lock(self, name, t=15) -> bool:
        """Call ``/lock`` for ``name`` with ``t``."""
        return self._get_response(api="/lock", params=dict(name=name, t=t))

    def auto_incr(self, name, t=15, limit=1000) -> bool:
        """Call ``/auto_incr`` for ``name`` with ``t`` and ``limit``."""
        return self._get_response(api="/auto_incr", params=dict(name=name, t=t, limit=limit))

    def members(self, name, score, start=0, count=5000, _min="-inf", _max="+inf"):
        """Call ``/members`` for ``name`` with the given score, window and range."""
        return self._get_response(api="/members", params=dict(name=name, score=score, start=start, count=count, min=_min, max=_max))

    def done(self, name: list, *values):
        """POST to ``/done`` with ``name`` and ``values``."""
        return self._post_response(api="/done", data=dict(name=name, values=values))
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
cobweb_new/db/redis_db.py
DELETED
@@ -1,158 +0,0 @@
|
|
1
|
-
import redis
|
2
|
-
from cobweb import setting
|
3
|
-
|
4
|
-
|
5
|
-
class RedisDB:
    """Wrapper around a pooled ``redis.Redis`` client plus a few Lua-scripted helpers."""

    def __init__(self, **kwargs):
        """Build the client from ``kwargs``, or from ``setting.REDIS_CONFIG`` when none are given."""
        redis_config = kwargs or setting.REDIS_CONFIG
        pool = redis.ConnectionPool(**redis_config)
        self._client = redis.Redis(connection_pool=pool)

    def setnx(self, name, value=""):
        """Delegate to the client's ``setnx``."""
        return self._client.setnx(name, value)

    def setex(self, name, t, value=""):
        """Delegate to the client's ``setex`` with ``t`` as the expiry."""
        return self._client.setex(name, t, value)

    def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
        """Delegate to the client's ``expire``, forwarding the NX/XX/GT/LT flags."""
        return self._client.expire(name, t, nx=nx, xx=xx, gt=gt, lt=lt)

    def ttl(self, name):
        """Delegate to the client's ``ttl``."""
        return self._client.ttl(name)

    def delete(self, name):
        """Delegate to the client's ``delete``."""
        return self._client.delete(name)

    def exists(self, *name) -> int:
        """Delegate to the client's ``exists`` for all given names."""
        return self._client.exists(*name)

    def incrby(self, name, value):
        """Delegate to the client's ``incrby``."""
        return self._client.incrby(name, value)

    def sadd(self, name, value):
        """Delegate to the client's ``sadd``."""
        return self._client.sadd(name, value)

    def zcard(self, name) -> int:
        """Delegate to the client's ``zcard``."""
        return self._client.zcard(name)

    def zadd(self, name, item: dict, **kwargs):
        """Delegate to the client's ``zadd`` with ``item`` as the member/score mapping."""
        return self._client.zadd(name, item, **kwargs)

    def zrem(self, name, *value):
        """Delegate to the client's ``zrem``."""
        return self._client.zrem(name, *value)

    def zcount(self, name, _min, _max):
        """Delegate to the client's ``zcount``."""
        return self._client.zcount(name, _min, _max)

    def lua(self, script: str, keys: list = None, args: list = None):
        """EVAL ``script`` with ``keys`` and ``args`` (both default to empty)."""
        keys = keys or []
        args = args or []
        return self._client.eval(script, len(keys), *keys, *args)

    def lua_sha(self, sha1: str, keys: list = None, args: list = None):
        """EVALSHA the script identified by ``sha1`` with ``keys`` and ``args``."""
        keys = keys or []
        args = args or []
        return self._client.evalsha(sha1, len(keys), *keys, *args)

    def execute_lua(self, lua_script: str, keys: list, *args):
        """Register ``lua_script`` on the client and run it with ``keys`` and ``args``."""
        execute = self._client.register_script(lua_script)
        return execute(keys=keys, args=args)

    def lock(self, key, t=15) -> bool:
        """SETNX ``key`` and, if it was newly set, give it an expiry of ``t``.

        Returns True when this call set the key, False otherwise.
        """
        lua_script = """
            local status = redis.call('setnx', KEYS[1], 1)
            if ( status == 1 ) then
                redis.call('expire', KEYS[1], ARGV[1])
            end
            return status
        """
        status = self.execute_lua(lua_script, [key], t)
        return bool(status)

    def auto_incr(self, name, t=15, limit=1000):
        """Increment the counter at ``name`` while the value read is below ``limit``.

        The script creates the key when it is missing, gives it an expiry of
        ``t`` when it has none, and otherwise reads the current value.
        Returns True when the counter was incremented.
        """
        lua_script = """
            local count = 0
            local status = false
            local limit = ARGV[2]
            local expire = redis.call('ttl', KEYS[1])

            if ( expire == -2 ) then
                redis.call('setnx', KEYS[1], 1)
            elseif ( expire == -1) then
                redis.call('expire', KEYS[1], ARGV[1])
            else
                count = redis.call('get', KEYS[1])
            end

            if ( count + 0 < limit + 0 ) then
                status = true
                redis.call('incr', KEYS[1])
            end

            return status
        """
        status = self.execute_lua(lua_script, [name], t, limit)
        return bool(status)

    def members(self, key, score, start=0, count=5000, _min="-inf", _max="+inf") -> list:
        """Fetch members of the sorted set ``key`` and rewrite their scores.

        Members with a score in ``[_min, _max]`` are read (offset ``start``,
        at most ``count``), each is re-added with a new score derived from
        ``score`` and its current score, and the list of
        ``(member, origin_priority)`` tuples is returned.
        """
        # NOTE(review): in the script below `type(count) == string` compares
        # against Lua's `string` library table, so it is always false and the
        # LIMIT form is always the one executed.  Left untouched on purpose.
        lua_script = """
            local min = ARGV[1]
            local max = ARGV[2]
            local start = ARGV[3]
            local count = ARGV[4]
            local score = ARGV[5]
            local members = nil

            if ( type(count) == string ) then
                members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES')
            else
                members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)
            end

            local result = {}

            for i = 1, #members, 2 do
                local priority = nil
                local member = members[i]
                local originPriority = nil
                if ( members[i+1] + 0 < 0 ) then
                    originPriority = math.ceil(members[i+1]) * 1000 - members[i+1] * 1000
                else
                    originPriority = math.floor(members[i+1])
                end

                if ( score + 0 >= 1000 ) then
                    priority = -score - originPriority / 1000
                elseif ( score + 0 == 0 ) then
                    priority = originPriority
                else
                    originPriority = score
                    priority = score
                end
                redis.call('zadd', KEYS[1], priority, member)
                table.insert(result, member)
                table.insert(result, originPriority)
            end

            return result
        """
        members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
        pairs = []
        for i in range(0, len(members), 2):
            member = members[i]
            # The client returns bytes unless it was built with
            # decode_responses=True, in which case it is already str.
            if isinstance(member, bytes):
                member = member.decode()
            pairs.append((member, int(members[i + 1])))
        return pairs

    def done(self, keys: list, *args) -> None:
        """Move every member in ``args`` from sorted set ``keys[0]`` into set ``keys[1]``."""
        lua_script = """
            for i, member in ipairs(ARGV) do
                redis.call("zrem", KEYS[1], member)
                redis.call("sadd", KEYS[2], member)
            end
        """
        self.execute_lua(lua_script, keys, *args)
|
156
|
-
|
157
|
-
|
158
|
-
|
@@ -1 +0,0 @@
|
|
1
|
-
from .oss_db_exception import *
|
@@ -1,28 +0,0 @@
|
|
1
|
-
class OssDBException(Exception):
    """Base oss client exception that all others inherit."""


class OssDBMergeError(OssDBException):
    """Exception raised when a merge operation fails."""


class OssDBPutPartError(OssDBException):
    """Exception raised when an upload-part operation fails."""


class OssDBPutObjError(OssDBException):
    """Exception raised when an upload operation fails."""


class OssDBAppendObjError(OssDBException):
    """Exception raised when an append-object operation fails."""


class OssDBInitPartError(OssDBException):
    """Exception raised when initialising an upload operation fails."""
|
cobweb_new/launchers/__init__.py
DELETED
cobweb_new/launchers/launcher.py
DELETED
@@ -1,237 +0,0 @@
|
|
1
|
-
import time
|
2
|
-
import inspect
|
3
|
-
import threading
|
4
|
-
import importlib
|
5
|
-
|
6
|
-
from inspect import isgenerator
|
7
|
-
from typing import Union, Callable
|
8
|
-
|
9
|
-
from constant import DealModel, LogTemplate
|
10
|
-
from cobweb.utils import dynamic_load_class
|
11
|
-
from cobweb.base import Seed, Queue, logger, TaskQueue
|
12
|
-
from cobweb import setting
|
13
|
-
|
14
|
-
|
15
|
-
class Launcher(threading.Thread):
    """Thread that configures a crawl task and supervises its worker threads.

    Construction merges user settings into the shared ``setting`` module and
    copies the values it needs onto the instance.  ``run()`` loads the crawler
    and pipeline classes, registers the crawler workers and then keeps them
    alive until ``stop`` is set.
    """

    __CUSTOM_FUNC__ = {}

    def __init__(self, task, project, custom_setting=None, **kwargs):
        """Create a launcher.

        Args:
            task: Task name (used in log messages).
            project: Project name.
            custom_setting: Optional overrides, given as a dict, a module, or
                the dotted path of an importable module.
            **kwargs: Extra overrides; they win over ``custom_setting``.

        Raises:
            TypeError: ``custom_setting`` is neither a dict, a module, nor a
                module path.
        """
        super().__init__()
        self.task = task
        self.project = project
        self.custom_func = dict()
        self.app_time = int(time.time())

        _setting = dict()

        if custom_setting:
            if isinstance(custom_setting, dict):
                # Copy so the kwargs merge below does not mutate the caller's dict.
                _setting = dict(custom_setting)
            else:
                if isinstance(custom_setting, str):
                    custom_setting = importlib.import_module(custom_setting)
                if not inspect.ismodule(custom_setting):
                    raise TypeError(
                        "custom_setting must be a dict, a module, or an importable module path"
                    )
                for k, v in custom_setting.__dict__.items():
                    if not k.startswith("__") and not inspect.ismodule(v):
                        _setting[k] = v

        _setting.update(**kwargs)

        # Overrides are written onto the shared ``setting`` module (upper-cased).
        for k, v in _setting.items():
            setattr(setting, k.upper(), v)

        self.scheduling_wait_time = setting.SCHEDULING_WAIT_TIME
        self.inserting_wait_time = setting.INSERTING_WAIT_TIME
        self.removing_wait_time = setting.REMOVING_WAIT_TIME

        self.scheduling_size = setting.SCHEDULING_SIZE
        self.inserting_size = setting.INSERTING_SIZE
        self.removing_size = setting.REMOVING_SIZE

        self.todo_queue_size = setting.TODO_QUEUE_SIZE
        self.seed_queue_size = setting.SEED_QUEUE_SIZE
        self.request_queue_size = setting.REQUEST_QUEUE_SIZE
        self.download_queue_size = setting.DOWNLOAD_QUEUE_SIZE
        self.response_queue_size = setting.RESPONSE_QUEUE_SIZE
        self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
        self.delete_queue_size = setting.DELETE_QUEUE_SIZE
        self.done_queue_size = setting.DONE_QUEUE_SIZE

        self.stop = threading.Event()  # stop event
        self.pause = threading.Event()  # pause event

        self.crawler_path = setting.CRAWLER
        self.pipeline_path = setting.PIPELINE

        self._threads = []

        self._task_info = dict(todo={}, download={})
        # Read by _polling for its status log; must exist before the first poll.
        self.doing_seeds = {}

        self.before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS

        self.todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
        self.new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
        self.done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
        self.upload_queue_wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
        self.seed_reset_seconds = setting.SEED_RESET_SECONDS

        self.spider_max_retries = setting.SPIDER_MAX_RETRIES
        self.spider_thread_num = setting.SPIDER_THREAD_NUM
        self.spider_time_sleep = setting.SPIDER_TIME_SLEEP
        self.spider_max_count = setting.SPIDER_MAX_COUNT
        self.time_window = setting.TIME_WINDOW

        self.done_model = setting.DONE_MODEL
        self.task_model = setting.TASK_MODEL

        self.filter_field = setting.FILTER_FIELD

    @staticmethod
    def insert_seed(seed: Union["Seed", dict]):
        """Push ``seed`` (a Seed, or a dict converted to one) onto ``TaskQueue.SEED``."""
        if isinstance(seed, dict):
            seed = Seed(seed)
        TaskQueue.SEED.push(seed)

    @property
    def request(self):
        """
        Register a custom request function.
        use case:
            from cobweb.base import Request, BaseItem
            @launcher.request
            def request(seed: Seed) -> Union[Request, BaseItem]:
                ...
                yield Request(seed.url, seed)
        """
        def decorator(func):
            self.custom_func['request'] = func
            # Hand the function back so the decorated name is not rebound to None.
            return func
        return decorator

    @property
    def download(self):
        """
        Register a custom download function.
        use case:
            from cobweb.base import Request, Response, Seed, BaseItem
            @launcher.download
            def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
                ...
                yield Response(item.seed, response)
        """
        def decorator(func):
            self.custom_func['download'] = func
            return func
        return decorator

    @property
    def parse(self):
        """
        Register a custom parse function; xxxItem stands for a user-defined item type.
        use case:
            from cobweb.base import Request, Response
            @launcher.parse
            def parse(item: Response) -> BaseItem:
               ...
               yield xxxItem(seed, **kwargs)
        """
        def decorator(func):
            self.custom_func['parse'] = func
            return func
        return decorator

    def remove_working_items(self, key, items):
        """Drop each of ``items`` from ``self._task_info[key]``; missing items are ignored."""
        for item in items:
            self._task_info[key].pop(item, None)

    @staticmethod
    def _new_thread(name, func, args):
        """Build an unstarted worker thread and remember how to rebuild it."""
        thread = threading.Thread(name=name, target=func, args=args)
        # Thread drops its target once it has run, so keep our own copy.
        thread._launcher_spec = (func, args)
        return thread

    def check_alive(self):
        """Start registered workers and replace those that exited, until ``stop`` is set.

        Nothing is (re)started while ``pause`` is set.
        """
        while not self.stop.is_set():
            if not self.pause.is_set():
                for index, thread in enumerate(self._threads):
                    if thread.is_alive():
                        continue
                    if thread.ident is not None:
                        # Already ran and exited.  A Thread object can only be
                        # started once, so build a replacement from its spec.
                        spec = getattr(thread, "_launcher_spec", None)
                        if spec is None:
                            continue
                        thread = self._new_thread(thread.name, *spec)
                        self._threads[index] = thread
                    thread.start()
            time.sleep(1)

    def _add_thread(self, func, num=1, obj=None, name=None, args=()):
        """Register ``num`` worker threads running ``func(obj, *args)``.

        Thread names are ``<class of obj><name or func.__name__>``, suffixed
        with ``_<index>`` when ``num`` > 1.  ``obj`` defaults to this launcher.
        """
        obj = obj or self
        # Parenthesised: ``a + b or c`` parses as ``(a + b) or c`` and fails on None.
        name = obj.__class__.__name__ + (name or func.__name__)
        for i in range(num):
            func_name = name + "_" + str(i) if num > 1 else name
            self._threads.append(self._new_thread(func_name, func, (obj,) + args))

    def _init_schedule_thread(self):
        """Hook called from ``run``; does nothing in this class."""
        ...

    def _polling(self):
        """Watch the task queues every 10 seconds and drive ``pause``/``stop``.

        After more than two consecutive empty checks the launcher pauses.
        While paused and empty it stops once ``task_model`` is falsy and the
        run time exceeds ``before_scheduler_wait_seconds``.
        """
        check_emtpy_times = 0
        while not self.stop.is_set():
            if TaskQueue.is_empty():
                if self.pause.is_set():
                    run_time = int(time.time()) - self.app_time
                    if not self.task_model and run_time > self.before_scheduler_wait_seconds:
                        logger.info("Done! ready to close thread...")
                        self.stop.set()
                    else:
                        logger.info("pause! waiting for resume...")
                elif check_emtpy_times > 2:
                    logger.info("pause! waiting for resume...")
                    self.doing_seeds = {}
                    self.pause.set()
                else:
                    logger.info(
                        "check whether the task is complete, "
                        f"reset times {3 - check_emtpy_times}"
                    )
                    check_emtpy_times += 1
            elif TaskQueue.TODO.length:
                # NOTE(review): this branch runs whenever TODO is non-empty,
                # whether or not the launcher is paused — confirm that is intended.
                logger.info(f"Recovery {self.task} task run!")
                check_emtpy_times = 0
                self.pause.clear()
            else:
                logger.info(LogTemplate.launcher_polling.format(
                    task=self.task,
                    doing_len=len(self.doing_seeds.keys()),
                    todo_len=TaskQueue.TODO.length,
                    done_len=TaskQueue.DONE.length,
                    upload_len=TaskQueue.UPLOAD.length,
                ))

            time.sleep(10)

        logger.info("Done! Ready to close thread...")

    def run(self):
        """Load the crawler and pipeline, register the crawler workers and supervise them."""
        Crawler = dynamic_load_class(self.crawler_path)
        Pipeline = dynamic_load_class(self.pipeline_path)

        crawler = Crawler(
            stop=self.stop, pause=self.pause,
            thread_num=self.spider_thread_num,
            time_sleep=self.spider_time_sleep,
            custom_func=self.custom_func
        )

        # ``upload_queue_max_size`` is never assigned on this class; the value
        # loaded in __init__ is ``upload_queue_size``.
        pipeline = Pipeline(
            stop=self.stop, pause=self.pause,
            upload_size=self.upload_queue_size,
            wait_seconds=self.upload_queue_wait_seconds
        )

        self._add_thread(obj=crawler, func=crawler.build_request_item)
        self._add_thread(obj=crawler, func=crawler.build_download_item, num=self.spider_thread_num)
        self._add_thread(obj=crawler, func=crawler.build_parse_item)

        self._init_schedule_thread()
        self.check_alive()
|
235
|
-
|
236
|
-
|
237
|
-
|
@@ -1,88 +0,0 @@
|
|
1
|
-
import time
|
2
|
-
|
3
|
-
from cobweb.base import logger
|
4
|
-
from cobweb.constant import LogTemplate
|
5
|
-
from .launcher import Launcher, check_pause
|
6
|
-
|
7
|
-
|
8
|
-
class LauncherAir(Launcher):
    """Launcher variant that moves seeds between its in-process queues.

    NOTE(review): relies on ``_stop``, ``_pause``, ``_task_model``,
    ``__LAUNCHER_QUEUE__``, ``__DOING__``, ``_execute`` and
    ``_remove_doing_seeds`` being provided by the base class — confirm
    against the ``Launcher`` actually imported.
    """

    # def _scheduler(self):
    #     if self.start_seeds:
    #         self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)

    @check_pause
    def _insert(self):
        """Move up to ``_new_queue_max_size`` seeds from the 'new' queue into 'todo'."""
        batch_limit = self._new_queue_max_size
        # Decide up front whether to sleep afterwards: only when 'new' was under the limit.
        should_wait = self.__LAUNCHER_QUEUE__['new'].length < batch_limit

        pending = {}
        for _ in range(batch_limit):
            seed = self.__LAUNCHER_QUEUE__['new'].pop()
            if not seed:
                break
            pending[seed.to_string] = seed.params.priority

        if pending:
            self.__LAUNCHER_QUEUE__['todo'].push(pending)

        if should_wait:
            time.sleep(self._new_queue_wait_seconds)

    @check_pause
    def _delete(self):
        """Drain up to ``_done_queue_max_size`` seeds from 'done' and clear them from the doing set."""
        batch_limit = self._done_queue_max_size
        should_wait = self.__LAUNCHER_QUEUE__['done'].length < batch_limit

        finished = []
        for _ in range(batch_limit):
            seed = self.__LAUNCHER_QUEUE__['done'].pop()
            if not seed:
                break
            finished.append(seed.to_string)

        if finished:
            self._remove_doing_seeds(finished)

        if should_wait:
            time.sleep(self._done_queue_wait_seconds)

    def _polling(self):
        """Poll the queues until ``_stop`` is set, pausing, resuming or stopping the task.

        Sleeps 3 seconds after a round where every queue was empty and
        30 seconds otherwise.
        """
        check_emtpy_times = 0

        while not self._stop.is_set():
            busy_queues = sum(
                1 for q in self.__LAUNCHER_QUEUE__.values() if q.length != 0
            )

            if busy_queues:
                pooling_wait_seconds = 30
                if self._pause.is_set():
                    # Work has arrived while paused: resume.
                    self._pause.clear()
                    self._execute()
                else:
                    logger.info(LogTemplate.launcher_air_polling.format(
                        task=self.task,
                        doing_len=len(self.__DOING__.keys()),
                        todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
                        done_len=self.__LAUNCHER_QUEUE__['done'].length,
                        upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
                    ))
            else:
                pooling_wait_seconds = 3
                if self._pause.is_set():
                    check_emtpy_times = 0
                    if not self._task_model:
                        logger.info("Done! Ready to close thread...")
                        self._stop.set()
                elif check_emtpy_times > 2:
                    self.__DOING__ = {}
                    self._pause.set()
                else:
                    logger.info(
                        "check whether the task is complete, "
                        f"reset times {3 - check_emtpy_times}"
                    )
                    check_emtpy_times += 1

            time.sleep(pooling_wait_seconds)
|
87
|
-
|
88
|
-
|