cobweb-launcher 1.3.5__py3-none-any.whl → 1.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {cobweb_launcher-1.3.5.dist-info → cobweb_launcher-1.3.7.dist-info}/METADATA +1 -1
  2. cobweb_launcher-1.3.7.dist-info/RECORD +40 -0
  3. cobweb_launcher-1.3.7.dist-info/top_level.txt +1 -0
  4. cobweb/base/decorators.py +0 -40
  5. cobweb/crawlers/base_crawler.py +0 -144
  6. cobweb/crawlers/file_crawler.py +0 -98
  7. cobweb/pipelines/base_pipeline.py +0 -54
  8. cobweb/pipelines/loghub_pipeline.py +0 -34
  9. cobweb/utils/dotting.py +0 -32
  10. cobweb_/__init__.py +0 -2
  11. cobweb_/base/__init__.py +0 -9
  12. cobweb_/base/common_queue.py +0 -30
  13. cobweb_/base/decorators.py +0 -40
  14. cobweb_/base/item.py +0 -46
  15. cobweb_/base/log.py +0 -94
  16. cobweb_/base/request.py +0 -82
  17. cobweb_/base/response.py +0 -23
  18. cobweb_/base/seed.py +0 -114
  19. cobweb_/constant.py +0 -94
  20. cobweb_/crawlers/__init__.py +0 -1
  21. cobweb_/crawlers/crawler.py +0 -184
  22. cobweb_/db/__init__.py +0 -2
  23. cobweb_/db/api_db.py +0 -82
  24. cobweb_/db/redis_db.py +0 -130
  25. cobweb_/exceptions/__init__.py +0 -1
  26. cobweb_/exceptions/oss_db_exception.py +0 -28
  27. cobweb_/launchers/__init__.py +0 -3
  28. cobweb_/launchers/launcher.py +0 -235
  29. cobweb_/launchers/launcher_air.py +0 -88
  30. cobweb_/launchers/launcher_api.py +0 -221
  31. cobweb_/launchers/launcher_pro.py +0 -222
  32. cobweb_/pipelines/__init__.py +0 -3
  33. cobweb_/pipelines/pipeline.py +0 -69
  34. cobweb_/pipelines/pipeline_console.py +0 -22
  35. cobweb_/pipelines/pipeline_loghub.py +0 -34
  36. cobweb_/setting.py +0 -74
  37. cobweb_/utils/__init__.py +0 -5
  38. cobweb_/utils/bloom.py +0 -58
  39. cobweb_/utils/dotting.py +0 -32
  40. cobweb_/utils/oss.py +0 -94
  41. cobweb_/utils/tools.py +0 -42
  42. cobweb_launcher-1.3.5.dist-info/RECORD +0 -111
  43. cobweb_launcher-1.3.5.dist-info/top_level.txt +0 -2
  44. cobweb_new/__init__.py +0 -2
  45. cobweb_new/base/__init__.py +0 -72
  46. cobweb_new/base/common_queue.py +0 -53
  47. cobweb_new/base/decorators.py +0 -72
  48. cobweb_new/base/item.py +0 -46
  49. cobweb_new/base/log.py +0 -94
  50. cobweb_new/base/request.py +0 -82
  51. cobweb_new/base/response.py +0 -23
  52. cobweb_new/base/seed.py +0 -118
  53. cobweb_new/constant.py +0 -105
  54. cobweb_new/crawlers/__init__.py +0 -1
  55. cobweb_new/crawlers/crawler-new.py +0 -85
  56. cobweb_new/crawlers/crawler.py +0 -170
  57. cobweb_new/db/__init__.py +0 -2
  58. cobweb_new/db/api_db.py +0 -82
  59. cobweb_new/db/redis_db.py +0 -158
  60. cobweb_new/exceptions/__init__.py +0 -1
  61. cobweb_new/exceptions/oss_db_exception.py +0 -28
  62. cobweb_new/launchers/__init__.py +0 -3
  63. cobweb_new/launchers/launcher.py +0 -237
  64. cobweb_new/launchers/launcher_air.py +0 -88
  65. cobweb_new/launchers/launcher_api.py +0 -161
  66. cobweb_new/launchers/launcher_pro.py +0 -96
  67. cobweb_new/launchers/tesss.py +0 -47
  68. cobweb_new/pipelines/__init__.py +0 -3
  69. cobweb_new/pipelines/pipeline.py +0 -68
  70. cobweb_new/pipelines/pipeline_console.py +0 -22
  71. cobweb_new/pipelines/pipeline_loghub.py +0 -34
  72. cobweb_new/setting.py +0 -95
  73. cobweb_new/utils/__init__.py +0 -5
  74. cobweb_new/utils/bloom.py +0 -58
  75. cobweb_new/utils/oss.py +0 -94
  76. cobweb_new/utils/tools.py +0 -42
  77. {cobweb_launcher-1.3.5.dist-info → cobweb_launcher-1.3.7.dist-info}/LICENSE +0 -0
  78. {cobweb_launcher-1.3.5.dist-info → cobweb_launcher-1.3.7.dist-info}/WHEEL +0 -0
cobweb_/db/redis_db.py DELETED
@@ -1,130 +0,0 @@
1
import redis
from cobweb import setting


class RedisDB:
    """Thin convenience wrapper around a pooled redis-py client.

    Besides simple pass-through commands, it bundles the Lua scripts the
    scheduler uses to atomically lease seeds out of a sorted set
    (``members``) and to mark them finished (``done``).
    """

    def __init__(self, **kwargs):
        # Explicit connection kwargs take precedence; otherwise fall back
        # to the project-wide REDIS_CONFIG from settings.
        redis_config = kwargs or setting.REDIS_CONFIG
        pool = redis.ConnectionPool(**redis_config)
        self._client = redis.Redis(connection_pool=pool)

    def setnx(self, name, value=""):
        # SET if Not eXists; truthy when the key was created.
        return self._client.setnx(name, value)

    def setex(self, name, t, value=""):
        # SET with an expiry of ``t`` seconds.
        return self._client.setex(name, t, value)

    def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
        # Positional pass-through mirrors redis-py expire(name, time, nx, xx, gt, lt).
        return self._client.expire(name, t, nx, xx, gt, lt)

    def ttl(self, name):
        # Remaining time-to-live of ``name`` in seconds.
        return self._client.ttl(name)

    def delete(self, name):
        return self._client.delete(name)

    def exists(self, *name) -> bool:
        # NOTE(review): redis-py EXISTS actually returns the number of
        # keys found (int); callers appear to rely on truthiness only.
        return self._client.exists(*name)

    def sadd(self, name, value):
        return self._client.sadd(name, value)

    def zcard(self, name) -> bool:
        # NOTE(review): ZCARD returns an int cardinality; the ``bool``
        # annotation looks copied from ``exists`` — confirm callers.
        return self._client.zcard(name)

    def zadd(self, name, item: dict, **kwargs):
        # ``item`` maps member -> score.
        return self._client.zadd(name, item, **kwargs)

    def zrem(self, name, *value):
        return self._client.zrem(name, *value)

    def zcount(self, name, _min, _max):
        return self._client.zcount(name, _min, _max)

    # def zrangebyscore(self, name, _min, _max, start, num, withscores: bool = False, *args):
    #     return self._client.zrangebyscore(name, _min, _max, start, num, withscores, *args)

    def lua(self, script: str, keys: list = None, args: list = None):
        """EVAL ``script`` with the given KEYS/ARGV lists."""
        keys = keys or []
        args = args or []
        keys_count = len(keys)
        return self._client.eval(script, keys_count, *keys, *args)

    def lua_sha(self, sha1: str, keys: list = None, args: list = None):
        """EVALSHA a script already cached on the server by its SHA1."""
        keys = keys or []
        args = args or []
        keys_count = len(keys)
        return self._client.evalsha(sha1, keys_count, *keys, *args)

    def execute_lua(self, lua_script: str, keys: list, *args):
        # register_script caches by SHA1, so repeated calls send only the
        # hash instead of the whole script body.
        execute = self._client.register_script(lua_script)
        return execute(keys=keys, args=args)

    def lock(self, key, t=15) -> bool:
        """Acquire a best-effort distributed lock.

        SETNX + EXPIRE run atomically in Lua; returns True when this
        caller created ``key`` (auto-released after ``t`` seconds).
        """
        lua_script = """
            local status = redis.call('setnx', KEYS[1], 1)
            if ( status == 1 ) then
                redis.call('expire', KEYS[1], ARGV[1])
            end
            return status
        """
        status = self.execute_lua(lua_script, [key], t)
        return bool(status)

    def members(self, key, score, start=0, count=5000, _min="-inf", _max="+inf") -> list:
        """Atomically lease members from the sorted set ``key``.

        Members in the score window [_min, _max] are re-scored according
        to ``score`` (so concurrent workers will not pick them up again)
        and returned as ``[(member, original_priority), ...]``.

        NOTE(review): in the script, ``type(count) == string`` compares
        against the Lua string *library table* (ARGV values are always
        strings, and ``type()`` returns the string "string"), so the
        condition is always false and the LIMIT branch is always taken —
        confirm whether an unlimited fetch was ever intended.
        """
        lua_script = """
            local min = ARGV[1]
            local max = ARGV[2]
            local start = ARGV[3]
            local count = ARGV[4]
            local score = ARGV[5]
            local members = nil

            if ( type(count) == string ) then
                members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES')
            else
                members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)
            end

            local result = {}

            for i = 1, #members, 2 do
                local priority = nil
                local member = members[i]
                local originPriority = nil
                if ( members[i+1] + 0 < 0 ) then
                    originPriority = math.ceil(members[i+1]) * 1000 - members[i+1] * 1000
                else
                    originPriority = math.floor(members[i+1])
                end

                if ( score + 0 >= 1000 ) then
                    priority = -score - originPriority / 1000
                elseif ( score + 0 == 0 ) then
                    priority = originPriority
                else
                    originPriority = score
                    priority = score
                end
                redis.call('zadd', KEYS[1], priority, member)
                table.insert(result, member)
                table.insert(result, originPriority)
            end

            return result
        """
        members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
        # Flatten the [member, priority, member, priority, ...] reply into pairs.
        return [(members[i].decode(), int(members[i + 1])) for i in range(0, len(members), 2)]

    def done(self, keys: list, *args) -> list:
        """Mark seeds finished.

        ``keys`` is [running-zset, done-set]; each member in ``args`` is
        ZREM'd from the first key and SADD'ed into the second.
        NOTE(review): annotated ``-> list`` but actually returns None.
        """
        lua_script = """
            for i, member in ipairs(ARGV) do
                redis.call("zrem", KEYS[1], member)
                redis.call("sadd", KEYS[2], member)
            end
        """
        self.execute_lua(lua_script, keys, *args)
@@ -1 +0,0 @@
1
- from .oss_db_exception import *
@@ -1,28 +0,0 @@
1
class OssDBException(Exception):
    """Root of the OSS client exception hierarchy."""


class OssDBMergeError(OssDBException):
    """Raised when a merge operation fails."""


class OssDBPutPartError(OssDBException):
    """Raised when uploading a single part fails."""


class OssDBPutObjError(OssDBException):
    """Raised when a whole-object upload fails."""


class OssDBAppendObjError(OssDBException):
    """Raised when an append-object upload fails."""


class OssDBInitPartError(OssDBException):
    """Raised when initialising a multipart upload fails."""
@@ -1,3 +0,0 @@
1
- from .launcher_air import LauncherAir
2
- from .launcher_pro import LauncherPro
3
- from .launcher_api import LauncherApi
@@ -1,235 +0,0 @@
1
- import time
2
- import inspect
3
- import threading
4
- import importlib
5
- from functools import wraps
6
-
7
-
8
- from cobweb import setting
9
- from cobweb.base import Seed, Queue, logger
10
- from cobweb.utils.tools import dynamic_load_class
11
-
12
-
13
def check_pause(func):
    """Decorator: repeatedly invoke *func* until ``self._pause`` is set.

    Any exception from a single invocation is logged and swallowed so the
    worker loop survives transient failures; a short sleep after every
    round prevents a busy loop.
    """
    @wraps(func)
    def looped(self, *args, **kwargs):
        while True:
            if self._pause.is_set():
                break
            try:
                func(self, *args, **kwargs)
            except Exception as error:
                logger.info(f"{func.__name__}: " + str(error))
            finally:
                time.sleep(0.1)

    return looped
25
-
26
-
27
class Launcher(threading.Thread):
    """Base crawl launcher: wires crawler, pipeline and scheduler threads.

    Storage-specific subclasses implement the scheduling hooks
    (``_reset``/``_scheduler``/``_insert``/``_refresh``/``_delete``) and
    ``_polling``; this base class owns configuration loading, the
    in-process hand-off queues and the custom request/download/parse
    hooks registered through the decorator properties below.
    """

    # Seeds to inject at start-up (populated by user code).
    SEEDS = []

    # In-flight seeds currently leased to workers.
    __DOING__ = {}

    # User overrides registered via the decorator properties below.
    __CUSTOM_FUNC__ = {
        # "download": None,
        # "request": None,
        # "parse": None,
    }

    # Hand-off queues between scheduler, crawler and pipeline threads.
    __LAUNCHER_QUEUE__ = {
        "new": Queue(),
        "todo": Queue(),
        "done": Queue(),
        "upload": Queue()
    }

    # Maintenance loops started (in this order) by _execute().
    __LAUNCHER_FUNC__ = [
        "_reset",
        "_scheduler",
        "_insert",
        "_refresh",
        "_delete",
    ]

    def __init__(self, task, project, custom_setting=None, **kwargs):
        """
        :param task: task name used to namespace this run.
        :param project: project the task belongs to.
        :param custom_setting: a dict, a module object, or an importable
            dotted module path whose attributes override ``cobweb.setting``.
        :param kwargs: individual setting overrides (highest precedence).
        """
        super().__init__()
        self.task = task
        self.project = project

        self._app_time = int(time.time())
        self._stop = threading.Event()  # stop event: set when the run should end
        self._pause = threading.Event()  # pause event: set while workers idle

        _setting = dict()

        if custom_setting:
            if isinstance(custom_setting, dict):
                _setting = custom_setting
            else:
                # Accept a module object or an importable dotted path.
                if isinstance(custom_setting, str):
                    custom_setting = importlib.import_module(custom_setting)
                if not inspect.ismodule(custom_setting):
                    raise Exception
                for k, v in custom_setting.__dict__.items():
                    # Copy plain values only; skip dunders and submodules.
                    if not k.startswith("__") and not inspect.ismodule(v):
                        _setting[k] = v

        _setting.update(**kwargs)

        # The shared ``cobweb.setting`` module is mutated in place so every
        # consumer of it sees these overrides.
        for k, v in _setting.items():
            setattr(setting, k.upper(), v)

        # Crawler and pipeline classes are configured as dotted paths.
        self._Crawler = dynamic_load_class(setting.CRAWLER)
        self._Pipeline = dynamic_load_class(setting.PIPELINE)

        # Wait/sleep intervals for the individual scheduler loops (seconds).
        self._before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS
        self._scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
        self._todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
        self._new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
        self._done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
        self._upload_queue_wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
        self._seed_reset_seconds = setting.SEED_RESET_SECONDS

        # Capacity limits for the hand-off queues.
        self._todo_queue_size = setting.TODO_QUEUE_SIZE
        self._new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
        self._done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
        self._upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE

        # Spider worker tuning.
        self._spider_max_retries = setting.SPIDER_MAX_RETRIES
        self._spider_thread_num = setting.SPIDER_THREAD_NUM
        self._spider_time_sleep = setting.SPIDER_TIME_SLEEP
        self._spider_max_count = setting.SPIDER_MAX_COUNT
        self._time_window = setting.TIME_WINDOW

        self._done_model = setting.DONE_MODEL
        self._task_model = setting.TASK_MODEL

        self._filter_field = setting.FILTER_FIELD

    @property
    def request(self):
        """Decorator registering a custom request function.

        use case:
            from cobweb.base import Request, BaseItem
            @launcher.request
            def request(seed: Seed) -> Union[Request, BaseItem]:
                ...
                yield Request(seed.url, seed)

        NOTE(review): the inner ``decorator`` returns None, so the
        decorated name is rebound to None in user code; only the
        registration side effect appears to be relied upon.
        """
        def decorator(func):
            self.__CUSTOM_FUNC__["request"] = func
        return decorator

    @property
    def download(self):
        """Decorator registering a custom download function.

        use case:
            from cobweb.base import Request, Response, Seed, BaseItem
            @launcher.download
            def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
                ...
                yield Response(item.seed, response)
        """
        def decorator(func):
            self.__CUSTOM_FUNC__["download"] = func
        return decorator

    @property
    def parse(self):
        """Decorator registering a custom parse function; ``xxxItem`` is a
        user-defined storage item type.

        use case:
            from cobweb.base import Request, Response
            @launcher.parse
            def parse(item: Response) -> BaseItem:
                ...
                yield xxxItem(seed, **kwargs)
        """
        def decorator(func):
            self.__CUSTOM_FUNC__["parse"] = func
        return decorator

    def start_seeds(self):
        """Wrap SEEDS into Seed objects and push them onto the todo queue."""
        seeds = [Seed(seed) for seed in self.SEEDS]
        self.__LAUNCHER_QUEUE__['todo'].push(seeds)
        return seeds

    def _remove_doing_seeds(self, seeds):
        # Drop finished seeds from the in-flight table; missing keys ignored.
        for seed in seeds:
            self.__DOING__.pop(seed, None)
        # logger.info("remove %s seeds from __DOING__" % len(seeds))

    def _get_seed(self) -> Seed:
        """Pop the next seed to crawl (None semantics depend on Queue.pop)."""
        return self.__LAUNCHER_QUEUE__["todo"].pop()

    def _set_seed(self, seed, **kwargs):
        # Return a seed to the todo queue (e.g. for retry).
        self.__LAUNCHER_QUEUE__["todo"].push(seed, **kwargs)

    def _upload_data(self, data, **kwargs):
        # Queue parsed data for the pipeline to persist.
        self.__LAUNCHER_QUEUE__["upload"].push(data, **kwargs)

    def _add_seed(self, seed, **kwargs):
        # Queue a newly-discovered seed for scheduling.
        self.__LAUNCHER_QUEUE__["new"].push(seed, **kwargs)

    def _delete_seed(self, seed, **kwargs):
        # Queue a finished seed for removal.
        self.__LAUNCHER_QUEUE__["done"].push(seed, **kwargs)

    def _execute(self):
        # Start one daemon-style loop thread per scheduler hook, staggered
        # by a second to avoid a thundering start.
        for func_name in self.__LAUNCHER_FUNC__:
            threading.Thread(name=func_name, target=getattr(self, func_name)).start()
            time.sleep(1)

    def run(self):
        """Thread entry point: start heartbeat, crawler, pipeline and the
        scheduler loops, then block in ``_polling`` until the run ends."""
        threading.Thread(target=self._execute_heartbeat).start()

        self.start_seeds()

        self._Crawler(
            task=self.task, project=self.project,
            stop=self._stop, pause=self._pause,
            # launcher_queue=self.__LAUNCHER_QUEUE__,
            get_seed=self._get_seed,
            set_seed=self._set_seed,
            add_seed=self._add_seed,
            delete_seed=self._delete_seed,
            upload_data=self._upload_data,
            custom_func=self.__CUSTOM_FUNC__,
            thread_num = self._spider_thread_num,
            max_retries = self._spider_max_retries,
            time_sleep=self._spider_time_sleep
        ).start()

        self._Pipeline(
            stop=self._stop, pause=self._pause,
            upload=self.__LAUNCHER_QUEUE__["upload"],
            done=self.__LAUNCHER_QUEUE__["done"],
            upload_size=self._upload_queue_max_size,
            wait_seconds=self._upload_queue_wait_seconds
        ).start()

        self._execute()
        self._polling()

    # --- hooks below are no-ops here; storage-specific subclasses override ---

    def _execute_heartbeat(self):
        pass

    def _reset(self):
        pass

    def _scheduler(self):
        pass

    def _insert(self):
        pass

    def _refresh(self):
        pass

    def _delete(self):
        pass

    def _polling(self):
        pass
@@ -1,88 +0,0 @@
1
import time

from cobweb.base import logger
from cobweb.constant import LogTemplate
from .launcher import Launcher, check_pause


class LauncherAir(Launcher):
    """In-memory launcher: schedules seeds purely through local queues.

    No external storage is involved — seeds from the ``new`` queue are
    moved straight into ``todo``, and ``done`` seeds are simply dropped
    from the in-flight table.
    """

    # def _scheduler(self):
    #     if self.start_seeds:
    #         self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)

    @check_pause
    def _insert(self):
        """Drain the 'new' queue into 'todo' (loops until pause is set)."""
        seeds = {}
        # Remember whether the queue was below capacity *before* draining;
        # only then do we sleep, so a full queue is drained back-to-back.
        status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
        for _ in range(self._new_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['new'].pop()
            if not seed:
                break
            # Map serialized seed -> priority for a batched todo push.
            seeds[seed.to_string] = seed.params.priority
        if seeds:
            self.__LAUNCHER_QUEUE__['todo'].push(seeds)
        if status:
            time.sleep(self._new_queue_wait_seconds)

    @check_pause
    def _delete(self):
        """Drop finished seeds from the in-flight table (loops until pause)."""
        seeds = []
        # Same below-capacity check as _insert: sleep only when not busy.
        status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size

        for _ in range(self._done_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['done'].pop()
            if not seed:
                break
            seeds.append(seed.to_string)

        if seeds:
            self._remove_doing_seeds(seeds)

        if status:
            time.sleep(self._done_queue_wait_seconds)

    def _polling(self):
        """Supervise the run: pause idle workers, stop when work is done.

        Each cycle counts the non-empty hand-off queues.  Three
        consecutive all-empty observations pause the workers; once paused
        and still empty, the launcher stops (unless TASK_MODEL keeps the
        process alive).  New work while paused restarts the loops.
        (``check_emtpy_times``/``pooling_wait_seconds`` spellings kept
        as-is from the original code.)
        """

        check_emtpy_times = 0  # consecutive all-queues-empty observations

        while not self._stop.is_set():

            queue_not_empty_count = 0
            pooling_wait_seconds = 30  # long nap while work is flowing

            for q in self.__LAUNCHER_QUEUE__.values():
                if q.length != 0:
                    queue_not_empty_count += 1

            if queue_not_empty_count == 0:
                # Idle: poll faster while deciding whether we are done.
                pooling_wait_seconds = 3
                if self._pause.is_set():
                    check_emtpy_times = 0
                    if not self._task_model:
                        # One-shot task and everything drained: shut down.
                        logger.info("Done! Ready to close thread...")
                        self._stop.set()
                elif check_emtpy_times > 2:
                    # Three empty checks in a row: clear the in-flight
                    # table and pause the worker threads.
                    self.__DOING__ = {}
                    self._pause.set()
                else:
                    logger.info(
                        "check whether the task is complete, "
                        f"reset times {3 - check_emtpy_times}"
                    )
                    check_emtpy_times += 1
            elif self._pause.is_set():
                # Work reappeared while paused: restart the helper loops.
                self._pause.clear()
                self._execute()
            else:
                # Busy: log a status snapshot of all queues.
                logger.info(LogTemplate.launcher_air_polling.format(
                    task=self.task,
                    doing_len=len(self.__DOING__.keys()),
                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
                ))

            time.sleep(pooling_wait_seconds)