cobweb-launcher 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cobweb-launcher might be problematic. Click here for more details.

@@ -40,6 +40,7 @@ class Launcher(threading.Thread):
40
40
  self.task = task
41
41
  self.project = project
42
42
 
43
+ self._app_time = int(time.time())
43
44
  self._stop = threading.Event() # 结束事件
44
45
  self._pause = threading.Event() # 暂停事件
45
46
 
@@ -65,6 +66,7 @@ class Launcher(threading.Thread):
65
66
  self._Crawler = dynamic_load_class(setting.CRAWLER)
66
67
  self._Pipeline = dynamic_load_class(setting.PIPELINE)
67
68
 
69
+ self._before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS
68
70
  self._scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
69
71
  self._todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
70
72
  self._new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
@@ -83,7 +85,6 @@ class Launcher(threading.Thread):
83
85
  self._done_model = setting.DONE_MODEL
84
86
  self._task_model = setting.TASK_MODEL
85
87
 
86
- # self._upload_queue = Queue()
87
88
 
88
89
  @property
89
90
  def start_seeds(self):
@@ -125,7 +126,7 @@ class Launcher(threading.Thread):
125
126
  自定义parse函数, xxxItem为自定义的存储数据类型
126
127
  use case:
127
128
  from cobweb.base import Request, Response
128
- @launcher.download
129
+ @launcher.parse
129
130
  def parse(item: Response) -> BaseItem:
130
131
  ...
131
132
  yield xxxItem(seed, **kwargs)
@@ -141,7 +142,7 @@ class Launcher(threading.Thread):
141
142
  def _execute(self):
142
143
  for func_name in self.__LAUNCHER_FUNC__:
143
144
  threading.Thread(name=func_name, target=getattr(self, func_name)).start()
144
- time.sleep(2)
145
+ time.sleep(1)
145
146
 
146
147
  def run(self):
147
148
  threading.Thread(target=self._execute_heartbeat).start()
@@ -137,6 +137,7 @@ class LauncherPro(Launcher):
137
137
  time.sleep(self._done_queue_wait_seconds)
138
138
 
139
139
  def _polling(self):
140
+ wait_scheduler_execute = True
140
141
  check_emtpy_times = 0
141
142
  while not self._stop.is_set():
142
143
  queue_not_empty_count = 0
@@ -145,26 +146,35 @@ class LauncherPro(Launcher):
145
146
  for q in self.__LAUNCHER_QUEUE__.values():
146
147
  if q.length != 0:
147
148
  queue_not_empty_count += 1
149
+ wait_scheduler_execute = False
148
150
 
149
151
  if queue_not_empty_count == 0:
150
152
  pooling_wait_seconds = 3
151
153
  if self._pause.is_set():
152
154
  check_emtpy_times = 0
153
- if not self._task_model:
154
- logger.info("Done! Ready to close thread...")
155
+ if not self._task_model and (
156
+ not wait_scheduler_execute or
157
+ int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
158
+ ):
159
+ logger.info("Done! ready to close thread...")
155
160
  self._stop.set()
156
- elif not self._db.zcount(self._todo_key, _min=0, _max="(1000") and check_emtpy_times > 2:
161
+
162
+ elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
163
+ logger.info(f"Recovery {self.task} task run!")
164
+ self._pause.clear()
165
+ self._execute()
166
+ else:
167
+ logger.info("pause! waiting for resume...")
168
+ elif check_emtpy_times > 2:
157
169
  self.__DOING__ = {}
158
- self._pause.set()
170
+ if not self._db.zcount(self._todo_key, _min="-inf", _max="(1000"):
171
+ self._pause.set()
159
172
  else:
160
173
  logger.info(
161
174
  "check whether the task is complete, "
162
175
  f"reset times {3 - check_emtpy_times}"
163
176
  )
164
177
  check_emtpy_times += 1
165
- elif self._pause.is_set():
166
- self._pause.clear()
167
- self._execute()
168
178
  else:
169
179
  logger.info(LogTemplate.launcher_pro_polling.format(
170
180
  task=self.task,
@@ -178,36 +188,6 @@ class LauncherPro(Launcher):
178
188
  ))
179
189
 
180
190
  time.sleep(pooling_wait_seconds)
181
- # if self._pause.is_set():
182
- # self._pause.clear()
183
- # self._execute()
184
- #
185
- # elif queue_not_empty_count == 0:
186
- # pooling_wait_seconds = 5
187
- # check_emtpy_times += 1
188
- # else:
189
- # check_emtpy_times = 0
190
- #
191
- # if not self._db.zcount(self._todo, _min=0, _max="(1000") and check_emtpy_times > 2:
192
- # check_emtpy_times = 0
193
- # self.__DOING__ = {}
194
- # self._pause.set()
195
- #
196
- # time.sleep(pooling_wait_seconds)
197
- #
198
- # if not self._pause.is_set():
199
- # logger.info(LogTemplate.launcher_pro_polling.format(
200
- # task=self.task,
201
- # doing_len=len(self.__DOING__.keys()),
202
- # todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
203
- # done_len=self.__LAUNCHER_QUEUE__['done'].length,
204
- # redis_seed_count=self._db.zcount(self._todo, "-inf", "+inf"),
205
- # redis_todo_len=self._db.zcount(self._todo, 0, "(1000"),
206
- # redis_doing_len=self._db.zcount(self._todo, "-inf", "(0"),
207
- # upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
208
- # ))
209
- # elif not self._task_model:
210
- # self._stop.set()
211
191
 
212
192
  logger.info("Done! Ready to close thread...")
213
193
 
@@ -1,5 +1,3 @@
1
- import json
2
-
3
1
  from cobweb.base import ConsoleItem, logger
4
2
  from cobweb.constant import LogTemplate
5
3
  from cobweb.pipelines import Pipeline
cobweb/setting.py CHANGED
@@ -26,6 +26,9 @@ OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
26
26
  OSS_CHUNK_SIZE = 10 * 1024 ** 2
27
27
  OSS_MIN_UPLOAD_SIZE = 1024
28
28
 
29
+ # message
30
+ MESSAGE = ""
31
+
29
32
 
30
33
  # 采集器选择
31
34
  CRAWLER = "cobweb.crawlers.Crawler"
@@ -35,6 +38,8 @@ PIPELINE = "cobweb.pipelines.pipeline_console.Console"
35
38
 
36
39
 
37
40
  # Launcher 等待时间
41
+
42
+ BEFORE_SCHEDULER_WAIT_SECONDS = 60 # 调度前等待时间,只作用于单次任务
38
43
  SCHEDULER_WAIT_SECONDS = 15 # 调度等待时间
39
44
  TODO_QUEUE_FULL_WAIT_SECONDS = 5 # todo队列已满时等待时间
40
45
  NEW_QUEUE_WAIT_SECONDS = 30 # new队列等待时间
@@ -0,0 +1,204 @@
1
+ Metadata-Version: 2.1
2
+ Name: cobweb-launcher
3
+ Version: 1.2.2
4
+ Summary: spider_hole
5
+ Home-page: https://github.com/Juannie-PP/cobweb
6
+ Author: Juannie-PP
7
+ Author-email: 2604868278@qq.com
8
+ License: MIT
9
+ Keywords: cobweb-launcher, cobweb
10
+ Platform: UNKNOWN
11
+ Classifier: Programming Language :: Python :: 3
12
+ Requires-Python: >=3.7
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: requests (>=2.19.1)
16
+ Requires-Dist: oss2 (>=2.18.1)
17
+ Requires-Dist: redis (>=4.4.4)
18
+ Requires-Dist: aliyun-log-python-sdk
19
+
20
+ # cobweb
21
+ cobweb是一个基于python的分布式爬虫调度框架,目前支持分布式爬虫,单机爬虫,支持自定义数据库,支持自定义数据存储,支持自定义数据处理等操作。
22
+
23
+ cobweb主要由3个模块和一个配置文件组成:Launcher启动器、Crawler采集器、Pipeline存储和setting配置文件。
24
+ 1. Launcher启动器:用于启动爬虫任务,控制爬虫任务的执行流程,以及数据存储和数据处理。
25
+ 框架提供两种启动器模式:LauncherAir、LauncherPro,分别对应单机爬虫模式和分布式调度模式。
26
+ 2. Crawler采集器:用于控制采集流程、数据下载和数据处理。
27
+ 框架提供了基础的采集器,用于控制采集流程、数据下载和数据处理,用户也可在创建任务时自定义请求、下载和解析方法,具体看使用方法介绍。
28
+ 3. Pipeline存储:用于存储采集到的数据,支持自定义数据存储和数据处理。框架提供了Console和Loghub两种存储方式,用户也可继承Pipeline抽象类自定义存储方式。
29
+ 4. setting配置文件:用于配置采集器、存储器、队列长度、采集线程数等参数,框架提供了默认配置,用户也可自定义配置。
30
+ ## 安装
31
+ ```
32
+ pip3 install --upgrade cobweb-launcher
33
+ ```
34
+ ## 使用方法介绍
35
+ ### 1. 任务创建
36
+ - LauncherAir任务创建
37
+ ```python
38
+ from cobweb import LauncherAir
39
+
40
+ # 创建启动器
41
+ app = LauncherAir(task="test", project="test")
42
+
43
+ # 设置采集种子
44
+ app.SEEDS = [{
45
+ "url": "https://www.baidu.com"
46
+ }]
47
+ ...
48
+ # 启动任务
49
+ app.start()
50
+ ```
51
+ - LauncherPro任务创建
52
+ LauncherPro依赖redis实现分布式调度,使用LauncherPro启动器需要完成环境变量的配置或自定义setting文件中的redis配置,如何配置查看`2. 自定义配置文件参数`
53
+ ```python
54
+ from cobweb import LauncherPro
55
+
56
+ # 创建启动器
57
+ app = LauncherPro(
58
+ task="test",
59
+ project="test"
60
+ )
61
+ ...
62
+ # 启动任务
63
+ app.start()
64
+ ```
65
+ ### 2. 自定义配置文件参数
66
+ - 通过自定义setting文件,配置文件导入字符串方式
67
+ > 默认配置文件:import cobweb.setting
68
+ > 不推荐!!!目前有bug,随缘使用...
69
+ 例如:同级目录下自定义创建了setting.py文件。
70
+ ```python
71
+ from cobweb import LauncherAir
72
+
73
+ app = LauncherAir(
74
+ task="test",
75
+ project="test",
76
+ setting="import setting"
77
+ )
78
+
79
+ ...
80
+
81
+ app.start()
82
+ ```
83
+ - 自定义修改setting中对象值
84
+ ```python
85
+ from cobweb import LauncherPro
86
+
87
+ # 创建启动器
88
+ app = LauncherPro(
89
+ task="test",
90
+ project="test",
91
+ REDIS_CONFIG = {
92
+ "host": ...,
93
+ "password":...,
94
+ "port": ...,
95
+ "db": ...
96
+ }
97
+ )
98
+ ...
99
+ # 启动任务
100
+ app.start()
101
+ ```
102
+ ### 3. 自定义请求
103
+ `@app.request`使用装饰器封装自定义请求方法,作用于发生请求前的操作,返回Request对象或继承于BaseItem对象,用于控制请求参数。
104
+ ```python
105
+ from typing import Union
106
+ from cobweb import LauncherAir
107
+ from cobweb.base import Seed, Request, BaseItem
108
+
109
+ app = LauncherAir(
110
+ task="test",
111
+ project="test"
112
+ )
113
+
114
+ ...
115
+
116
+ @app.request
117
+ def request(seed: Seed) -> Union[Request, BaseItem]:
118
+ # 可自定义headers,代理,构造请求参数等操作
119
+ proxies = {"http": ..., "https": ...}
120
+ yield Request(seed.url, seed, ..., proxies=proxies, timeout=15)
121
+ # yield xxxItem(seed, ...) # 跳过请求和解析直接进入数据存储流程
122
+
123
+ ...
124
+
125
+ app.start()
126
+ ```
127
+ > 默认请求方法
128
+ > def request(seed: Seed) -> Union[Request, BaseItem]:
129
+ >     yield Request(seed.url, seed, timeout=5)
130
+ ### 4. 自定义下载
131
+ `@app.download`使用装饰器封装自定义下载方法,作用于发生请求时的操作,返回Response对象或继承于BaseItem对象,用于控制下载过程。
132
+ ```python
133
+ from typing import Union
134
+ from cobweb import LauncherAir
135
+ from cobweb.base import Request, Response, BaseItem
136
+
137
+ app = LauncherAir(
138
+ task="test",
139
+ project="test"
140
+ )
141
+
142
+ ...
143
+
144
+ @app.download
145
+ def download(item: Request) -> Union[BaseItem, Response]:
146
+ ...
147
+ response = ...
148
+ ...
149
+ yield Response(item.seed, response, ...) # 返回Response对象,进行解析
150
+ # yield xxxItem(seed, ...) # 跳过请求和解析直接进入数据存储流程
151
+
152
+ ...
153
+
154
+ app.start()
155
+ ```
156
+ > 默认下载方法
157
+ > def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
158
+ >     response = item.download()
159
+ >     yield Response(item.seed, response, **item.to_dict)
160
+ ### 5. 自定义解析
161
+ 自定义解析需要由一个存储数据类和解析方法组成。存储数据类继承于BaseItem的对象,规定存储表名及字段,
162
+ 解析方法返回继承于BaseItem的对象,yield返回进行控制数据存储流程。
163
+ ```python
164
+ from typing import Union
165
+ from cobweb import LauncherAir
166
+ from cobweb.base import Seed, Response, BaseItem
167
+
168
+ class TestItem(BaseItem):
169
+ __TABLE__ = "test_data" # 表名
170
+ __FIELDS__ = "field1, field2, field3" # 字段名
171
+
172
+ app = LauncherAir(
173
+ task="test",
174
+ project="test"
175
+ )
176
+
177
+ ...
178
+
179
+ @app.parse
180
+ def parse(item: Response) -> Union[Seed, BaseItem]:
181
+ ...
182
+ yield TestItem(item.seed, field1=..., field2=..., field3=...)
183
+ # yield Seed(...) # 构造新种子推送至消费队列
184
+
185
+ ...
186
+
187
+ app.start()
188
+ ```
189
+ > 默认解析方法
190
+ > def parse(item: Response) -> Union[Seed, BaseItem]:
191
+ >     upload_item = item.to_dict
192
+ >     upload_item["text"] = item.response.text
193
+ >     yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
194
+ ## need deal
195
+ - 队列优化完善,使用queue的机制wait()同步各模块执行?
196
+ - 日志功能完善,单机模式调度和保存数据写入文件,结构化输出各任务日志
197
+ - 去重过滤(布隆过滤器等)
198
+ - 单机防丢失
199
+ - excel、mysql、redis数据完善
200
+
201
+ > 未更新流程图!!!
202
+ ![img.png](https://image-luyuan.oss-cn-hangzhou.aliyuncs.com/image/D2388CDC-B9E5-4CE4-9F2C-7D173763B6A8.png)
203
+
204
+
@@ -1,6 +1,6 @@
1
1
  cobweb/__init__.py,sha256=uMHyf4Fekbyw2xBCbkA8R0LwCpBJf5p_7pWbh60ZWYk,83
2
2
  cobweb/constant.py,sha256=zy3XYsc1qp2B76_Fn_hVQ8eGHlPBd3OFlZK2cryE6FY,2839
3
- cobweb/setting.py,sha256=zOO1cA_zQd4Q0CzY_tdSfdo-10L4QIVpm4382wbP5BQ,1906
3
+ cobweb/setting.py,sha256=_t3LMSpxUNR4dVD9Tox22W9omHPvjeWABFzpgkEOoH0,2016
4
4
  cobweb/base/__init__.py,sha256=4gwWWQ0Q8cYG9cD7Lwf4XMqRGc5M_mapS3IczR6zeCE,222
5
5
  cobweb/base/common_queue.py,sha256=W7PPZZFl52j3Mc916T0imHj7oAUelA6aKJwW-FecDPE,872
6
6
  cobweb/base/decorators.py,sha256=wDCaQ94aAZGxks9Ljc0aXq6omDXT1_yzFy83ZW6VbVI,930
@@ -18,20 +18,20 @@ cobweb/db/redis_db.py,sha256=NNI2QkRV1hEZI-z-COEncXt88z3pZN6wusKlcQzc8V4,4304
18
18
  cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
19
19
  cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
20
20
  cobweb/launchers/__init__.py,sha256=af0Y6wrGX8SQZ7w7XL2sOtREjCT3dwad-uCc3nIontY,76
21
- cobweb/launchers/launcher.py,sha256=zef9gQ0P_4lD3mButsbIXPr15pqZDyr9wlELCD3bsZs,5382
21
+ cobweb/launchers/launcher.py,sha256=Mepg-hv9YL5_VnY_Mwqxzd8ZRGyK8gxsl_5B_Ibz0Uc,5466
22
22
  cobweb/launchers/launcher_air.py,sha256=zHVEJqQCxYU1WDnqQzzEHbEXasR1GmKevujQkCfFt5o,2947
23
- cobweb/launchers/launcher_pro.py,sha256=IrrevNxmH39GySwE36RGBhtXatEGT3fffDFoJuzJbaM,8584
23
+ cobweb/launchers/launcher_pro.py,sha256=ihsEcdP8iOrJhHzQTYQow-8pIYK3CCn-iwwI3v9UzMw,7727
24
24
  cobweb/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
25
25
  cobweb/pipelines/base_pipeline.py,sha256=fYnWf79GmhufXpcnMa3te18SbmnVeYLwxfyo-zLd9CY,1577
26
26
  cobweb/pipelines/loghub_pipeline.py,sha256=cjPO6w6UJ0jNw2fVvdX0BCdlm58T7dmYXlxzXOBpvfY,1027
27
27
  cobweb/pipelines/pipeline.py,sha256=29O7CzASDv52NJl_1KWlKYxNd79HiaQAm0oO5K9PQzw,1650
28
- cobweb/pipelines/pipeline_console.py,sha256=2ur-5nZvCSFBYFMIdePcK2mHp1ktJTBe7vRyH-kaF94,767
28
+ cobweb/pipelines/pipeline_console.py,sha256=NEh-4zhuVAQOqwXLsqeb-rcNZ9_KXFUpL3otUTL5qBs,754
29
29
  cobweb/pipelines/pipeline_loghub.py,sha256=xZ6D55BGdiM71WUv83jyLGbEyUwhBHLJRZoXthBxxTs,1019
30
30
  cobweb/utils/__init__.py,sha256=JTE4sBfHnKHhD6w9Auk0MIT7O9BMOamCeryhlHNx3Zg,47
31
31
  cobweb/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
32
32
  cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
33
- cobweb_launcher-1.2.0.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
34
- cobweb_launcher-1.2.0.dist-info/METADATA,sha256=wl8ODS1cutZUvE4HmsvsH_42atwhdElRcD5d6rsgSBk,1245
35
- cobweb_launcher-1.2.0.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
36
- cobweb_launcher-1.2.0.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
37
- cobweb_launcher-1.2.0.dist-info/RECORD,,
33
+ cobweb_launcher-1.2.2.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
34
+ cobweb_launcher-1.2.2.dist-info/METADATA,sha256=nMfaL4ItGVnr9tHtxNnBEGud1HI_ZRbIGqmfL4TnV7o,6489
35
+ cobweb_launcher-1.2.2.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
36
+ cobweb_launcher-1.2.2.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
37
+ cobweb_launcher-1.2.2.dist-info/RECORD,,
@@ -1,48 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: cobweb-launcher
3
- Version: 1.2.0
4
- Summary: spider_hole
5
- Home-page: https://github.com/Juannie-PP/cobweb
6
- Author: Juannie-PP
7
- Author-email: 2604868278@qq.com
8
- License: MIT
9
- Keywords: cobweb-launcher, cobweb
10
- Platform: UNKNOWN
11
- Classifier: Programming Language :: Python :: 3
12
- Requires-Python: >=3.7
13
- Description-Content-Type: text/markdown
14
- License-File: LICENSE
15
- Requires-Dist: requests (>=2.19.1)
16
- Requires-Dist: oss2 (>=2.18.1)
17
- Requires-Dist: redis (>=4.4.4)
18
- Requires-Dist: aliyun-log-python-sdk
19
-
20
- # cobweb
21
-
22
- > 通用爬虫框架: 1.单机模式采集框架;2.分布式采集框架
23
- >
24
- > 5部分
25
- >
26
- > 1. starter -- 启动器
27
- >
28
- > 2. scheduler -- 调度器
29
- >
30
- > 3. distributor -- 分发器
31
- >
32
- > 4. storer -- 存储器
33
- >
34
- > 5. utils -- 工具函数
35
- >
36
-
37
- need deal
38
- - 队列优化完善,使用queue的机制wait()同步各模块执行?
39
- - 日志功能完善,单机模式调度和保存数据写入文件,结构化输出各任务日志
40
- - 去重过滤(布隆过滤器等)
41
- - 防丢失(单机模式可以通过日志文件进行检查种子)
42
- - 自定义数据库的功能
43
- - excel、mysql、redis数据完善
44
-
45
-
46
- ![img.png](https://image-luyuan.oss-cn-hangzhou.aliyuncs.com/image/D2388CDC-B9E5-4CE4-9F2C-7D173763B6A8.png)
47
-
48
-