cobweb-launcher 1.1.23__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cobweb-launcher might be problematic. Click here for more details.

Files changed (45)
  1. cobweb-launcher-1.2.1/PKG-INFO +200 -0
  2. cobweb-launcher-1.2.1/README.md +183 -0
  3. cobweb-launcher-1.2.1/cobweb/__init__.py +2 -0
  4. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/base/__init__.py +1 -1
  5. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/base/item.py +7 -0
  6. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/constant.py +23 -1
  7. cobweb-launcher-1.2.1/cobweb/crawlers/__init__.py +1 -0
  8. cobweb-launcher-1.1.23/cobweb/crawlers/base_crawler.py → cobweb-launcher-1.2.1/cobweb/crawlers/crawler.py +45 -29
  9. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/launchers/__init__.py +1 -1
  10. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/launchers/launcher.py +39 -53
  11. cobweb-launcher-1.2.1/cobweb/launchers/launcher_air.py +88 -0
  12. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/launchers/launcher_pro.py +83 -36
  13. cobweb-launcher-1.2.1/cobweb/pipelines/__init__.py +3 -0
  14. cobweb-launcher-1.1.23/cobweb/pipelines/base_pipeline.py → cobweb-launcher-1.2.1/cobweb/pipelines/pipeline.py +20 -14
  15. cobweb-launcher-1.2.1/cobweb/pipelines/pipeline_console.py +22 -0
  16. cobweb-launcher-1.1.23/cobweb/pipelines/loghub_pipeline.py → cobweb-launcher-1.2.1/cobweb/pipelines/pipeline_loghub.py +1 -1
  17. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/setting.py +6 -6
  18. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/utils/tools.py +2 -2
  19. cobweb-launcher-1.2.1/cobweb_launcher.egg-info/PKG-INFO +200 -0
  20. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb_launcher.egg-info/SOURCES.txt +5 -4
  21. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/setup.py +1 -1
  22. cobweb-launcher-1.1.23/PKG-INFO +0 -44
  23. cobweb-launcher-1.1.23/README.md +0 -27
  24. cobweb-launcher-1.1.23/cobweb/__init__.py +0 -2
  25. cobweb-launcher-1.1.23/cobweb/crawlers/__init__.py +0 -2
  26. cobweb-launcher-1.1.23/cobweb/crawlers/file_crawler.py +0 -98
  27. cobweb-launcher-1.1.23/cobweb/pipelines/__init__.py +0 -2
  28. cobweb-launcher-1.1.23/cobweb_launcher.egg-info/PKG-INFO +0 -44
  29. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/LICENSE +0 -0
  30. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/base/common_queue.py +0 -0
  31. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/base/decorators.py +0 -0
  32. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/base/log.py +0 -0
  33. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/base/request.py +0 -0
  34. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/base/response.py +0 -0
  35. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/base/seed.py +0 -0
  36. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/db/__init__.py +0 -0
  37. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/db/redis_db.py +0 -0
  38. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/exceptions/__init__.py +0 -0
  39. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/exceptions/oss_db_exception.py +0 -0
  40. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/utils/__init__.py +0 -0
  41. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb/utils/oss.py +0 -0
  42. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  43. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb_launcher.egg-info/requires.txt +0 -0
  44. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/cobweb_launcher.egg-info/top_level.txt +0 -0
  45. {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.1}/setup.cfg +0 -0
@@ -0,0 +1,200 @@
1
+ Metadata-Version: 2.1
2
+ Name: cobweb-launcher
3
+ Version: 1.2.1
4
+ Summary: spider_hole
5
+ Home-page: https://github.com/Juannie-PP/cobweb
6
+ Author: Juannie-PP
7
+ Author-email: 2604868278@qq.com
8
+ License: MIT
9
+ Keywords: cobweb-launcher, cobweb
10
+ Platform: UNKNOWN
11
+ Classifier: Programming Language :: Python :: 3
12
+ Requires-Python: >=3.7
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+
16
+ # cobweb
17
+ cobweb是一个基于python的分布式爬虫调度框架,目前支持分布式爬虫,单机爬虫,支持自定义数据库,支持自定义数据存储,支持自定义数据处理等操作。
18
+
19
+ cobweb主要由3个模块和一个配置文件组成:Launcher启动器、Crawler采集器、Pipeline存储和setting配置文件。
20
+ 1. Launcher启动器:用于启动爬虫任务,控制爬虫任务的执行流程,以及数据存储和数据处理。
21
+ 框架提供两种启动器模式:LauncherAir、LauncherPro,分别对应单机爬虫模式和分布式调度模式。
22
+ 2. Crawler采集器:用于控制采集流程、数据下载和数据处理。
23
+ 框架提供了基础的采集器,用于控制采集流程、数据下载和数据处理,用户也可在创建任务时自定义请求、下载和解析方法,具体看使用方法介绍。
24
+ 3. Pipeline存储:用于存储采集到的数据,支持自定义数据存储和数据处理。框架提供了Console和Loghub两种存储方式,用户也可继承Pipeline抽象类自定义存储方式。
25
+ 4. setting配置文件:用于配置采集器、存储器、队列长度、采集线程数等参数,框架提供了默认配置,用户也可自定义配置。
26
+ ## 安装
27
+ ```
28
+ pip3 install --upgrade cobweb-launcher
29
+ ```
30
+ ## 使用方法介绍
31
+ ### 1. 任务创建
32
+ - LauncherAir任务创建
33
+ ```python
34
+ from cobweb import LauncherAir
35
+
36
+ # 创建启动器
37
+ app = LauncherAir(task="test", project="test")
38
+
39
+ # 设置采集种子
40
+ app.SEEDS = [{
41
+ "url": "https://www.baidu.com"
42
+ }]
43
+ ...
44
+ # 启动任务
45
+ app.start()
46
+ ```
47
+ - LauncherPro任务创建
48
+ LauncherPro依赖redis实现分布式调度,使用LauncherPro启动器需要完成环境变量的配置或自定义setting文件中的redis配置,如何配置查看`2. 自定义配置文件参数`
49
+ ```python
50
+ from cobweb import LauncherPro
51
+
52
+ # 创建启动器
53
+ app = LauncherPro(
54
+ task="test",
55
+ project="test"
56
+ )
57
+ ...
58
+ # 启动任务
59
+ app.start()
60
+ ```
61
+ ### 2. 自定义配置文件参数
62
+ - 通过自定义setting文件,配置文件导入字符串方式
63
+ > 默认配置文件:import cobweb.setting
64
+ > 不推荐!!!目前有bug,随缘使用...
65
+ 例如:同级目录下自定义创建了setting.py文件。
66
+ ```python
67
+ from cobweb import LauncherAir
68
+
69
+ app = LauncherAir(
70
+ task="test",
71
+ project="test",
72
+ setting="import setting"
73
+ )
74
+
75
+ ...
76
+
77
+ app.start()
78
+ ```
79
+ - 自定义修改setting中对象值
80
+ ```python
81
+ from cobweb import LauncherPro
82
+
83
+ # 创建启动器
84
+ app = LauncherPro(
85
+ task="test",
86
+ project="test",
87
+ REDIS_CONFIG = {
88
+ "host": ...,
89
+ "password":...,
90
+ "port": ...,
91
+ "db": ...
92
+ }
93
+ )
94
+ ...
95
+ # 启动任务
96
+ app.start()
97
+ ```
98
+ ### 3. 自定义请求
99
+ `@app.request`使用装饰器封装自定义请求方法,作用于发生请求前的操作,返回Request对象或继承于BaseItem对象,用于控制请求参数。
100
+ ```python
101
+ from typing import Union
102
+ from cobweb import LauncherAir
103
+ from cobweb.base import Seed, Request, BaseItem
104
+
105
+ app = LauncherAir(
106
+ task="test",
107
+ project="test"
108
+ )
109
+
110
+ ...
111
+
112
+ @app.request
113
+ def request(seed: Seed) -> Union[Request, BaseItem]:
114
+ # 可自定义headers,代理,构造请求参数等操作
115
+ proxies = {"http": ..., "https": ...}
116
+ yield Request(seed.url, seed, ..., proxies=proxies, timeout=15)
117
+ # yield xxxItem(seed, ...) # 跳过请求和解析直接进入数据存储流程
118
+
119
+ ...
120
+
121
+ app.start()
122
+ ```
123
+ > 默认请求方法
124
+ > def request(seed: Seed) -> Union[Request, BaseItem]:
125
+ >     yield Request(seed.url, seed, timeout=5)
126
+ ### 4. 自定义下载
127
+ `@app.download`使用装饰器封装自定义下载方法,作用于发生请求时的操作,返回Response对象或继承于BaseItem的对象,用于控制下载流程。
128
+ ```python
129
+ from typing import Union
130
+ from cobweb import LauncherAir
131
+ from cobweb.base import Request, Response, BaseItem
132
+
133
+ app = LauncherAir(
134
+ task="test",
135
+ project="test"
136
+ )
137
+
138
+ ...
139
+
140
+ @app.download
141
+ def download(item: Request) -> Union[BaseItem, Response]:
142
+ ...
143
+ response = ...
144
+ ...
145
+ yield Response(item.seed, response, ...) # 返回Response对象,进行解析
146
+ # yield xxxItem(item.seed, ...) # 跳过解析,直接进入数据存储流程
147
+
148
+ ...
149
+
150
+ app.start()
151
+ ```
152
+ > 默认下载方法
153
+ > def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
154
+ >     response = item.download()
155
+ >     yield Response(item.seed, response, **item.to_dict)
156
+ ### 5. 自定义解析
157
+ 自定义解析需要由一个存储数据类和解析方法组成。存储数据类继承于BaseItem的对象,规定存储表名及字段,
158
+ 解析方法返回继承于BaseItem的对象,yield返回进行控制数据存储流程。
159
+ ```python
160
+ from typing import Union
161
+ from cobweb import LauncherAir
162
+ from cobweb.base import Seed, Response, BaseItem
163
+
164
+ class TestItem(BaseItem):
165
+ __TABLE__ = "test_data" # 表名
166
+ __FIELDS__ = "field1, field2, field3" # 字段名
167
+
168
+ app = LauncherAir(
169
+ task="test",
170
+ project="test"
171
+ )
172
+
173
+ ...
174
+
175
+ @app.parse
176
+ def parse(item: Response) -> Union[Seed, BaseItem]:
177
+ ...
178
+ yield TestItem(item.seed, field1=..., field2=..., field3=...)
179
+ # yield Seed(...) # 构造新种子推送至消费队列
180
+
181
+ ...
182
+
183
+ app.start()
184
+ ```
185
+ > 默认解析方法
186
+ > def parse(item: Response) -> Union[Seed, BaseItem]:
187
+ >     upload_item = item.to_dict
188
+ >     upload_item["text"] = item.response.text
189
+ >     yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
190
+ ## need deal
191
+ - 队列优化完善,使用queue的机制wait()同步各模块执行?
192
+ - 日志功能完善,单机模式调度和保存数据写入文件,结构化输出各任务日志
193
+ - 去重过滤(布隆过滤器等)
194
+ - 单机防丢失
195
+ - excel、mysql、redis数据完善
196
+
197
+ > 未更新流程图!!!
198
+ ![img.png](https://image-luyuan.oss-cn-hangzhou.aliyuncs.com/image/D2388CDC-B9E5-4CE4-9F2C-7D173763B6A8.png)
199
+
200
+
@@ -0,0 +1,183 @@
1
+ # cobweb
2
+ cobweb是一个基于python的分布式爬虫调度框架,目前支持分布式爬虫,单机爬虫,支持自定义数据库,支持自定义数据存储,支持自定义数据处理等操作。
3
+
4
+ cobweb主要由3个模块和一个配置文件组成:Launcher启动器、Crawler采集器、Pipeline存储和setting配置文件。
5
+ 1. Launcher启动器:用于启动爬虫任务,控制爬虫任务的执行流程,以及数据存储和数据处理。
6
+ 框架提供两种启动器模式:LauncherAir、LauncherPro,分别对应单机爬虫模式和分布式调度模式。
7
+ 2. Crawler采集器:用于控制采集流程、数据下载和数据处理。
8
+ 框架提供了基础的采集器,用于控制采集流程、数据下载和数据处理,用户也可在创建任务时自定义请求、下载和解析方法,具体看使用方法介绍。
9
+ 3. Pipeline存储:用于存储采集到的数据,支持自定义数据存储和数据处理。框架提供了Console和Loghub两种存储方式,用户也可继承Pipeline抽象类自定义存储方式。
10
+ 4. setting配置文件:用于配置采集器、存储器、队列长度、采集线程数等参数,框架提供了默认配置,用户也可自定义配置。
11
+ ## 安装
12
+ ```
13
+ pip3 install --upgrade cobweb-launcher
14
+ ```
15
+ ## 使用方法介绍
16
+ ### 1. 任务创建
17
+ - LauncherAir任务创建
18
+ ```python
19
+ from cobweb import LauncherAir
20
+
21
+ # 创建启动器
22
+ app = LauncherAir(task="test", project="test")
23
+
24
+ # 设置采集种子
25
+ app.SEEDS = [{
26
+ "url": "https://www.baidu.com"
27
+ }]
28
+ ...
29
+ # 启动任务
30
+ app.start()
31
+ ```
32
+ - LauncherPro任务创建
33
+ LauncherPro依赖redis实现分布式调度,使用LauncherPro启动器需要完成环境变量的配置或自定义setting文件中的redis配置,如何配置查看`2. 自定义配置文件参数`
34
+ ```python
35
+ from cobweb import LauncherPro
36
+
37
+ # 创建启动器
38
+ app = LauncherPro(
39
+ task="test",
40
+ project="test"
41
+ )
42
+ ...
43
+ # 启动任务
44
+ app.start()
45
+ ```
46
+ ### 2. 自定义配置文件参数
47
+ - 通过自定义setting文件,配置文件导入字符串方式
48
+ > 默认配置文件:import cobweb.setting
49
+ > 不推荐!!!目前有bug,随缘使用...
50
+ 例如:同级目录下自定义创建了setting.py文件。
51
+ ```python
52
+ from cobweb import LauncherAir
53
+
54
+ app = LauncherAir(
55
+ task="test",
56
+ project="test",
57
+ setting="import setting"
58
+ )
59
+
60
+ ...
61
+
62
+ app.start()
63
+ ```
64
+ - 自定义修改setting中对象值
65
+ ```python
66
+ from cobweb import LauncherPro
67
+
68
+ # 创建启动器
69
+ app = LauncherPro(
70
+ task="test",
71
+ project="test",
72
+ REDIS_CONFIG = {
73
+ "host": ...,
74
+ "password":...,
75
+ "port": ...,
76
+ "db": ...
77
+ }
78
+ )
79
+ ...
80
+ # 启动任务
81
+ app.start()
82
+ ```
83
+ ### 3. 自定义请求
84
+ `@app.request`使用装饰器封装自定义请求方法,作用于发生请求前的操作,返回Request对象或继承于BaseItem对象,用于控制请求参数。
85
+ ```python
86
+ from typing import Union
87
+ from cobweb import LauncherAir
88
+ from cobweb.base import Seed, Request, BaseItem
89
+
90
+ app = LauncherAir(
91
+ task="test",
92
+ project="test"
93
+ )
94
+
95
+ ...
96
+
97
+ @app.request
98
+ def request(seed: Seed) -> Union[Request, BaseItem]:
99
+ # 可自定义headers,代理,构造请求参数等操作
100
+ proxies = {"http": ..., "https": ...}
101
+ yield Request(seed.url, seed, ..., proxies=proxies, timeout=15)
102
+ # yield xxxItem(seed, ...) # 跳过请求和解析直接进入数据存储流程
103
+
104
+ ...
105
+
106
+ app.start()
107
+ ```
108
+ > 默认请求方法
109
+ > def request(seed: Seed) -> Union[Request, BaseItem]:
110
+ >     yield Request(seed.url, seed, timeout=5)
111
+ ### 4. 自定义下载
112
+ `@app.download`使用装饰器封装自定义下载方法,作用于发生请求时的操作,返回Response对象或继承于BaseItem的对象,用于控制下载流程。
113
+ ```python
114
+ from typing import Union
115
+ from cobweb import LauncherAir
116
+ from cobweb.base import Request, Response, BaseItem
117
+
118
+ app = LauncherAir(
119
+ task="test",
120
+ project="test"
121
+ )
122
+
123
+ ...
124
+
125
+ @app.download
126
+ def download(item: Request) -> Union[BaseItem, Response]:
127
+ ...
128
+ response = ...
129
+ ...
130
+ yield Response(item.seed, response, ...) # 返回Response对象,进行解析
131
+ # yield xxxItem(item.seed, ...) # 跳过解析,直接进入数据存储流程
132
+
133
+ ...
134
+
135
+ app.start()
136
+ ```
137
+ > 默认下载方法
138
+ > def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
139
+ >     response = item.download()
140
+ >     yield Response(item.seed, response, **item.to_dict)
141
+ ### 5. 自定义解析
142
+ 自定义解析需要由一个存储数据类和解析方法组成。存储数据类继承于BaseItem的对象,规定存储表名及字段,
143
+ 解析方法返回继承于BaseItem的对象,yield返回进行控制数据存储流程。
144
+ ```python
145
+ from typing import Union
146
+ from cobweb import LauncherAir
147
+ from cobweb.base import Seed, Response, BaseItem
148
+
149
+ class TestItem(BaseItem):
150
+ __TABLE__ = "test_data" # 表名
151
+ __FIELDS__ = "field1, field2, field3" # 字段名
152
+
153
+ app = LauncherAir(
154
+ task="test",
155
+ project="test"
156
+ )
157
+
158
+ ...
159
+
160
+ @app.parse
161
+ def parse(item: Response) -> Union[Seed, BaseItem]:
162
+ ...
163
+ yield TestItem(item.seed, field1=..., field2=..., field3=...)
164
+ # yield Seed(...) # 构造新种子推送至消费队列
165
+
166
+ ...
167
+
168
+ app.start()
169
+ ```
170
+ > 默认解析方法
171
+ > def parse(item: Response) -> Union[Seed, BaseItem]:
172
+ >     upload_item = item.to_dict
173
+ >     upload_item["text"] = item.response.text
174
+ >     yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
175
+ ## need deal
176
+ - 队列优化完善,使用queue的机制wait()同步各模块执行?
177
+ - 日志功能完善,单机模式调度和保存数据写入文件,结构化输出各任务日志
178
+ - 去重过滤(布隆过滤器等)
179
+ - 单机防丢失
180
+ - excel、mysql、redis数据完善
181
+
182
+ > 未更新流程图!!!
183
+ ![img.png](https://image-luyuan.oss-cn-hangzhou.aliyuncs.com/image/D2388CDC-B9E5-4CE4-9F2C-7D173763B6A8.png)
@@ -0,0 +1,2 @@
1
+ from .launchers import LauncherAir, LauncherPro
2
+ from .constant import CrawlerModel
@@ -1,7 +1,7 @@
1
1
  from .common_queue import Queue
2
2
  from .response import Response
3
3
  from .request import Request
4
- from .item import BaseItem
4
+ from .item import BaseItem, ConsoleItem
5
5
  from .seed import Seed
6
6
 
7
7
  from .log import logger
@@ -37,3 +37,10 @@ class BaseItem(metaclass=Item):
37
37
  @property
38
38
  def table(self):
39
39
  return self.Data.__name__
40
+
41
+
42
+ class ConsoleItem(BaseItem):
43
+
44
+ __TABLE__ = "console"
45
+ __FIELDS__ = "data"
46
+
@@ -30,6 +30,24 @@ class DealModel:
30
30
 
31
31
  class LogTemplate:
32
32
 
33
+ console_item = """
34
+ ----------------------- start - console pipeline -----------------
35
+ 种子详情 \n{seed_detail}
36
+ 解析详情 \n{parse_detail}
37
+ ----------------------- end - console pipeline ------------------
38
+ """
39
+
40
+ launcher_air_polling = """
41
+ ----------------------- start - 轮训日志: {task} -----------------
42
+ 内存队列
43
+ 种子数: {doing_len}
44
+ 待消费: {todo_len}
45
+ 已消费: {done_len}
46
+ 存储队列
47
+ 待上传: {upload_len}
48
+ ----------------------- end - 轮训日志: {task} ------------------
49
+ """
50
+
33
51
  launcher_pro_polling = """
34
52
  ----------------------- start - 轮训日志: {task} -----------------
35
53
  内存队列
@@ -69,4 +87,8 @@ class LogTemplate:
69
87
  response
70
88
  status : {status} \n{response}
71
89
  ------------------------------------------------------------------
72
- """
90
+ """
91
+
92
+ @staticmethod
93
+ def log_info(item: dict) -> str:
94
+ return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])
@@ -0,0 +1 @@
1
+ from .crawler import Crawler
@@ -1,40 +1,52 @@
1
+ import json
1
2
  import threading
2
3
  import time
3
4
  import traceback
4
-
5
5
  from inspect import isgenerator
6
6
  from typing import Union, Callable, Mapping
7
7
 
8
- from cobweb.base import Queue, Seed, BaseItem, Request, Response, logger
9
8
  from cobweb.constant import DealModel, LogTemplate
10
- from cobweb.utils import download_log_info
11
- from cobweb import setting
9
+ from cobweb.base import (
10
+ Queue,
11
+ Seed,
12
+ BaseItem,
13
+ Request,
14
+ Response,
15
+ ConsoleItem,
16
+ logger
17
+ )
12
18
 
13
19
 
14
20
  class Crawler(threading.Thread):
15
21
 
16
22
  def __init__(
17
23
  self,
18
- upload_queue: Queue,
19
- custom_func: Union[Mapping[str, Callable]],
24
+ stop: threading.Event,
25
+ pause: threading.Event,
20
26
  launcher_queue: Union[Mapping[str, Queue]],
27
+ custom_func: Union[Mapping[str, Callable]],
28
+ thread_num: int,
29
+ max_retries: int
21
30
  ):
22
31
  super().__init__()
23
32
 
24
- self.upload_queue = upload_queue
33
+ self._stop = stop
34
+ self._pause = pause
35
+ self._new = launcher_queue["new"]
36
+ self._todo = launcher_queue["todo"]
37
+ self._done = launcher_queue["done"]
38
+ self._upload = launcher_queue["upload"]
39
+
25
40
  for func_name, _callable in custom_func.items():
26
41
  if isinstance(_callable, Callable):
27
42
  self.__setattr__(func_name, _callable)
28
43
 
29
- self.launcher_queue = launcher_queue
30
-
31
- self.spider_thread_num = setting.SPIDER_THREAD_NUM
32
- self.max_retries = setting.SPIDER_MAX_RETRIES
44
+ self.thread_num = thread_num
45
+ self.max_retries = max_retries
33
46
 
34
47
  @staticmethod
35
48
  def request(seed: Seed) -> Union[Request, BaseItem]:
36
- stream = True if setting.DOWNLOAD_MODEL else False
37
- yield Request(seed.url, seed, stream=stream, timeout=5)
49
+ yield Request(seed.url, seed, timeout=5)
38
50
 
39
51
  @staticmethod
40
52
  def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
@@ -43,39 +55,43 @@ class Crawler(threading.Thread):
43
55
 
44
56
  @staticmethod
45
57
  def parse(item: Response) -> BaseItem:
46
- pass
58
+ upload_item = item.to_dict
59
+ upload_item["text"] = item.response.text
60
+ yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
47
61
 
48
- def get_seed(self) -> Seed:
49
- return self.launcher_queue['todo'].pop()
62
+ # def get_seed(self) -> Seed:
63
+ # return self._todo.pop()
50
64
 
51
65
  def distribute(self, item, seed):
52
66
  if isinstance(item, BaseItem):
53
- self.upload_queue.push(item)
67
+ self._upload.push(item)
54
68
  elif isinstance(item, Seed):
55
- self.launcher_queue['new'].push(item)
69
+ self._new.push(item)
56
70
  elif isinstance(item, str) and item == DealModel.poll:
57
- self.launcher_queue['todo'].push(seed)
71
+ self._todo.push(seed)
58
72
  elif isinstance(item, str) and item == DealModel.done:
59
- self.launcher_queue['done'].push(seed)
73
+ self._done.push(seed)
60
74
  elif isinstance(item, str) and item == DealModel.fail:
61
75
  seed.params.seed_status = DealModel.fail
62
- self.launcher_queue['done'].push(seed)
76
+ self._done.push(seed)
63
77
  else:
64
78
  raise TypeError("yield value type error!")
65
79
 
66
80
  def spider(self):
67
- while True:
68
- seed = self.get_seed()
81
+ while not self._stop.is_set():
82
+
83
+ seed = self._todo.pop()
69
84
 
70
85
  if not seed:
86
+ time.sleep(1)
71
87
  continue
72
88
 
73
89
  elif seed.params.retry >= self.max_retries:
74
90
  seed.params.seed_status = DealModel.fail
75
- self.launcher_queue['done'].push(seed)
91
+ self._done.push(seed)
76
92
  continue
77
93
 
78
- seed_detail_log_info = download_log_info(seed.to_dict)
94
+ seed_detail_log_info = LogTemplate.log_info(seed.to_dict)
79
95
 
80
96
  try:
81
97
  request_iterators = self.request(seed)
@@ -106,7 +122,7 @@ class Crawler(threading.Thread):
106
122
  seed_version=seed.params.seed_version,
107
123
  identifier=seed.identifier or "",
108
124
  status=download_item.response,
109
- response=download_log_info(download_item.to_dict)
125
+ response=LogTemplate.log_info(download_item.to_dict)
110
126
  ))
111
127
  parse_iterators = self.parse(download_item)
112
128
  if not isgenerator(parse_iterators):
@@ -123,7 +139,6 @@ class Crawler(threading.Thread):
123
139
 
124
140
  if not iterator_status:
125
141
  raise ValueError("request/download/parse function yield value error!")
126
-
127
142
  except Exception as e:
128
143
  logger.info(LogTemplate.download_exception.format(
129
144
  detail=seed_detail_log_info,
@@ -134,11 +149,12 @@ class Crawler(threading.Thread):
134
149
  exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
135
150
  ))
136
151
  seed.params.retry += 1
137
- self.launcher_queue['todo'].push(seed)
152
+ self._todo.push(seed)
138
153
  finally:
139
154
  time.sleep(0.1)
155
+ logger.info("spider thread close")
140
156
 
141
157
  def run(self):
142
- for index in range(self.spider_thread_num):
158
+ for index in range(self.thread_num):
143
159
  threading.Thread(name=f"spider_{index}", target=self.spider).start()
144
160
 
@@ -1,2 +1,2 @@
1
- from .launcher import Launcher
1
+ from .launcher_air import LauncherAir
2
2
  from .launcher_pro import LauncherPro