coocan 0.5.2.2__tar.gz → 0.5.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {coocan-0.5.2.2 → coocan-0.5.3.1}/PKG-INFO +9 -4
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan/spider/base.py +51 -17
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan.egg-info/PKG-INFO +9 -4
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan.egg-info/SOURCES.txt +1 -0
- coocan-0.5.3.1/pyproject.toml +28 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/setup.py +2 -1
- {coocan-0.5.2.2 → coocan-0.5.3.1}/README.md +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan/__init__.py +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan/cmd/__init__.py +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan/cmd/cli.py +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan/gen.py +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan/push_project.py +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan/spider/__init__.py +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan/templates/spider.txt +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan/url/__init__.py +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan/url/request.py +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan/url/response.py +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan.egg-info/dependency_links.txt +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan.egg-info/entry_points.txt +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan.egg-info/requires.txt +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/coocan.egg-info/top_level.txt +0 -0
- {coocan-0.5.2.2 → coocan-0.5.3.1}/setup.cfg +0 -0
@@ -1,14 +1,19 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: coocan
|
3
|
-
Version: 0.5.
|
4
|
-
Summary: Air Spider Framework
|
3
|
+
Version: 0.5.3.1
|
4
|
+
Summary: Air Async Spider Framework
|
5
|
+
Home-page: https://github.com/markadc/coocan
|
5
6
|
Author: wauo
|
6
|
-
Author-email: markadc@126.com
|
7
|
+
Author-email: wauo <markadc@126.com>
|
8
|
+
Project-URL: Homepage, https://github.com/markadc/coocan
|
7
9
|
Requires-Python: >=3.10
|
8
10
|
Description-Content-Type: text/markdown
|
9
11
|
Requires-Dist: click>=8.0.0
|
10
12
|
Requires-Dist: httpx
|
11
13
|
Requires-Dist: loguru
|
14
|
+
Dynamic: author
|
15
|
+
Dynamic: home-page
|
16
|
+
Dynamic: requires-python
|
12
17
|
|
13
18
|
# 项目说明
|
14
19
|
|
@@ -24,6 +24,7 @@ class MiniSpider:
|
|
24
24
|
enable_random_ua = True
|
25
25
|
headers_extra_field = {}
|
26
26
|
delay = 0
|
27
|
+
item_speed = 100
|
27
28
|
|
28
29
|
def start_requests(self):
|
29
30
|
"""初始请求"""
|
@@ -55,10 +56,10 @@ class MiniSpider:
|
|
55
56
|
def handle_callback_excetpion(self, e: Exception, request: Request, response: Response):
|
56
57
|
logger.error("{} `回调`时出现异常 | {} | {} | {}".format(response.status_code, e, request.callback.__name__, request.url))
|
57
58
|
|
58
|
-
async def
|
59
|
+
async def request_task(self, q1: asyncio.PriorityQueue, q2: asyncio.Queue, semaphore: asyncio.Semaphore):
|
59
60
|
"""工作协程,从队列中获取请求并处理"""
|
60
61
|
while True:
|
61
|
-
req: Request = await
|
62
|
+
req: Request = await q1.get()
|
62
63
|
|
63
64
|
# 结束信号
|
64
65
|
if req.url == "":
|
@@ -82,7 +83,7 @@ class MiniSpider:
|
|
82
83
|
try:
|
83
84
|
result = self.handle_request_excetpion(e, req)
|
84
85
|
if isinstance(result, Request):
|
85
|
-
await
|
86
|
+
await q1.put(result)
|
86
87
|
break
|
87
88
|
except IgnoreRequest as e:
|
88
89
|
logger.debug("{} 忽略请求 {}".format(e, req.url))
|
@@ -105,39 +106,72 @@ class MiniSpider:
|
|
105
106
|
try:
|
106
107
|
cached = req.callback(Response(resp), **req.cb_kwargs)
|
107
108
|
if isinstance(cached, Iterator):
|
108
|
-
for
|
109
|
-
|
109
|
+
for c in cached:
|
110
|
+
if isinstance(c, Request):
|
111
|
+
await q1.put(c) # 把后续请求加入队列
|
112
|
+
elif isinstance(c, dict):
|
113
|
+
await q2.put(c)
|
114
|
+
else:
|
115
|
+
logger.warning("Please yield `Request` or `dict` Not {}".format(c))
|
110
116
|
except Exception as e:
|
111
117
|
self.handle_callback_excetpion(e, req, resp)
|
112
118
|
finally:
|
113
119
|
break
|
114
120
|
|
115
|
-
|
121
|
+
q1.task_done()
|
122
|
+
|
123
|
+
async def item_task(self, q2: asyncio.Queue):
|
124
|
+
while True:
|
125
|
+
item = await q2.get()
|
126
|
+
if item is None:
|
127
|
+
break
|
128
|
+
self.process_item(item)
|
129
|
+
q2.task_done()
|
130
|
+
|
131
|
+
def process_item(self, item: dict):
|
132
|
+
logger.success(item)
|
116
133
|
|
117
134
|
async def run(self):
|
118
135
|
"""爬取入口"""
|
119
|
-
|
136
|
+
request_queue = asyncio.PriorityQueue()
|
137
|
+
item_queue = asyncio.Queue()
|
120
138
|
semaphore = asyncio.Semaphore(self.max_requests)
|
121
139
|
|
122
|
-
#
|
123
|
-
|
124
|
-
asyncio.create_task(self.
|
140
|
+
# 处理请求...
|
141
|
+
request_tasks = [
|
142
|
+
asyncio.create_task(self.request_task(request_queue, item_queue, semaphore))
|
125
143
|
for _ in range(self.max_requests)
|
126
144
|
]
|
127
145
|
|
128
|
-
#
|
146
|
+
# 处理数据...
|
147
|
+
item_tasks = [
|
148
|
+
asyncio.create_task(self.item_task(item_queue))
|
149
|
+
for _ in range(self.item_speed)
|
150
|
+
]
|
151
|
+
|
152
|
+
# 发送最开始的请求
|
129
153
|
for req in self.start_requests():
|
130
|
-
await
|
154
|
+
await request_queue.put(req)
|
131
155
|
|
132
|
-
#
|
133
|
-
await
|
156
|
+
# 等待所有请求处理完成
|
157
|
+
await request_queue.join()
|
158
|
+
logger.debug("处理请求已结束")
|
134
159
|
|
135
|
-
#
|
160
|
+
# 等待所有数据处理完成
|
161
|
+
await item_queue.join()
|
162
|
+
logger.debug("处理数据已结束")
|
163
|
+
|
164
|
+
# 退出请求任务
|
136
165
|
for _ in range(self.max_requests):
|
137
|
-
await
|
166
|
+
await request_queue.put(Request(url=""))
|
167
|
+
|
168
|
+
# 退出数据任务
|
169
|
+
for _ in range(self.item_speed):
|
170
|
+
await item_queue.put(None)
|
138
171
|
|
139
172
|
# 等待所有工作协程完成
|
140
|
-
await asyncio.gather(*
|
173
|
+
await asyncio.gather(*request_tasks)
|
174
|
+
await asyncio.gather(*item_tasks)
|
141
175
|
|
142
176
|
def go(self):
|
143
177
|
asyncio.run(self.run())
|
@@ -1,14 +1,19 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: coocan
|
3
|
-
Version: 0.5.
|
4
|
-
Summary: Air Spider Framework
|
3
|
+
Version: 0.5.3.1
|
4
|
+
Summary: Air Async Spider Framework
|
5
|
+
Home-page: https://github.com/markadc/coocan
|
5
6
|
Author: wauo
|
6
|
-
Author-email: markadc@126.com
|
7
|
+
Author-email: wauo <markadc@126.com>
|
8
|
+
Project-URL: Homepage, https://github.com/markadc/coocan
|
7
9
|
Requires-Python: >=3.10
|
8
10
|
Description-Content-Type: text/markdown
|
9
11
|
Requires-Dist: click>=8.0.0
|
10
12
|
Requires-Dist: httpx
|
11
13
|
Requires-Dist: loguru
|
14
|
+
Dynamic: author
|
15
|
+
Dynamic: home-page
|
16
|
+
Dynamic: requires-python
|
12
17
|
|
13
18
|
# 项目说明
|
14
19
|
|
@@ -0,0 +1,28 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
3
|
+
build-backend = "setuptools.build_meta"
|
4
|
+
|
5
|
+
[project]
|
6
|
+
name = "coocan"
|
7
|
+
version = "0.5.3.1"
|
8
|
+
authors = [
|
9
|
+
{ name = "wauo", email = "markadc@126.com" }
|
10
|
+
]
|
11
|
+
description = "Air Async Spider Framework"
|
12
|
+
readme = "README.md"
|
13
|
+
requires-python = ">=3.10"
|
14
|
+
dependencies = [
|
15
|
+
"click>=8.0.0",
|
16
|
+
"httpx",
|
17
|
+
"loguru"
|
18
|
+
]
|
19
|
+
urls = { Homepage = "https://github.com/markadc/coocan" }
|
20
|
+
|
21
|
+
[project.scripts]
|
22
|
+
coocan = "coocan.cmd.cli:main"
|
23
|
+
|
24
|
+
[tool.setuptools]
|
25
|
+
include-package-data = true
|
26
|
+
|
27
|
+
[tool.setuptools.package-data]
|
28
|
+
coocan = ["templates/*"]
|
@@ -4,8 +4,9 @@ with open("README.md", "r", encoding="utf-8") as f:
|
|
4
4
|
long_description = f.read()
|
5
5
|
|
6
6
|
setup(
|
7
|
+
url="https://github.com/markadc/coocan",
|
7
8
|
name="coocan",
|
8
|
-
version="0.5.
|
9
|
+
version="0.5.3",
|
9
10
|
author="wauo",
|
10
11
|
author_email="markadc@126.com",
|
11
12
|
description="Air Spider Framework",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|