coocan 0.5.2.1__tar.gz → 0.5.3__tar.gz

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,7 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: coocan
3
- Version: 0.5.2.1
3
+ Version: 0.5.3
4
4
  Summary: Air Spider Framework
5
+ Home-page: https://github.com/markadc/coocan
5
6
  Author: wauo
6
7
  Author-email: markadc@126.com
7
8
  Requires-Python: >=3.10
@@ -28,7 +28,7 @@ def snake_to_pascal(snake_str: str):
28
28
  def main(ctx):
29
29
  if ctx.invoked_subcommand is None:
30
30
  print(help_info)
31
- click.echo("cc new -s <spider_file_name>")
31
+ click.echo("coocan new -s <spider_file_name>")
32
32
 
33
33
 
34
34
  @main.command()
@@ -2,6 +2,7 @@ import os
2
2
  import sys
3
3
 
4
4
  msg = sys.argv[1] if len(sys.argv) == 2 else "Auto Submit"
5
+ msg = sys.argv[1] if len(sys.argv) == 2 else "更新爬虫示例(接收爬虫数据)"
5
6
 
6
7
  cmd1 = "git add ."
7
8
  cmd2 = 'git commit -m "{}"'.format(msg)
@@ -24,6 +24,7 @@ class MiniSpider:
24
24
  enable_random_ua = True
25
25
  headers_extra_field = {}
26
26
  delay = 0
27
+ item_speed = 100
27
28
 
28
29
  def start_requests(self):
29
30
  """初始请求"""
@@ -55,10 +56,10 @@ class MiniSpider:
55
56
  def handle_callback_excetpion(self, e: Exception, request: Request, response: Response):
56
57
  logger.error("{} `回调`时出现异常 | {} | {} | {}".format(response.status_code, e, request.callback.__name__, request.url))
57
58
 
58
- async def worker(self, queue: asyncio.PriorityQueue, semaphore: asyncio.Semaphore):
59
+ async def request_task(self, q1: asyncio.PriorityQueue, q2: asyncio.Queue, semaphore: asyncio.Semaphore):
59
60
  """工作协程,从队列中获取请求并处理"""
60
61
  while True:
61
- req: Request = await queue.get()
62
+ req: Request = await q1.get()
62
63
 
63
64
  # 结束信号
64
65
  if req.url == "":
@@ -82,7 +83,7 @@ class MiniSpider:
82
83
  try:
83
84
  result = self.handle_request_excetpion(e, req)
84
85
  if isinstance(result, Request):
85
- await queue.put(result)
86
+ await q1.put(result)
86
87
  break
87
88
  except IgnoreRequest as e:
88
89
  logger.debug("{} 忽略请求 {}".format(e, req.url))
@@ -105,39 +106,72 @@ class MiniSpider:
105
106
  try:
106
107
  cached = req.callback(Response(resp), **req.cb_kwargs)
107
108
  if isinstance(cached, Iterator):
108
- for next_request in cached:
109
- await queue.put(next_request) # 把后续请求加入队列
109
+ for c in cached:
110
+ if isinstance(c, Request):
111
+ await q1.put(c) # 把后续请求加入队列
112
+ elif isinstance(c, dict):
113
+ await q2.put(c)
114
+ else:
115
+ logger.warning("Please yield `Request` or `dict` Not {}".format(c))
110
116
  except Exception as e:
111
117
  self.handle_callback_excetpion(e, req, resp)
112
118
  finally:
113
119
  break
114
120
 
115
- queue.task_done()
121
+ q1.task_done()
122
+
123
+ async def item_task(self, q2: asyncio.Queue):
124
+ while True:
125
+ item = await q2.get()
126
+ if item is None:
127
+ break
128
+ self.process_item(item)
129
+ q2.task_done()
130
+
131
+ def process_item(self, item: dict):
132
+ logger.success(item)
116
133
 
117
134
  async def run(self):
118
135
  """爬取入口"""
119
- queue = asyncio.PriorityQueue()
136
+ request_queue = asyncio.PriorityQueue()
137
+ item_queue = asyncio.Queue()
120
138
  semaphore = asyncio.Semaphore(self.max_requests)
121
139
 
122
- # 工作协程启动...
123
- workers = [
124
- asyncio.create_task(self.worker(queue, semaphore))
140
+ # 处理请求...
141
+ request_tasks = [
142
+ asyncio.create_task(self.request_task(request_queue, item_queue, semaphore))
125
143
  for _ in range(self.max_requests)
126
144
  ]
127
145
 
128
- # 将初始请求加入队列
146
+ # 处理数据...
147
+ item_tasks = [
148
+ asyncio.create_task(self.item_task(item_queue))
149
+ for _ in range(self.item_speed)
150
+ ]
151
+
152
+ # 发送最开始的请求
129
153
  for req in self.start_requests():
130
- await queue.put(req)
154
+ await request_queue.put(req)
131
155
 
132
- # 等待队列中的所有任务完成
133
- await queue.join()
156
+ # 等待所有请求处理完成
157
+ await request_queue.join()
158
+ logger.debug("处理请求已结束")
134
159
 
135
- # ...停止工作协程
160
+ # 等待所有数据处理完成
161
+ await item_queue.join()
162
+ logger.debug("处理数据已结束")
163
+
164
+ # 退出请求任务
136
165
  for _ in range(self.max_requests):
137
- await queue.put(Request(url=""))
166
+ await request_queue.put(Request(url=""))
167
+
168
+ # 退出数据任务
169
+ for _ in range(self.item_speed):
170
+ await item_queue.put(None)
138
171
 
139
172
  # 等待所有工作协程完成
140
- await asyncio.gather(*workers)
173
+ await asyncio.gather(*request_tasks)
174
+ await asyncio.gather(*item_tasks)
141
175
 
142
176
  def go(self):
143
177
  asyncio.run(self.run())
@@ -1,7 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: coocan
3
- Version: 0.5.2.1
3
+ Version: 0.5.3
4
4
  Summary: Air Spider Framework
5
+ Home-page: https://github.com/markadc/coocan
5
6
  Author: wauo
6
7
  Author-email: markadc@126.com
7
8
  Requires-Python: >=3.10
@@ -4,8 +4,9 @@ with open("README.md", "r", encoding="utf-8") as f:
4
4
  long_description = f.read()
5
5
 
6
6
  setup(
7
+ url="https://github.com/markadc/coocan",
7
8
  name="coocan",
8
- version="0.5.2.1",
9
+ version="0.5.3",
9
10
  author="wauo",
10
11
  author_email="markadc@126.com",
11
12
  description="Air Spider Framework",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes