coocan 0.3.2__tar.gz → 0.3.3__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: coocan
3
- Version: 0.3.2
3
+ Version: 0.3.3
4
4
  Summary: Air Spider Framework
5
5
  Author: wauo
6
6
  Author-email: markadc@126.com
@@ -0,0 +1 @@
1
+ from coocan.spider.base import MiniSpider, IgnoreRequest
@@ -3,35 +3,39 @@ from collections.abc import Iterator
3
3
 
4
4
  from loguru import logger
5
5
 
6
- import coocan
7
- from coocan.url import Request
6
+ from coocan.url import Request, Response
7
+
8
+
9
+ class IgnoreRequest(Exception):
10
+ pass
8
11
 
9
12
 
10
13
  class MiniSpider:
11
14
  start_urls = []
12
15
  max_requests = 5
16
+ max_retry_times = 3
13
17
 
14
18
  def start_requests(self):
15
19
  """初始请求"""
16
20
  assert self.start_urls, "没有起始 URL 列表"
17
21
  for url in self.start_urls:
18
- yield coocan.Request(url, self.parse)
22
+ yield Request(url, self.parse)
19
23
 
20
24
  def middleware(self, request: Request):
21
25
  request.headers.setdefault("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0")
22
26
 
23
27
  async def get_response(self, request: Request):
24
28
  """发送请求,获取响应"""
25
- try:
26
- self.middleware(request)
27
- response = await request.send()
28
- return response
29
- except Exception as e:
30
- logger.error("{} {}".format(request.url, e))
31
-
32
- def parse(self, response):
29
+ self.middleware(request)
30
+ response = await request.send()
31
+ return response
32
+
33
+ def parse(self, response: Response):
33
34
  raise NotImplementedError("没有定义回调函数 {}.parse ".format(self.__class__.__name__))
34
35
 
36
+ def when_request_error(self, e: Exception, request: Request):
37
+ logger.error("{} {}".format(type(e).__name__, request.url))
38
+
35
39
  async def worker(self, queue, semaphore):
36
40
  """工作协程,从队列中获取请求并处理"""
37
41
  while True:
@@ -43,16 +47,26 @@ class MiniSpider:
43
47
 
44
48
  # 控制并发
45
49
  async with semaphore:
46
- response = await self.get_response(request)
47
- if response:
50
+ for i in range(self.max_retry_times + 1):
51
+ if i > 0:
52
+ logger.debug("正在重试第{}次...{}".format(i, request.url))
48
53
  try:
49
- cached = request.callback(response, **request.cb_kwargs)
54
+ response = await self.get_response(request)
55
+ cached = request.callback(Response(response), **request.cb_kwargs)
50
56
  if isinstance(cached, Iterator):
51
57
  for next_request in cached:
52
58
  await queue.put(next_request) # 将后续请求加入队列
53
59
  except Exception as e:
54
- logger.error(e)
55
-
60
+ try:
61
+ result = self.when_request_error(e, request)
62
+ if isinstance(result, Request):
63
+ await queue.put(result)
64
+ logger.debug("新的请求 {}".format(result.url))
65
+ except IgnoreRequest as e:
66
+ logger.debug("{} 忽略请求 {}".format(e, request.url))
67
+ break
68
+ else:
69
+ break
56
70
  queue.task_done()
57
71
 
58
72
  async def run(self):
@@ -0,0 +1,2 @@
1
+ from coocan.url.request import Request
2
+ from coocan.url.response import SelectorResponse as Response
@@ -6,7 +6,7 @@ cli = httpx.AsyncClient()
6
6
 
7
7
 
8
8
  class Request:
9
- def __init__(self, url, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None):
9
+ def __init__(self, url: str, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None):
10
10
  self.url = url
11
11
  self.callback = callback
12
12
  self.params = params
@@ -1,15 +1,15 @@
1
1
  from typing import Callable
2
2
 
3
3
  from cocoman.spider.errors import ResponseCodeError, ResponseTextError
4
+ from httpx import Response
4
5
  from parsel import Selector
5
- from requests import Response
6
6
 
7
7
 
8
8
  class SelectorResponse(Response):
9
9
  """可以使用Xpath、CSS"""
10
10
 
11
11
  def __init__(self, response: Response):
12
- super().__init__()
12
+ super().__init__(response.status_code)
13
13
  self.__dict__.update(response.__dict__)
14
14
  self.selector = Selector(text=response.text)
15
15
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: coocan
3
- Version: 0.3.2
3
+ Version: 0.3.3
4
4
  Summary: Air Spider Framework
5
5
  Author: wauo
6
6
  Author-email: markadc@126.com
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="coocan",
5
- version="0.3.2",
5
+ version="0.3.3",
6
6
  author="wauo",
7
7
  author_email="markadc@126.com",
8
8
  description="Air Spider Framework",
@@ -1 +0,0 @@
1
- from coocan.spider.base import MiniSpider
@@ -1,2 +0,0 @@
1
- from coocan.url.request import Request
2
- from coocan.url.response import SelectorResponse
File without changes
File without changes