coocan 0.3.2__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {coocan-0.3.2 → coocan-0.3.3}/PKG-INFO +1 -1
- coocan-0.3.3/coocan/spider/__init__.py +1 -0
- {coocan-0.3.2 → coocan-0.3.3}/coocan/spider/base.py +30 -16
- coocan-0.3.3/coocan/url/__init__.py +2 -0
- {coocan-0.3.2 → coocan-0.3.3}/coocan/url/request.py +1 -1
- {coocan-0.3.2 → coocan-0.3.3}/coocan/url/response.py +2 -2
- {coocan-0.3.2 → coocan-0.3.3}/coocan.egg-info/PKG-INFO +1 -1
- {coocan-0.3.2 → coocan-0.3.3}/setup.py +1 -1
- coocan-0.3.2/coocan/spider/__init__.py +0 -1
- coocan-0.3.2/coocan/url/__init__.py +0 -2
- {coocan-0.3.2 → coocan-0.3.3}/coocan/__init__.py +0 -0
- {coocan-0.3.2 → coocan-0.3.3}/coocan.egg-info/SOURCES.txt +0 -0
- {coocan-0.3.2 → coocan-0.3.3}/coocan.egg-info/dependency_links.txt +0 -0
- {coocan-0.3.2 → coocan-0.3.3}/coocan.egg-info/top_level.txt +0 -0
- {coocan-0.3.2 → coocan-0.3.3}/setup.cfg +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
from coocan.spider.base import MiniSpider, IgnoreRequest
|
@@ -3,35 +3,39 @@ from collections.abc import Iterator
|
|
3
3
|
|
4
4
|
from loguru import logger
|
5
5
|
|
6
|
-
import
|
7
|
-
|
6
|
+
from coocan.url import Request, Response
|
7
|
+
|
8
|
+
|
9
|
+
class IgnoreRequest(Exception):
|
10
|
+
pass
|
8
11
|
|
9
12
|
|
10
13
|
class MiniSpider:
|
11
14
|
start_urls = []
|
12
15
|
max_requests = 5
|
16
|
+
max_retry_times = 3
|
13
17
|
|
14
18
|
def start_requests(self):
|
15
19
|
"""初始请求"""
|
16
20
|
assert self.start_urls, "没有起始 URL 列表"
|
17
21
|
for url in self.start_urls:
|
18
|
-
yield
|
22
|
+
yield Request(url, self.parse)
|
19
23
|
|
20
24
|
def middleware(self, request: Request):
|
21
25
|
request.headers.setdefault("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0")
|
22
26
|
|
23
27
|
async def get_response(self, request: Request):
|
24
28
|
"""发送请求,获取响应"""
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
logger.error("{} {}".format(request.url, e))
|
31
|
-
|
32
|
-
def parse(self, response):
|
29
|
+
self.middleware(request)
|
30
|
+
response = await request.send()
|
31
|
+
return response
|
32
|
+
|
33
|
+
def parse(self, response: Response):
|
33
34
|
raise NotImplementedError("没有定义回调函数 {}.parse ".format(self.__class__.__name__))
|
34
35
|
|
36
|
+
def when_request_error(self, e: Exception, request: Request):
|
37
|
+
logger.error("{} {}".format(type(e).__name__, request.url))
|
38
|
+
|
35
39
|
async def worker(self, queue, semaphore):
|
36
40
|
"""工作协程,从队列中获取请求并处理"""
|
37
41
|
while True:
|
@@ -43,16 +47,26 @@ class MiniSpider:
|
|
43
47
|
|
44
48
|
# 控制并发
|
45
49
|
async with semaphore:
|
46
|
-
|
47
|
-
|
50
|
+
for i in range(self.max_retry_times + 1):
|
51
|
+
if i > 0:
|
52
|
+
logger.debug("正在重试第{}次...{}".format(i, request.url))
|
48
53
|
try:
|
49
|
-
|
54
|
+
response = await self.get_response(request)
|
55
|
+
cached = request.callback(Response(response), **request.cb_kwargs)
|
50
56
|
if isinstance(cached, Iterator):
|
51
57
|
for next_request in cached:
|
52
58
|
await queue.put(next_request) # 将后续请求加入队列
|
53
59
|
except Exception as e:
|
54
|
-
|
55
|
-
|
60
|
+
try:
|
61
|
+
result = self.when_request_error(e, request)
|
62
|
+
if isinstance(result, Request):
|
63
|
+
await queue.put(result)
|
64
|
+
logger.debug("新的请求 {}".format(result.url))
|
65
|
+
except IgnoreRequest as e:
|
66
|
+
logger.debug("{} 忽略请求 {}".format(e, request.url))
|
67
|
+
break
|
68
|
+
else:
|
69
|
+
break
|
56
70
|
queue.task_done()
|
57
71
|
|
58
72
|
async def run(self):
|
@@ -6,7 +6,7 @@ cli = httpx.AsyncClient()
|
|
6
6
|
|
7
7
|
|
8
8
|
class Request:
|
9
|
-
def __init__(self, url, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None):
|
9
|
+
def __init__(self, url: str, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None):
|
10
10
|
self.url = url
|
11
11
|
self.callback = callback
|
12
12
|
self.params = params
|
@@ -1,15 +1,15 @@
|
|
1
1
|
from typing import Callable
|
2
2
|
|
3
3
|
from cocoman.spider.errors import ResponseCodeError, ResponseTextError
|
4
|
+
from httpx import Response
|
4
5
|
from parsel import Selector
|
5
|
-
from requests import Response
|
6
6
|
|
7
7
|
|
8
8
|
class SelectorResponse(Response):
|
9
9
|
"""可以使用Xpath、CSS"""
|
10
10
|
|
11
11
|
def __init__(self, response: Response):
|
12
|
-
super().__init__()
|
12
|
+
super().__init__(response.status_code)
|
13
13
|
self.__dict__.update(response.__dict__)
|
14
14
|
self.selector = Selector(text=response.text)
|
15
15
|
|
@@ -1 +0,0 @@
|
|
1
|
-
from coocan.spider.base import MiniSpider
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|