coocan 0.3.1__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {coocan-0.3.1 → coocan-0.3.3}/PKG-INFO +1 -1
- coocan-0.3.3/coocan/spider/__init__.py +1 -0
- {coocan-0.3.1 → coocan-0.3.3}/coocan/spider/base.py +30 -16
- coocan-0.3.3/coocan/url/__init__.py +2 -0
- coocan-0.3.3/coocan/url/request.py +26 -0
- {coocan-0.3.1 → coocan-0.3.3}/coocan/url/response.py +3 -3
- {coocan-0.3.1 → coocan-0.3.3}/coocan.egg-info/PKG-INFO +1 -1
- {coocan-0.3.1 → coocan-0.3.3}/setup.py +1 -1
- coocan-0.3.1/coocan/spider/__init__.py +0 -1
- coocan-0.3.1/coocan/url/__init__.py +0 -2
- coocan-0.3.1/coocan/url/request.py +0 -21
- {coocan-0.3.1 → coocan-0.3.3}/coocan/__init__.py +0 -0
- {coocan-0.3.1 → coocan-0.3.3}/coocan.egg-info/SOURCES.txt +0 -0
- {coocan-0.3.1 → coocan-0.3.3}/coocan.egg-info/dependency_links.txt +0 -0
- {coocan-0.3.1 → coocan-0.3.3}/coocan.egg-info/top_level.txt +0 -0
- {coocan-0.3.1 → coocan-0.3.3}/setup.cfg +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
from coocan.spider.base import MiniSpider, IgnoreRequest
|
@@ -3,35 +3,39 @@ from collections.abc import Iterator
|
|
3
3
|
|
4
4
|
from loguru import logger
|
5
5
|
|
6
|
-
import
|
7
|
-
|
6
|
+
from coocan.url import Request, Response
|
7
|
+
|
8
|
+
|
9
|
+
class IgnoreRequest(Exception):
    """Raised to mark a failed request as one that should be ignored.

    The worker coroutine catches this exception around its call to
    ``when_request_error`` and logs the request as ignored.
    """
|
8
11
|
|
9
12
|
|
10
13
|
class MiniSpider:
|
11
14
|
start_urls = []
|
12
15
|
max_requests = 5
|
16
|
+
max_retry_times = 3
|
13
17
|
|
14
18
|
def start_requests(self):
    """Yield the initial requests, one per entry in ``start_urls``.

    Every URL is wrapped in a ``Request`` whose callback is ``self.parse``.
    Because this is a generator, an empty ``start_urls`` trips the assertion
    on the first iteration rather than at call time.
    """
    seeds = self.start_urls
    assert seeds, "没有起始 URL 列表"
    yield from (Request(seed, self.parse) for seed in seeds)
|
19
23
|
|
20
24
|
def middleware(self, request: Request):
    """Give the request a default browser ``User-Agent`` header if it has none.

    A ``User-Agent`` value already present in ``request.headers`` is left
    untouched.
    """
    default_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0"
    outgoing_headers = request.headers
    outgoing_headers.setdefault("User-Agent", default_agent)
|
22
26
|
|
23
27
|
async def get_response(self, request: Request):
    """Send the request and return whatever ``request.send()`` produces.

    ``middleware`` is applied to the request first, so its header defaults
    are in place before the request is sent. Exceptions are not handled
    here; they propagate to the caller.
    """
    self.middleware(request)
    return await request.send()
|
32
|
+
|
33
|
+
def parse(self, response: Response):
    """Placeholder callback used by ``start_requests``.

    Raises:
        NotImplementedError: always; the message names the class on which
            ``parse`` was called.
    """
    owner = self.__class__.__name__
    raise NotImplementedError("没有定义回调函数 {}.parse ".format(owner))
|
34
35
|
|
36
|
+
def when_request_error(self, e: Exception, request: Request):
    """Hook called by the worker when handling a request raises.

    This default implementation only logs the exception type and the
    request URL, and implicitly returns ``None``. The worker enqueues the
    return value when it is a ``Request`` and catches ``IgnoreRequest``
    raised from this hook.
    """
    error_name = type(e).__name__
    logger.error("{} {}".format(error_name, request.url))
|
38
|
+
|
35
39
|
async def worker(self, queue, semaphore):
|
36
40
|
"""工作协程,从队列中获取请求并处理"""
|
37
41
|
while True:
|
@@ -43,16 +47,26 @@ class MiniSpider:
|
|
43
47
|
|
44
48
|
# 控制并发
|
45
49
|
async with semaphore:
|
46
|
-
|
47
|
-
|
50
|
+
for i in range(self.max_retry_times + 1):
|
51
|
+
if i > 0:
|
52
|
+
logger.debug("正在重试第{}次...{}".format(i, request.url))
|
48
53
|
try:
|
49
|
-
|
54
|
+
response = await self.get_response(request)
|
55
|
+
cached = request.callback(Response(response), **request.cb_kwargs)
|
50
56
|
if isinstance(cached, Iterator):
|
51
57
|
for next_request in cached:
|
52
58
|
await queue.put(next_request) # 将后续请求加入队列
|
53
59
|
except Exception as e:
|
54
|
-
|
55
|
-
|
60
|
+
try:
|
61
|
+
result = self.when_request_error(e, request)
|
62
|
+
if isinstance(result, Request):
|
63
|
+
await queue.put(result)
|
64
|
+
logger.debug("新的请求 {}".format(result.url))
|
65
|
+
except IgnoreRequest as e:
|
66
|
+
logger.debug("{} 忽略请求 {}".format(e, request.url))
|
67
|
+
break
|
68
|
+
else:
|
69
|
+
break
|
56
70
|
queue.task_done()
|
57
71
|
|
58
72
|
async def run(self):
|
@@ -0,0 +1,26 @@
|
|
1
|
+
from typing import Callable
|
2
|
+
|
3
|
+
import httpx
|
4
|
+
|
5
|
+
cli = httpx.AsyncClient()
|
6
|
+
|
7
|
+
|
8
|
+
class Request:
    """One HTTP request together with the callback that handles its response.

    Constructor arguments are stored as attributes unchanged, except that
    ``headers`` and ``cb_kwargs`` fall back to a fresh empty dict when they
    are omitted or falsy.
    """

    def __init__(self, url: str, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None):
        self.url = url
        self.callback = callback
        self.params = params
        self.headers = headers or {}
        self.data = data
        self.json = json
        # NOTE(review): httpx treats an explicit ``timeout=None`` as "no
        # timeout" rather than "use the client default" -- confirm that is
        # the intended behavior for requests created without a timeout.
        self.timeout = timeout
        self.cb_kwargs = cb_kwargs or {}

    def _http_method(self) -> str:
        """Return ``"POST"`` when a body was supplied, otherwise ``"GET"``.

        A body counts as supplied when either ``data`` or ``json`` is not
        ``None``. The previous test, ``(data and json) is None``, was true
        whenever either value was ``None``, so requests carrying only
        ``data`` or only ``json`` were sent as GET and lost their body.
        """
        if self.data is None and self.json is None:
            return "GET"
        return "POST"

    async def send(self):
        """Send the request with the module-level client and return its response.

        GET is used when neither ``data`` nor ``json`` was given; otherwise
        the request is sent as POST with both values passed through.
        """
        if self._http_method() == "GET":
            return await cli.get(self.url, params=self.params, headers=self.headers, timeout=self.timeout)
        return await cli.post(self.url, params=self.params, headers=self.headers, data=self.data, json=self.json, timeout=self.timeout)
|
@@ -1,15 +1,15 @@
|
|
1
1
|
from typing import Callable
|
2
2
|
|
3
3
|
from cocoman.spider.errors import ResponseCodeError, ResponseTextError
|
4
|
+
from httpx import Response
|
4
5
|
from parsel import Selector
|
5
|
-
from requests import Response
|
6
6
|
|
7
7
|
|
8
8
|
class SelectorResponse(Response):
|
9
9
|
"""可以使用Xpath、CSS"""
|
10
10
|
|
11
11
|
def __init__(self, response: Response):
    """Wrap an existing response and attach a ``Selector`` built from its text.

    The base class is initialised with the wrapped response's status code,
    after which the wrapped response's instance attributes are copied onto
    this object.
    """
    super().__init__(response.status_code)
    vars(self).update(vars(response))
    self.selector = Selector(text=response.text)
|
15
15
|
|
@@ -37,7 +37,7 @@ class SelectorResponse(Response):
|
|
37
37
|
if self.status_code not in codes:
|
38
38
|
raise ResponseCodeError("{} not in {}".format(self.status_code, codes))
|
39
39
|
|
40
|
-
def raise_for_text(self, validate: Callable[[str], bool]):
    """Raise ``ResponseTextError`` when ``validate`` rejects the response text.

    Only a return value that is exactly ``False`` counts as a rejection;
    any other result, including ``None``, lets the response pass.
    """
    verdict = validate(self.text)
    if verdict is not False:
        return
    raise ResponseTextError("not ideal text")
|
43
43
|
|
@@ -1 +0,0 @@
|
|
1
|
-
from coocan.spider.base import MiniSpider
|
@@ -1,21 +0,0 @@
|
|
1
|
-
from typing import Callable
|
2
|
-
|
3
|
-
import httpx
|
4
|
-
|
5
|
-
cli = httpx.AsyncClient()
|
6
|
-
|
7
|
-
|
8
|
-
class Request:
|
9
|
-
def __init__(self, url, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None):
|
10
|
-
self.url = url
|
11
|
-
self.callback = callback
|
12
|
-
self.params = params
|
13
|
-
self.headers = headers or {}
|
14
|
-
self.data = data
|
15
|
-
self.json = json
|
16
|
-
self.timeout = timeout
|
17
|
-
self.cb_kwargs = cb_kwargs or {}
|
18
|
-
|
19
|
-
async def send(self):
|
20
|
-
response = await cli.get(self.url, params=self.params, headers=self.headers, timeout=self.timeout)
|
21
|
-
return response
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|