coocan 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coocan/__init__.py +2 -2
- coocan/_examples/crawl_csdn_detail.py +62 -62
- coocan/_examples/crawl_csdn_list.py +50 -50
- coocan/_examples/recv_item.py +31 -31
- coocan/_examples/view_local_ip.py +22 -22
- coocan/cmd/cli.py +68 -68
- coocan/gen.py +33 -33
- coocan/push_project.py +12 -12
- coocan/spider/__init__.py +1 -1
- coocan/spider/base.py +177 -177
- coocan/templates/spider.txt +17 -17
- coocan/url/__init__.py +2 -2
- coocan/url/request.py +31 -31
- coocan/url/response.py +50 -50
- {coocan-0.5.4.dist-info → coocan-0.5.6.dist-info}/METADATA +97 -99
- coocan-0.5.6.dist-info/RECORD +20 -0
- {coocan-0.5.4.dist-info → coocan-0.5.6.dist-info}/WHEEL +1 -1
- {coocan-0.5.4.dist-info → coocan-0.5.6.dist-info}/top_level.txt +0 -1
- _test/crawl_csdn.py +0 -53
- _test/demo.py +0 -33
- _test/err_demo.py +0 -27
- _test/test_priority.py +0 -21
- _test/test_req_delay.py +0 -19
- _test/test_req_err.py +0 -32
- coocan-0.5.4.dist-info/RECORD +0 -26
- {coocan-0.5.4.dist-info → coocan-0.5.6.dist-info}/entry_points.txt +0 -0
coocan/spider/base.py
CHANGED
@@ -1,177 +1,177 @@
|
|
1
|
-
import asyncio
|
2
|
-
from collections.abc import Iterator
|
3
|
-
|
4
|
-
from loguru import logger
|
5
|
-
|
6
|
-
from coocan.gen import gen_random_ua
|
7
|
-
from coocan.url import Request, Response
|
8
|
-
|
9
|
-
|
10
|
-
class IgnoreRequest(Exception):
|
11
|
-
"""忽略这个请求,不再重试"""
|
12
|
-
pass
|
13
|
-
|
14
|
-
|
15
|
-
class IgnoreResponse(Exception):
|
16
|
-
"""忽略这个响应,不进回调"""
|
17
|
-
pass
|
18
|
-
|
19
|
-
|
20
|
-
class MiniSpider:
|
21
|
-
start_urls = []
|
22
|
-
max_requests = 5
|
23
|
-
max_retry_times = 3
|
24
|
-
enable_random_ua = True
|
25
|
-
headers_extra_field = {}
|
26
|
-
delay = 0
|
27
|
-
item_speed = 100
|
28
|
-
|
29
|
-
def start_requests(self):
|
30
|
-
"""初始请求"""
|
31
|
-
assert self.start_urls, "没有起始 URL 列表"
|
32
|
-
for url in self.start_urls:
|
33
|
-
yield Request(url, self.parse)
|
34
|
-
|
35
|
-
def middleware(self, request: Request):
|
36
|
-
# 随机Ua
|
37
|
-
if self.enable_random_ua is True:
|
38
|
-
request.headers.setdefault("User-Agent", gen_random_ua())
|
39
|
-
|
40
|
-
# 为 headers 补充额外字段
|
41
|
-
if self.headers_extra_field:
|
42
|
-
request.headers.update(self.headers_extra_field)
|
43
|
-
|
44
|
-
def validator(self, response: Response):
|
45
|
-
"""校验响应"""
|
46
|
-
pass
|
47
|
-
|
48
|
-
def parse(self, response: Response):
|
49
|
-
"""默认回调函数"""
|
50
|
-
raise NotImplementedError("没有定义回调函数 {}.parse ".format(self.__class__.__name__))
|
51
|
-
|
52
|
-
def handle_request_excetpion(self, e: Exception, request: Request):
|
53
|
-
"""处理请求时的异常"""
|
54
|
-
logger.error("{} {}".format(type(e).__name__, request.url))
|
55
|
-
|
56
|
-
def handle_callback_excetpion(self, e: Exception, request: Request, response: Response):
|
57
|
-
logger.error("{} `回调`时出现异常 | {} | {} | {}".format(response.status_code, e, request.callback.__name__, request.url))
|
58
|
-
|
59
|
-
async def request_task(self, q1: asyncio.PriorityQueue, q2: asyncio.Queue, semaphore: asyncio.Semaphore):
|
60
|
-
"""工作协程,从队列中获取请求并处理"""
|
61
|
-
while True:
|
62
|
-
req: Request = await q1.get()
|
63
|
-
|
64
|
-
# 结束信号
|
65
|
-
if req.url == "":
|
66
|
-
break
|
67
|
-
|
68
|
-
# 控制并发
|
69
|
-
async with semaphore:
|
70
|
-
for i in range(self.max_retry_times + 1):
|
71
|
-
# 进入了重试
|
72
|
-
if i > 0:
|
73
|
-
logger.debug("正在重试第{}次... {}".format(i, req.url))
|
74
|
-
|
75
|
-
# 开始请求...
|
76
|
-
try:
|
77
|
-
self.middleware(req)
|
78
|
-
await asyncio.sleep(self.delay)
|
79
|
-
resp = await req.send()
|
80
|
-
|
81
|
-
# 请求失败
|
82
|
-
except Exception as e:
|
83
|
-
try:
|
84
|
-
result = self.handle_request_excetpion(e, req)
|
85
|
-
if isinstance(result, Request):
|
86
|
-
await q1.put(result)
|
87
|
-
break
|
88
|
-
except IgnoreRequest as e:
|
89
|
-
logger.debug("{} 忽略请求 {}".format(e, req.url))
|
90
|
-
break
|
91
|
-
except Exception as e:
|
92
|
-
logger.error("`处理异常函数`异常了 | {} | {}".format(e, req.url))
|
93
|
-
|
94
|
-
# 请求成功
|
95
|
-
else:
|
96
|
-
# 校验响应
|
97
|
-
try:
|
98
|
-
self.validator(resp)
|
99
|
-
except IgnoreResponse as e:
|
100
|
-
logger.debug("{} 忽略响应 {}".format(e, req.url))
|
101
|
-
break
|
102
|
-
except Exception as e:
|
103
|
-
logger.error("`校验器`函数异常了 | {} | {}".format(e, req.url))
|
104
|
-
|
105
|
-
# 进入回调
|
106
|
-
try:
|
107
|
-
cached = req.callback(Response(resp), **req.cb_kwargs)
|
108
|
-
if isinstance(cached, Iterator):
|
109
|
-
for c in cached:
|
110
|
-
if isinstance(c, Request):
|
111
|
-
await q1.put(c) # 把后续请求加入队列
|
112
|
-
elif isinstance(c, dict):
|
113
|
-
await q2.put(c)
|
114
|
-
else:
|
115
|
-
logger.warning("Please yield `Request` or `dict` Not {}".format(c))
|
116
|
-
except Exception as e:
|
117
|
-
self.handle_callback_excetpion(e, req, resp)
|
118
|
-
finally:
|
119
|
-
break
|
120
|
-
|
121
|
-
q1.task_done()
|
122
|
-
|
123
|
-
async def item_task(self, q2: asyncio.Queue):
|
124
|
-
while True:
|
125
|
-
item = await q2.get()
|
126
|
-
if item is None:
|
127
|
-
break
|
128
|
-
self.process_item(item)
|
129
|
-
q2.task_done()
|
130
|
-
|
131
|
-
def process_item(self, item: dict):
|
132
|
-
logger.success(item)
|
133
|
-
|
134
|
-
async def run(self):
|
135
|
-
"""爬取入口"""
|
136
|
-
request_queue = asyncio.PriorityQueue()
|
137
|
-
item_queue = asyncio.Queue()
|
138
|
-
semaphore = asyncio.Semaphore(self.max_requests)
|
139
|
-
|
140
|
-
# 处理请求...
|
141
|
-
request_tasks = [
|
142
|
-
asyncio.create_task(self.request_task(request_queue, item_queue, semaphore))
|
143
|
-
for _ in range(self.max_requests)
|
144
|
-
]
|
145
|
-
|
146
|
-
# 处理数据...
|
147
|
-
item_tasks = [
|
148
|
-
asyncio.create_task(self.item_task(item_queue))
|
149
|
-
for _ in range(self.item_speed)
|
150
|
-
]
|
151
|
-
|
152
|
-
# 发送最开始的请求
|
153
|
-
for req in self.start_requests():
|
154
|
-
await request_queue.put(req)
|
155
|
-
|
156
|
-
# 等待所有请求处理完成
|
157
|
-
await request_queue.join()
|
158
|
-
logger.debug("处理请求已结束")
|
159
|
-
|
160
|
-
# 等待所有数据处理完成
|
161
|
-
await item_queue.join()
|
162
|
-
logger.debug("处理数据已结束")
|
163
|
-
|
164
|
-
# 退出请求任务
|
165
|
-
for _ in range(self.max_requests):
|
166
|
-
await request_queue.put(Request(url=""))
|
167
|
-
|
168
|
-
# 退出数据任务
|
169
|
-
for _ in range(self.item_speed):
|
170
|
-
await item_queue.put(None)
|
171
|
-
|
172
|
-
# 等待所有工作协程完成
|
173
|
-
await asyncio.gather(*request_tasks)
|
174
|
-
await asyncio.gather(*item_tasks)
|
175
|
-
|
176
|
-
def go(self):
|
177
|
-
asyncio.run(self.run())
|
1
|
+
import asyncio
|
2
|
+
from collections.abc import Iterator
|
3
|
+
|
4
|
+
from loguru import logger
|
5
|
+
|
6
|
+
from coocan.gen import gen_random_ua
|
7
|
+
from coocan.url import Request, Response
|
8
|
+
|
9
|
+
|
10
|
+
class IgnoreRequest(Exception):
|
11
|
+
"""忽略这个请求,不再重试"""
|
12
|
+
pass
|
13
|
+
|
14
|
+
|
15
|
+
class IgnoreResponse(Exception):
|
16
|
+
"""忽略这个响应,不进回调"""
|
17
|
+
pass
|
18
|
+
|
19
|
+
|
20
|
+
class MiniSpider:
|
21
|
+
start_urls = []
|
22
|
+
max_requests = 5
|
23
|
+
max_retry_times = 3
|
24
|
+
enable_random_ua = True
|
25
|
+
headers_extra_field = {}
|
26
|
+
delay = 0
|
27
|
+
item_speed = 100
|
28
|
+
|
29
|
+
def start_requests(self):
|
30
|
+
"""初始请求"""
|
31
|
+
assert self.start_urls, "没有起始 URL 列表"
|
32
|
+
for url in self.start_urls:
|
33
|
+
yield Request(url, self.parse)
|
34
|
+
|
35
|
+
def middleware(self, request: Request):
|
36
|
+
# 随机Ua
|
37
|
+
if self.enable_random_ua is True:
|
38
|
+
request.headers.setdefault("User-Agent", gen_random_ua())
|
39
|
+
|
40
|
+
# 为 headers 补充额外字段
|
41
|
+
if self.headers_extra_field:
|
42
|
+
request.headers.update(self.headers_extra_field)
|
43
|
+
|
44
|
+
def validator(self, response: Response):
|
45
|
+
"""校验响应"""
|
46
|
+
pass
|
47
|
+
|
48
|
+
def parse(self, response: Response):
|
49
|
+
"""默认回调函数"""
|
50
|
+
raise NotImplementedError("没有定义回调函数 {}.parse ".format(self.__class__.__name__))
|
51
|
+
|
52
|
+
def handle_request_excetpion(self, e: Exception, request: Request):
|
53
|
+
"""处理请求时的异常"""
|
54
|
+
logger.error("{} {}".format(type(e).__name__, request.url))
|
55
|
+
|
56
|
+
def handle_callback_excetpion(self, e: Exception, request: Request, response: Response):
|
57
|
+
logger.error("{} `回调`时出现异常 | {} | {} | {}".format(response.status_code, e, request.callback.__name__, request.url))
|
58
|
+
|
59
|
+
async def request_task(self, q1: asyncio.PriorityQueue, q2: asyncio.Queue, semaphore: asyncio.Semaphore):
|
60
|
+
"""工作协程,从队列中获取请求并处理"""
|
61
|
+
while True:
|
62
|
+
req: Request = await q1.get()
|
63
|
+
|
64
|
+
# 结束信号
|
65
|
+
if req.url == "":
|
66
|
+
break
|
67
|
+
|
68
|
+
# 控制并发
|
69
|
+
async with semaphore:
|
70
|
+
for i in range(self.max_retry_times + 1):
|
71
|
+
# 进入了重试
|
72
|
+
if i > 0:
|
73
|
+
logger.debug("正在重试第{}次... {}".format(i, req.url))
|
74
|
+
|
75
|
+
# 开始请求...
|
76
|
+
try:
|
77
|
+
self.middleware(req)
|
78
|
+
await asyncio.sleep(self.delay)
|
79
|
+
resp = await req.send()
|
80
|
+
|
81
|
+
# 请求失败
|
82
|
+
except Exception as e:
|
83
|
+
try:
|
84
|
+
result = self.handle_request_excetpion(e, req)
|
85
|
+
if isinstance(result, Request):
|
86
|
+
await q1.put(result)
|
87
|
+
break
|
88
|
+
except IgnoreRequest as e:
|
89
|
+
logger.debug("{} 忽略请求 {}".format(e, req.url))
|
90
|
+
break
|
91
|
+
except Exception as e:
|
92
|
+
logger.error("`处理异常函数`异常了 | {} | {}".format(e, req.url))
|
93
|
+
|
94
|
+
# 请求成功
|
95
|
+
else:
|
96
|
+
# 校验响应
|
97
|
+
try:
|
98
|
+
self.validator(resp)
|
99
|
+
except IgnoreResponse as e:
|
100
|
+
logger.debug("{} 忽略响应 {}".format(e, req.url))
|
101
|
+
break
|
102
|
+
except Exception as e:
|
103
|
+
logger.error("`校验器`函数异常了 | {} | {}".format(e, req.url))
|
104
|
+
|
105
|
+
# 进入回调
|
106
|
+
try:
|
107
|
+
cached = req.callback(Response(resp), **req.cb_kwargs)
|
108
|
+
if isinstance(cached, Iterator):
|
109
|
+
for c in cached:
|
110
|
+
if isinstance(c, Request):
|
111
|
+
await q1.put(c) # 把后续请求加入队列
|
112
|
+
elif isinstance(c, dict):
|
113
|
+
await q2.put(c)
|
114
|
+
else:
|
115
|
+
logger.warning("Please yield `Request` or `dict` Not {}".format(c))
|
116
|
+
except Exception as e:
|
117
|
+
self.handle_callback_excetpion(e, req, resp)
|
118
|
+
finally:
|
119
|
+
break
|
120
|
+
|
121
|
+
q1.task_done()
|
122
|
+
|
123
|
+
async def item_task(self, q2: asyncio.Queue):
|
124
|
+
while True:
|
125
|
+
item = await q2.get()
|
126
|
+
if item is None:
|
127
|
+
break
|
128
|
+
self.process_item(item)
|
129
|
+
q2.task_done()
|
130
|
+
|
131
|
+
def process_item(self, item: dict):
|
132
|
+
logger.success(item)
|
133
|
+
|
134
|
+
async def run(self):
|
135
|
+
"""爬取入口"""
|
136
|
+
request_queue = asyncio.PriorityQueue()
|
137
|
+
item_queue = asyncio.Queue()
|
138
|
+
semaphore = asyncio.Semaphore(self.max_requests)
|
139
|
+
|
140
|
+
# 处理请求...
|
141
|
+
request_tasks = [
|
142
|
+
asyncio.create_task(self.request_task(request_queue, item_queue, semaphore))
|
143
|
+
for _ in range(self.max_requests)
|
144
|
+
]
|
145
|
+
|
146
|
+
# 处理数据...
|
147
|
+
item_tasks = [
|
148
|
+
asyncio.create_task(self.item_task(item_queue))
|
149
|
+
for _ in range(self.item_speed)
|
150
|
+
]
|
151
|
+
|
152
|
+
# 发送最开始的请求
|
153
|
+
for req in self.start_requests():
|
154
|
+
await request_queue.put(req)
|
155
|
+
|
156
|
+
# 等待所有请求处理完成
|
157
|
+
await request_queue.join()
|
158
|
+
logger.debug("处理请求已结束")
|
159
|
+
|
160
|
+
# 等待所有数据处理完成
|
161
|
+
await item_queue.join()
|
162
|
+
logger.debug("处理数据已结束")
|
163
|
+
|
164
|
+
# 退出请求任务
|
165
|
+
for _ in range(self.max_requests):
|
166
|
+
await request_queue.put(Request(url=""))
|
167
|
+
|
168
|
+
# 退出数据任务
|
169
|
+
for _ in range(self.item_speed):
|
170
|
+
await item_queue.put(None)
|
171
|
+
|
172
|
+
# 等待所有工作协程完成
|
173
|
+
await asyncio.gather(*request_tasks)
|
174
|
+
await asyncio.gather(*item_tasks)
|
175
|
+
|
176
|
+
def go(self):
|
177
|
+
asyncio.run(self.run())
|
coocan/templates/spider.txt
CHANGED
@@ -1,18 +1,18 @@
|
|
1
|
-
from coocan import Request, Response, MiniSpider
|
2
|
-
|
3
|
-
|
4
|
-
class {SpiderClassName}(MiniSpider):
|
5
|
-
start_urls = ["https://github.com/markadc/coocan"]
|
6
|
-
max_requests = 10
|
7
|
-
|
8
|
-
def middleware(self, request: Request):
|
9
|
-
request.headers["Referer"] = "https://github.com"
|
10
|
-
|
11
|
-
def parse(self, response: Response):
|
12
|
-
print(response.status_code)
|
13
|
-
print(response.get_one("//title/text()"))
|
14
|
-
|
15
|
-
|
16
|
-
if __name__ == '__main__':
|
17
|
-
s = {SpiderClassName}()
|
1
|
+
from coocan import Request, Response, MiniSpider
|
2
|
+
|
3
|
+
|
4
|
+
class {SpiderClassName}(MiniSpider):
|
5
|
+
start_urls = ["https://github.com/markadc/coocan"]
|
6
|
+
max_requests = 10
|
7
|
+
|
8
|
+
def middleware(self, request: Request):
|
9
|
+
request.headers["Referer"] = "https://github.com"
|
10
|
+
|
11
|
+
def parse(self, response: Response):
|
12
|
+
print(response.status_code)
|
13
|
+
print(response.get_one("//title/text()"))
|
14
|
+
|
15
|
+
|
16
|
+
if __name__ == '__main__':
|
17
|
+
s = {SpiderClassName}()
|
18
18
|
s.go()
|
coocan/url/__init__.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
from coocan.url.request import Request
|
2
|
-
from coocan.url.response import SelectorResponse as Response
|
1
|
+
from coocan.url.request import Request
|
2
|
+
from coocan.url.response import SelectorResponse as Response
|
coocan/url/request.py
CHANGED
@@ -1,31 +1,31 @@
|
|
1
|
-
import time
|
2
|
-
from typing import Callable
|
3
|
-
|
4
|
-
import httpx
|
5
|
-
|
6
|
-
cli = httpx.AsyncClient()
|
7
|
-
|
8
|
-
|
9
|
-
class Request:
|
10
|
-
def __init__(self, url: str, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None, priority=None):
|
11
|
-
self.url = url
|
12
|
-
self.callback = callback
|
13
|
-
self.params = params
|
14
|
-
self.headers = headers or {}
|
15
|
-
self.data = data
|
16
|
-
self.json = json
|
17
|
-
self.timeout = timeout
|
18
|
-
self.cb_kwargs = cb_kwargs or {}
|
19
|
-
self.priority = priority or time.time()
|
20
|
-
|
21
|
-
async def send(self):
|
22
|
-
if (self.data and self.json) is None:
|
23
|
-
response = await cli.get(self.url, params=self.params, headers=self.headers, timeout=self.timeout)
|
24
|
-
elif self.data or self.json:
|
25
|
-
response = await cli.post(self.url, params=self.params, headers=self.headers, data=self.data, json=self.json, timeout=self.timeout)
|
26
|
-
else:
|
27
|
-
raise Exception("仅支持 GET 和 POST 请求")
|
28
|
-
return response
|
29
|
-
|
30
|
-
def __lt__(self, other):
|
31
|
-
return self.priority < other.priority
|
1
|
+
import time
|
2
|
+
from typing import Callable
|
3
|
+
|
4
|
+
import httpx
|
5
|
+
|
6
|
+
cli = httpx.AsyncClient()
|
7
|
+
|
8
|
+
|
9
|
+
class Request:
|
10
|
+
def __init__(self, url: str, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None, priority=None):
|
11
|
+
self.url = url
|
12
|
+
self.callback = callback
|
13
|
+
self.params = params
|
14
|
+
self.headers = headers or {}
|
15
|
+
self.data = data
|
16
|
+
self.json = json
|
17
|
+
self.timeout = timeout
|
18
|
+
self.cb_kwargs = cb_kwargs or {}
|
19
|
+
self.priority = priority or time.time()
|
20
|
+
|
21
|
+
async def send(self):
|
22
|
+
if (self.data and self.json) is None:
|
23
|
+
response = await cli.get(self.url, params=self.params, headers=self.headers, timeout=self.timeout)
|
24
|
+
elif self.data or self.json:
|
25
|
+
response = await cli.post(self.url, params=self.params, headers=self.headers, data=self.data, json=self.json, timeout=self.timeout)
|
26
|
+
else:
|
27
|
+
raise Exception("仅支持 GET 和 POST 请求")
|
28
|
+
return response
|
29
|
+
|
30
|
+
def __lt__(self, other):
|
31
|
+
return self.priority < other.priority
|
coocan/url/response.py
CHANGED
@@ -1,50 +1,50 @@
|
|
1
|
-
from typing import Callable
|
2
|
-
|
3
|
-
from cocoman.spider.errors import ResponseCodeError, ResponseTextError
|
4
|
-
from httpx import Response
|
5
|
-
from parsel import Selector
|
6
|
-
|
7
|
-
|
8
|
-
class SelectorResponse(Response):
|
9
|
-
"""可以使用Xpath、CSS"""
|
10
|
-
|
11
|
-
def __init__(self, response: Response):
|
12
|
-
super().__init__(response.status_code)
|
13
|
-
self.__dict__.update(response.__dict__)
|
14
|
-
self.selector = Selector(text=response.text)
|
15
|
-
|
16
|
-
def __str__(self):
|
17
|
-
return "<Response {}>".format(self.status_code)
|
18
|
-
|
19
|
-
def xpath(self, query: str):
|
20
|
-
sel = self.selector.xpath(query)
|
21
|
-
return sel
|
22
|
-
|
23
|
-
def css(self, query: str):
|
24
|
-
sel = self.selector.css(query)
|
25
|
-
return sel
|
26
|
-
|
27
|
-
def get_one(self, query: str, default=None, strip=True):
|
28
|
-
v = self.selector.xpath(query).get(default=default)
|
29
|
-
return v.strip() if strip and isinstance(v, str) else v
|
30
|
-
|
31
|
-
def get_all(self, query: str, strip=True):
|
32
|
-
vs = [v.strip() if strip else v for v in self.selector.xpath(query).getall()]
|
33
|
-
return vs
|
34
|
-
|
35
|
-
def raise_for_status(self, codes: list = None):
|
36
|
-
codes = codes or [200]
|
37
|
-
if self.status_code not in codes:
|
38
|
-
raise ResponseCodeError("{} not in {}".format(self.status_code, codes))
|
39
|
-
|
40
|
-
def raise_for_text(self, validate: Callable[[str], bool]):
|
41
|
-
if validate(self.text) is False:
|
42
|
-
raise ResponseTextError("not ideal text")
|
43
|
-
|
44
|
-
def raise_has_text(self, text: str):
|
45
|
-
"""有此文本则抛出异常"""
|
46
|
-
assert self.text.find(text) == -1, ResponseTextError("has text: {}".format(text))
|
47
|
-
|
48
|
-
def raise_no_text(self, text: str):
|
49
|
-
"""无此文本则抛出异常"""
|
50
|
-
assert self.text.find(text) != -1, ResponseTextError("no text: {}".format(text))
|
1
|
+
from typing import Callable
|
2
|
+
|
3
|
+
from cocoman.spider.errors import ResponseCodeError, ResponseTextError
|
4
|
+
from httpx import Response
|
5
|
+
from parsel import Selector
|
6
|
+
|
7
|
+
|
8
|
+
class SelectorResponse(Response):
|
9
|
+
"""可以使用Xpath、CSS"""
|
10
|
+
|
11
|
+
def __init__(self, response: Response):
|
12
|
+
super().__init__(response.status_code)
|
13
|
+
self.__dict__.update(response.__dict__)
|
14
|
+
self.selector = Selector(text=response.text)
|
15
|
+
|
16
|
+
def __str__(self):
|
17
|
+
return "<Response {}>".format(self.status_code)
|
18
|
+
|
19
|
+
def xpath(self, query: str):
|
20
|
+
sel = self.selector.xpath(query)
|
21
|
+
return sel
|
22
|
+
|
23
|
+
def css(self, query: str):
|
24
|
+
sel = self.selector.css(query)
|
25
|
+
return sel
|
26
|
+
|
27
|
+
def get_one(self, query: str, default=None, strip=True):
|
28
|
+
v = self.selector.xpath(query).get(default=default)
|
29
|
+
return v.strip() if strip and isinstance(v, str) else v
|
30
|
+
|
31
|
+
def get_all(self, query: str, strip=True):
|
32
|
+
vs = [v.strip() if strip else v for v in self.selector.xpath(query).getall()]
|
33
|
+
return vs
|
34
|
+
|
35
|
+
def raise_for_status(self, codes: list = None):
|
36
|
+
codes = codes or [200]
|
37
|
+
if self.status_code not in codes:
|
38
|
+
raise ResponseCodeError("{} not in {}".format(self.status_code, codes))
|
39
|
+
|
40
|
+
def raise_for_text(self, validate: Callable[[str], bool]):
|
41
|
+
if validate(self.text) is False:
|
42
|
+
raise ResponseTextError("not ideal text")
|
43
|
+
|
44
|
+
def raise_has_text(self, text: str):
|
45
|
+
"""有此文本则抛出异常"""
|
46
|
+
assert self.text.find(text) == -1, ResponseTextError("has text: {}".format(text))
|
47
|
+
|
48
|
+
def raise_no_text(self, text: str):
|
49
|
+
"""无此文本则抛出异常"""
|
50
|
+
assert self.text.find(text) != -1, ResponseTextError("no text: {}".format(text))
|