coocan 0.5.5__py3-none-any.whl → 0.5.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coocan/__init__.py +2 -2
- coocan/_examples/crawl_csdn_detail.py +62 -62
- coocan/_examples/crawl_csdn_list.py +50 -50
- coocan/_examples/recv_item.py +31 -31
- coocan/_examples/use_proxy.py +22 -0
- coocan/_examples/view_local_ip.py +22 -22
- coocan/cmd/cli.py +68 -68
- coocan/gen.py +33 -33
- coocan/push_project.py +12 -12
- coocan/spider/__init__.py +1 -1
- coocan/spider/base.py +177 -177
- coocan/templates/spider.txt +17 -17
- coocan/url/__init__.py +2 -2
- coocan/url/request.py +46 -31
- coocan/url/response.py +50 -50
- {coocan-0.5.5.dist-info → coocan-0.5.6.1.dist-info}/METADATA +94 -99
- coocan-0.5.6.1.dist-info/RECORD +21 -0
- {coocan-0.5.5.dist-info → coocan-0.5.6.1.dist-info}/WHEEL +1 -1
- {coocan-0.5.5.dist-info → coocan-0.5.6.1.dist-info}/top_level.txt +0 -1
- _test/crawl_csdn.py +0 -53
- _test/demo.py +0 -33
- _test/err_demo.py +0 -27
- _test/test_priority.py +0 -21
- _test/test_req_delay.py +0 -19
- _test/test_req_err.py +0 -32
- coocan-0.5.5.dist-info/RECORD +0 -26
- {coocan-0.5.5.dist-info → coocan-0.5.6.1.dist-info}/entry_points.txt +0 -0
@@ -1,99 +1,94 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: coocan
|
3
|
-
Version: 0.5.
|
4
|
-
Summary: Air Async Spider Framework
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
Requires-
|
11
|
-
|
12
|
-
Requires-Dist:
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
import
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
if __name__ == '__main__':
|
97
|
-
s = CSDNDetailSpider()
|
98
|
-
s.go()
|
99
|
-
```
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: coocan
|
3
|
+
Version: 0.5.6.1
|
4
|
+
Summary: Air Async Spider Framework
|
5
|
+
Author-email: wauo <markadc@126.com>
|
6
|
+
License-Expression: MIT
|
7
|
+
Project-URL: Homepage, https://github.com/markadc/coocan
|
8
|
+
Requires-Python: >=3.10
|
9
|
+
Description-Content-Type: text/markdown
|
10
|
+
Requires-Dist: click>=8.0.0
|
11
|
+
Requires-Dist: httpx
|
12
|
+
Requires-Dist: loguru
|
13
|
+
|
14
|
+
# 项目说明
|
15
|
+
|
16
|
+
- 一个非常轻量的异步爬虫框架
|
17
|
+
|
18
|
+
# 项目地址
|
19
|
+
|
20
|
+
- https://github.com/markadc/coocan
|
21
|
+
|
22
|
+
## demo
|
23
|
+
|
24
|
+
- 效果
|
25
|
+
<br>
|
26
|
+

|
27
|
+
|
28
|
+
|
29
|
+
- 代码
|
30
|
+
|
31
|
+
```python
|
32
|
+
import json
|
33
|
+
|
34
|
+
from loguru import logger
|
35
|
+
|
36
|
+
import coocan
|
37
|
+
from coocan import Request, MiniSpider
|
38
|
+
|
39
|
+
|
40
|
+
class CSDNDetailSpider(MiniSpider):
|
41
|
+
start_urls = ['http://www.csdn.net']
|
42
|
+
max_requests = 10
|
43
|
+
|
44
|
+
def middleware(self, request: Request):
|
45
|
+
request.headers["Referer"] = "http://www.csdn.net/"
|
46
|
+
|
47
|
+
def parse(self, response):
|
48
|
+
api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
|
49
|
+
params = {
|
50
|
+
"page": "1",
|
51
|
+
"size": "20",
|
52
|
+
"businessType": "lately",
|
53
|
+
"noMore": "false",
|
54
|
+
"username": "markadc"
|
55
|
+
}
|
56
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
57
|
+
|
58
|
+
def parse_page(self, response, api, params):
|
59
|
+
current_page = params["page"]
|
60
|
+
data = json.loads(response.text)
|
61
|
+
some = data["data"]["list"]
|
62
|
+
|
63
|
+
if not some:
|
64
|
+
logger.warning("没有第 {} 页".format(current_page))
|
65
|
+
return
|
66
|
+
|
67
|
+
for one in some:
|
68
|
+
date = one["formatTime"]
|
69
|
+
name = one["title"]
|
70
|
+
detail_url = one["url"]
|
71
|
+
logger.info(
|
72
|
+
"""
|
73
|
+
{}
|
74
|
+
{}
|
75
|
+
{}
|
76
|
+
""".format(date, name, detail_url)
|
77
|
+
)
|
78
|
+
yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
|
79
|
+
|
80
|
+
logger.info("第 {} 页抓取成功".format(params["page"]))
|
81
|
+
|
82
|
+
# 抓取下一页
|
83
|
+
next_page = int(current_page) + 1
|
84
|
+
params["page"] = str(next_page)
|
85
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
86
|
+
|
87
|
+
def parse_detail(self, response, title):
|
88
|
+
logger.success("{} 已访问 {}".format(response.status_code, title))
|
89
|
+
|
90
|
+
|
91
|
+
if __name__ == '__main__':
|
92
|
+
s = CSDNDetailSpider()
|
93
|
+
s.go()
|
94
|
+
```
|
@@ -0,0 +1,21 @@
|
|
1
|
+
coocan/__init__.py,sha256=R1gUEUP9v_2iA1gE5twrxr-XRnPhP9EFftLrDeudAoA,53
|
2
|
+
coocan/gen.py,sha256=4MCE3t24m3-rbq2snAzByPe58VAo-ShWn58iXpcEiBE,995
|
3
|
+
coocan/push_project.py,sha256=5filLp6ol_W7NapcvB3kHFlBm5Nq_6kYS0eb9mo0RbI,249
|
4
|
+
coocan/_examples/crawl_csdn_detail.py,sha256=S3lGihGZF-6KI_Kg5H23BQ9cVzOkZwKrS78n1lYOcAg,1830
|
5
|
+
coocan/_examples/crawl_csdn_list.py,sha256=D7j5W0WM_52PoWv-2KLuts2r4rabMXavbMS3wnIg6Gk,1454
|
6
|
+
coocan/_examples/recv_item.py,sha256=Iym6RbvL7j87SvK14Hw2Exvxx047jEF4zQV9yo4ZXF4,976
|
7
|
+
coocan/_examples/use_proxy.py,sha256=nybPmGHKvn3ZX6yICukDYtXW0NXfyFSrICeaD2rMNP4,575
|
8
|
+
coocan/_examples/view_local_ip.py,sha256=AcatCwtFF6NeYdzbvD8SXun0wn0IL0pX2D3GWUlQ0Sc,560
|
9
|
+
coocan/cmd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
+
coocan/cmd/cli.py,sha256=T7U0QUtAC3O_ZAsglZoOo3nqUPmfceJWbkdso-SwI18,2412
|
11
|
+
coocan/spider/__init__.py,sha256=nqERS5a5eSgJfBiIp5moZvwS6JOToNCM_4kreRrtmaQ,57
|
12
|
+
coocan/spider/base.py,sha256=C26B7orcmGczVDq8MmAVH_VCgEQ17gszBZQaXflXkzg,6165
|
13
|
+
coocan/templates/spider.txt,sha256=1wcbmnv9mBi-21pdygDMukiMy6lEAbvYVRhVOfNY99k,463
|
14
|
+
coocan/url/__init__.py,sha256=KN0lLNVaAISoITrPyjD2HOf2A9UYb-9Bbw4xfs9Zqk4,100
|
15
|
+
coocan/url/request.py,sha256=1b7K3rDMxH9_LCx9yfoUART_Ntzm16kbCDjSMOE7wAM,1386
|
16
|
+
coocan/url/response.py,sha256=ruIzOcFcJqszTmbNV9y1BSxdWeyVydsMJd-cDMTJHLo,1735
|
17
|
+
coocan-0.5.6.1.dist-info/METADATA,sha256=toWhwGXOQhdU9Y2UBs2ltj-deB8hEX82jIT2nVKX0_o,2364
|
18
|
+
coocan-0.5.6.1.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
|
19
|
+
coocan-0.5.6.1.dist-info/entry_points.txt,sha256=hNdk42NPboC1o7s7GzMbpII5t2U2jWrtT5bpvliXRcw,47
|
20
|
+
coocan-0.5.6.1.dist-info/top_level.txt,sha256=VwB-Q4zEljgb9v1Ms1E59B-1pBYORXuhKjgZb-LHOhk,7
|
21
|
+
coocan-0.5.6.1.dist-info/RECORD,,
|
_test/crawl_csdn.py
DELETED
@@ -1,53 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
|
3
|
-
from loguru import logger
|
4
|
-
|
5
|
-
import coocan
|
6
|
-
from coocan import Request, MiniSpider
|
7
|
-
|
8
|
-
api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
|
9
|
-
params = {
|
10
|
-
"page": "1",
|
11
|
-
"size": "20",
|
12
|
-
"businessType": "lately",
|
13
|
-
"noMore": "false",
|
14
|
-
"username": "markadc"
|
15
|
-
}
|
16
|
-
|
17
|
-
|
18
|
-
class CsdnAirAsyncSpider(MiniSpider):
|
19
|
-
start_urls = ['http://www.csdn.net']
|
20
|
-
max_requests = 10
|
21
|
-
|
22
|
-
def parse(self, response):
|
23
|
-
yield coocan.Request(api, self.parse_page, params=params)
|
24
|
-
|
25
|
-
def middleware(self, request: Request):
|
26
|
-
request.headers["Referer"] = "http://www.csdn.net/"
|
27
|
-
|
28
|
-
def parse_page(self, response):
|
29
|
-
current_page = params["page"]
|
30
|
-
data = json.loads(response.text)
|
31
|
-
some = data["data"]["list"]
|
32
|
-
if not some:
|
33
|
-
logger.warning("没有第 {} 页".format(current_page))
|
34
|
-
return
|
35
|
-
for one in some:
|
36
|
-
date = one["formatTime"]
|
37
|
-
name = one["title"]
|
38
|
-
detail_url = one["url"]
|
39
|
-
yield coocan.Request(detail_url, self.parse_detail)
|
40
|
-
print(date, detail_url, name)
|
41
|
-
logger.info("第 {} 页抓取成功".format(params["page"]))
|
42
|
-
|
43
|
-
next_page = int(current_page) + 1
|
44
|
-
params["page"] = str(next_page)
|
45
|
-
yield coocan.Request(api, self.parse_page, params=params)
|
46
|
-
|
47
|
-
def parse_detail(self, response):
|
48
|
-
logger.success("{} {}".format(response.status_code, response.request.url))
|
49
|
-
|
50
|
-
|
51
|
-
if __name__ == '__main__':
|
52
|
-
s = CsdnAirAsyncSpider()
|
53
|
-
s.go()
|
_test/demo.py
DELETED
@@ -1,33 +0,0 @@
|
|
1
|
-
from loguru import logger
|
2
|
-
|
3
|
-
import coocan
|
4
|
-
|
5
|
-
|
6
|
-
class DemoSpider(coocan.MiniSpider):
|
7
|
-
start_urls = ["https://cn.bing.com/"]
|
8
|
-
max_requests = 5
|
9
|
-
|
10
|
-
def parse(self, response):
|
11
|
-
print(response.request.headers.get("User-Agent"))
|
12
|
-
logger.debug('{} {}'.format(response.status_code, len(response.text)))
|
13
|
-
for i in range(5):
|
14
|
-
yield coocan.Request('https://cn.bing.com/', self.parse2)
|
15
|
-
|
16
|
-
def parse2(self, response):
|
17
|
-
logger.info('{} {}'.format(response.status_code, len(response.text)))
|
18
|
-
for i in range(3):
|
19
|
-
yield coocan.Request('https://cn.bing.com/', self.parse3)
|
20
|
-
|
21
|
-
for i in range(4):
|
22
|
-
yield coocan.Request('https://cn.bing.com/', self.parse4)
|
23
|
-
|
24
|
-
def parse3(self, response):
|
25
|
-
logger.warning('{} {}'.format(response.status_code, len(response.text)))
|
26
|
-
|
27
|
-
def parse4(self, response):
|
28
|
-
logger.error('{} {}'.format(response.status_code, len(response.text)))
|
29
|
-
|
30
|
-
|
31
|
-
if __name__ == '__main__':
|
32
|
-
my_spider = DemoSpider()
|
33
|
-
my_spider.go()
|
_test/err_demo.py
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
from loguru import logger
|
2
|
-
|
3
|
-
import coocan
|
4
|
-
from coocan.spider import MiniSpider
|
5
|
-
|
6
|
-
|
7
|
-
class ErrDemoSpider(MiniSpider):
|
8
|
-
start_urls = ["https://cn.bing.com/"]
|
9
|
-
max_requests = 5
|
10
|
-
|
11
|
-
def parse(self, response):
|
12
|
-
print(response.request.headers.get("User-Agent"))
|
13
|
-
logger.debug('{} {}'.format(response.status_code, len(response.text)))
|
14
|
-
yield coocan.Request('https://cn.bing.com/', self.parse2, cb_kwargs={"name": "CLOS"})
|
15
|
-
|
16
|
-
def parse2(self, response, name):
|
17
|
-
print(name)
|
18
|
-
logger.debug('{} {}'.format(response.status_code, len(response.text)))
|
19
|
-
yield coocan.Request('https://cn.bing.com/', self.parse3, cb_kwargs={"a1": 1, "a2": 2})
|
20
|
-
|
21
|
-
def parse3(self, response, a1, a22):
|
22
|
-
print(a1, a22)
|
23
|
-
|
24
|
-
|
25
|
-
if __name__ == '__main__':
|
26
|
-
my_spider = ErrDemoSpider()
|
27
|
-
my_spider.go()
|
_test/test_priority.py
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
from coocan import MiniSpider, Request, Response
|
2
|
-
|
3
|
-
|
4
|
-
class TestPrioritySpider(MiniSpider):
|
5
|
-
headers_extra_field = {"Name": "Coocan"}
|
6
|
-
|
7
|
-
def start_requests(self):
|
8
|
-
for i in range(100):
|
9
|
-
url = 'https://www.baidu.com/s?w={}'.format(i)
|
10
|
-
yield Request(url, callback=self.parse, priority=100 - i)
|
11
|
-
|
12
|
-
def parse(self, response: Response):
|
13
|
-
print(response.request.url)
|
14
|
-
print(response.request.headers["User-Agent"])
|
15
|
-
print(response.request.headers)
|
16
|
-
print()
|
17
|
-
|
18
|
-
|
19
|
-
if __name__ == '__main__':
|
20
|
-
s = TestPrioritySpider()
|
21
|
-
s.go()
|
_test/test_req_delay.py
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
from coocan import MiniSpider, Request, Response
|
2
|
-
|
3
|
-
|
4
|
-
class TestReqDelaySpider(MiniSpider):
|
5
|
-
max_requests = 5
|
6
|
-
delay = 3
|
7
|
-
|
8
|
-
def start_requests(self):
|
9
|
-
for i in range(100):
|
10
|
-
url = 'https://www.baidu.com/s?w={}'.format(i)
|
11
|
-
yield Request(url, callback=self.parse, priority=100 - i)
|
12
|
-
|
13
|
-
def parse(self, response: Response):
|
14
|
-
print(response.request.url)
|
15
|
-
|
16
|
-
|
17
|
-
if __name__ == '__main__':
|
18
|
-
s = TestReqDelaySpider()
|
19
|
-
s.go()
|
_test/test_req_err.py
DELETED
@@ -1,32 +0,0 @@
|
|
1
|
-
import random
|
2
|
-
|
3
|
-
from coocan import MiniSpider, Request, Response, IgnoreRequest
|
4
|
-
|
5
|
-
|
6
|
-
class TestReqErrSpider(MiniSpider):
|
7
|
-
def start_requests(self):
|
8
|
-
for i in range(5):
|
9
|
-
url = "https://www.google.com/{}".format(i + 1)
|
10
|
-
yield Request(url, callback=self.parse, timeout=1)
|
11
|
-
|
12
|
-
def handle_request_excetpion(self, e: Exception, request: Request):
|
13
|
-
v = random.randint(1, 3)
|
14
|
-
if v == 1:
|
15
|
-
raise IgnoreRequest("出验证码了")
|
16
|
-
if v == 2:
|
17
|
-
1 / 0
|
18
|
-
if v == 3:
|
19
|
-
new_url = "https://www.baidu.com/s?wd={}".format(random.randint(1, 100))
|
20
|
-
return Request(new_url, callback=self.parse, timeout=1)
|
21
|
-
|
22
|
-
def parse(self, response: Response):
|
23
|
-
v = random.randint(1, 2)
|
24
|
-
if v == 1:
|
25
|
-
print("爬取成功", response.url, len(response.text))
|
26
|
-
print(response.get_one("//title/text()"))
|
27
|
-
aaa
|
28
|
-
|
29
|
-
|
30
|
-
if __name__ == '__main__':
|
31
|
-
my_spider = TestReqErrSpider()
|
32
|
-
my_spider.go()
|
coocan-0.5.5.dist-info/RECORD
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
_test/crawl_csdn.py,sha256=ap2mOq3ps7KEbqqKWH5uJqIK_IQ8YFSRRAMzpreQvww,1555
|
2
|
-
_test/demo.py,sha256=ZxfJzWuNVGhDjhUruyVyZ-BoULHMbzgpnxefLSkheZI,1051
|
3
|
-
_test/err_demo.py,sha256=EWGqb00KyB192qv3uxMr6YgOr2zKJQb6gkeFtknMLv8,845
|
4
|
-
_test/test_priority.py,sha256=K8JLC-PaVM4ztLZdYFCumDQP5m2hB8qWAIWXTOMMUyM,601
|
5
|
-
_test/test_req_delay.py,sha256=35afyHcZk3Gmja9xXXjJSHXnU8WVJGph2ZcTQRxRMNk,479
|
6
|
-
_test/test_req_err.py,sha256=magK1BUConCBj8TEC29rzmDCbI2u2XXVcPowL6ttP9g,1025
|
7
|
-
coocan/__init__.py,sha256=UqFmE7ucuR_xR3OyyBU8pxqLfCJ5AdH_HsDdTsYPf6g,55
|
8
|
-
coocan/gen.py,sha256=J6QWXkBVbiCQqey8i0BDqleRNpBswI8AyvrYmkDVQPw,1028
|
9
|
-
coocan/push_project.py,sha256=X2fjtYk1oI0ElcibA9wChLx0lCc8hwSelhUNfkJal5o,220
|
10
|
-
coocan/_examples/crawl_csdn_detail.py,sha256=J2hiKHCS7RskQ9UmNMjE8i6braFwGchH6BxtdulV9RM,1892
|
11
|
-
coocan/_examples/crawl_csdn_list.py,sha256=ZvhFvBbVXQe-qtXf1T_waXuM4tBleBqbpvzP-5z0RCg,1504
|
12
|
-
coocan/_examples/recv_item.py,sha256=iJqPuHZ2FykeleFl0Xr0yPwq4UhCnNw84lCPlYyGFzM,1007
|
13
|
-
coocan/_examples/view_local_ip.py,sha256=Sl086xNNuZqFoRM31_gMvcISSa2QoL3OGghECkQktxg,582
|
14
|
-
coocan/cmd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
coocan/cmd/cli.py,sha256=FRggXqDeLsGs-7u3zhaokfk0GCpItwqudf14W9zUfYE,2480
|
16
|
-
coocan/spider/__init__.py,sha256=kMDCGeqtN50raCzwfCn18s_W8xV6KO_Ny9Xol4I48Ag,58
|
17
|
-
coocan/spider/base.py,sha256=9Dgn2920Lb9TZGV0cAZSBMvIWuTMqs9M8ZYspx9W0Io,6342
|
18
|
-
coocan/templates/spider.txt,sha256=5UEXUzb0ses_4ctn0b3vgbpUJ7tCde91ul6rp-g7Hxw,480
|
19
|
-
coocan/url/__init__.py,sha256=rEMx66XDy5AIJ9mF_2UVzHW5mRLBAWZEyQ3txrZzuZA,102
|
20
|
-
coocan/url/request.py,sha256=seZaQXQRvRMIf9WnCp3mAgNA-kxsj9P2JzAvuIt2Dx8,1116
|
21
|
-
coocan/url/response.py,sha256=AnC0xsF34q68r62EVlcHYmDH6skm9RBwRHITTb4iBbU,1785
|
22
|
-
coocan-0.5.5.dist-info/METADATA,sha256=aYnL3IygL_Dw7iwvDtK81oVQRWIygXmiiSO94C01Bv4,2579
|
23
|
-
coocan-0.5.5.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
|
24
|
-
coocan-0.5.5.dist-info/entry_points.txt,sha256=hNdk42NPboC1o7s7GzMbpII5t2U2jWrtT5bpvliXRcw,47
|
25
|
-
coocan-0.5.5.dist-info/top_level.txt,sha256=WiN3Gh529qzUs0jVvEReeZsKxFguIQKrFlMOjtxGblM,13
|
26
|
-
coocan-0.5.5.dist-info/RECORD,,
|
File without changes
|