coocan 0.5.5__py3-none-any.whl → 0.5.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,99 +1,94 @@
1
- Metadata-Version: 2.4
2
- Name: coocan
3
- Version: 0.5.5
4
- Summary: Air Async Spider Framework
5
- Home-page: https://github.com/markadc/coocan
6
- Author: wauo
7
- Author-email: wauo <markadc@126.com>
8
- License-Expression: MIT
9
- Project-URL: Homepage, https://github.com/markadc/coocan
10
- Requires-Python: >=3.10
11
- Description-Content-Type: text/markdown
12
- Requires-Dist: click>=8.0.0
13
- Requires-Dist: httpx
14
- Requires-Dist: loguru
15
- Dynamic: author
16
- Dynamic: home-page
17
- Dynamic: requires-python
18
-
19
- # 项目说明
20
-
21
- - 一个非常轻量的异步爬虫框架
22
-
23
- # 项目地址
24
-
25
- - https://github.com/markadc/coocan
26
-
27
- ## demo
28
-
29
- - 效果
30
- <br>
31
- ![效果](coocan/demo.gif)
32
-
33
-
34
- - 代码
35
-
36
- ```python
37
- import json
38
-
39
- from loguru import logger
40
-
41
- import coocan
42
- from coocan import Request, MiniSpider
43
-
44
-
45
- class CSDNDetailSpider(MiniSpider):
46
- start_urls = ['http://www.csdn.net']
47
- max_requests = 10
48
-
49
- def middleware(self, request: Request):
50
- request.headers["Referer"] = "http://www.csdn.net/"
51
-
52
- def parse(self, response):
53
- api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
54
- params = {
55
- "page": "1",
56
- "size": "20",
57
- "businessType": "lately",
58
- "noMore": "false",
59
- "username": "markadc"
60
- }
61
- yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
62
-
63
- def parse_page(self, response, api, params):
64
- current_page = params["page"]
65
- data = json.loads(response.text)
66
- some = data["data"]["list"]
67
-
68
- if not some:
69
- logger.warning("没有第 {} ".format(current_page))
70
- return
71
-
72
- for one in some:
73
- date = one["formatTime"]
74
- name = one["title"]
75
- detail_url = one["url"]
76
- logger.info(
77
- """
78
- {}
79
- {}
80
- {}
81
- """.format(date, name, detail_url)
82
- )
83
- yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
84
-
85
- logger.info("第 {} 页抓取成功".format(params["page"]))
86
-
87
- # 抓取下一页
88
- next_page = int(current_page) + 1
89
- params["page"] = str(next_page)
90
- yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
91
-
92
- def parse_detail(self, response, title):
93
- logger.success("{} 已访问 {}".format(response.status_code, title))
94
-
95
-
96
- if __name__ == '__main__':
97
- s = CSDNDetailSpider()
98
- s.go()
99
- ```
1
+ Metadata-Version: 2.4
2
+ Name: coocan
3
+ Version: 0.5.6.1
4
+ Summary: Air Async Spider Framework
5
+ Author-email: wauo <markadc@126.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/markadc/coocan
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: click>=8.0.0
11
+ Requires-Dist: httpx
12
+ Requires-Dist: loguru
13
+
14
+ # 项目说明
15
+
16
+ - 一个非常轻量的异步爬虫框架
17
+
18
+ # 项目地址
19
+
20
+ - https://github.com/markadc/coocan
21
+
22
+ ## demo
23
+
24
+ - 效果
25
+ <br>
26
+ ![效果](coocan/demo.gif)
27
+
28
+
29
+ - 代码
30
+
31
+ ```python
32
+ import json
33
+
34
+ from loguru import logger
35
+
36
+ import coocan
37
+ from coocan import Request, MiniSpider
38
+
39
+
40
+ class CSDNDetailSpider(MiniSpider):
41
+ start_urls = ['http://www.csdn.net']
42
+ max_requests = 10
43
+
44
+ def middleware(self, request: Request):
45
+ request.headers["Referer"] = "http://www.csdn.net/"
46
+
47
+ def parse(self, response):
48
+ api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
49
+ params = {
50
+ "page": "1",
51
+ "size": "20",
52
+ "businessType": "lately",
53
+ "noMore": "false",
54
+ "username": "markadc"
55
+ }
56
+ yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
57
+
58
+ def parse_page(self, response, api, params):
59
+ current_page = params["page"]
60
+ data = json.loads(response.text)
61
+ some = data["data"]["list"]
62
+
63
+ if not some:
64
+ logger.warning("没有第 {} ".format(current_page))
65
+ return
66
+
67
+ for one in some:
68
+ date = one["formatTime"]
69
+ name = one["title"]
70
+ detail_url = one["url"]
71
+ logger.info(
72
+ """
73
+ {}
74
+ {}
75
+ {}
76
+ """.format(date, name, detail_url)
77
+ )
78
+ yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
79
+
80
+ logger.info("第 {} 页抓取成功".format(params["page"]))
81
+
82
+ # 抓取下一页
83
+ next_page = int(current_page) + 1
84
+ params["page"] = str(next_page)
85
+ yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
86
+
87
+ def parse_detail(self, response, title):
88
+ logger.success("{} 已访问 {}".format(response.status_code, title))
89
+
90
+
91
+ if __name__ == '__main__':
92
+ s = CSDNDetailSpider()
93
+ s.go()
94
+ ```
@@ -0,0 +1,21 @@
1
+ coocan/__init__.py,sha256=R1gUEUP9v_2iA1gE5twrxr-XRnPhP9EFftLrDeudAoA,53
2
+ coocan/gen.py,sha256=4MCE3t24m3-rbq2snAzByPe58VAo-ShWn58iXpcEiBE,995
3
+ coocan/push_project.py,sha256=5filLp6ol_W7NapcvB3kHFlBm5Nq_6kYS0eb9mo0RbI,249
4
+ coocan/_examples/crawl_csdn_detail.py,sha256=S3lGihGZF-6KI_Kg5H23BQ9cVzOkZwKrS78n1lYOcAg,1830
5
+ coocan/_examples/crawl_csdn_list.py,sha256=D7j5W0WM_52PoWv-2KLuts2r4rabMXavbMS3wnIg6Gk,1454
6
+ coocan/_examples/recv_item.py,sha256=Iym6RbvL7j87SvK14Hw2Exvxx047jEF4zQV9yo4ZXF4,976
7
+ coocan/_examples/use_proxy.py,sha256=nybPmGHKvn3ZX6yICukDYtXW0NXfyFSrICeaD2rMNP4,575
8
+ coocan/_examples/view_local_ip.py,sha256=AcatCwtFF6NeYdzbvD8SXun0wn0IL0pX2D3GWUlQ0Sc,560
9
+ coocan/cmd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ coocan/cmd/cli.py,sha256=T7U0QUtAC3O_ZAsglZoOo3nqUPmfceJWbkdso-SwI18,2412
11
+ coocan/spider/__init__.py,sha256=nqERS5a5eSgJfBiIp5moZvwS6JOToNCM_4kreRrtmaQ,57
12
+ coocan/spider/base.py,sha256=C26B7orcmGczVDq8MmAVH_VCgEQ17gszBZQaXflXkzg,6165
13
+ coocan/templates/spider.txt,sha256=1wcbmnv9mBi-21pdygDMukiMy6lEAbvYVRhVOfNY99k,463
14
+ coocan/url/__init__.py,sha256=KN0lLNVaAISoITrPyjD2HOf2A9UYb-9Bbw4xfs9Zqk4,100
15
+ coocan/url/request.py,sha256=1b7K3rDMxH9_LCx9yfoUART_Ntzm16kbCDjSMOE7wAM,1386
16
+ coocan/url/response.py,sha256=ruIzOcFcJqszTmbNV9y1BSxdWeyVydsMJd-cDMTJHLo,1735
17
+ coocan-0.5.6.1.dist-info/METADATA,sha256=toWhwGXOQhdU9Y2UBs2ltj-deB8hEX82jIT2nVKX0_o,2364
18
+ coocan-0.5.6.1.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
19
+ coocan-0.5.6.1.dist-info/entry_points.txt,sha256=hNdk42NPboC1o7s7GzMbpII5t2U2jWrtT5bpvliXRcw,47
20
+ coocan-0.5.6.1.dist-info/top_level.txt,sha256=VwB-Q4zEljgb9v1Ms1E59B-1pBYORXuhKjgZb-LHOhk,7
21
+ coocan-0.5.6.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.0.0)
2
+ Generator: setuptools (80.7.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
_test/crawl_csdn.py DELETED
@@ -1,53 +0,0 @@
1
- import json
2
-
3
- from loguru import logger
4
-
5
- import coocan
6
- from coocan import Request, MiniSpider
7
-
8
- api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
9
- params = {
10
- "page": "1",
11
- "size": "20",
12
- "businessType": "lately",
13
- "noMore": "false",
14
- "username": "markadc"
15
- }
16
-
17
-
18
- class CsdnAirAsyncSpider(MiniSpider):
19
- start_urls = ['http://www.csdn.net']
20
- max_requests = 10
21
-
22
- def parse(self, response):
23
- yield coocan.Request(api, self.parse_page, params=params)
24
-
25
- def middleware(self, request: Request):
26
- request.headers["Referer"] = "http://www.csdn.net/"
27
-
28
- def parse_page(self, response):
29
- current_page = params["page"]
30
- data = json.loads(response.text)
31
- some = data["data"]["list"]
32
- if not some:
33
- logger.warning("没有第 {} 页".format(current_page))
34
- return
35
- for one in some:
36
- date = one["formatTime"]
37
- name = one["title"]
38
- detail_url = one["url"]
39
- yield coocan.Request(detail_url, self.parse_detail)
40
- print(date, detail_url, name)
41
- logger.info("第 {} 页抓取成功".format(params["page"]))
42
-
43
- next_page = int(current_page) + 1
44
- params["page"] = str(next_page)
45
- yield coocan.Request(api, self.parse_page, params=params)
46
-
47
- def parse_detail(self, response):
48
- logger.success("{} {}".format(response.status_code, response.request.url))
49
-
50
-
51
- if __name__ == '__main__':
52
- s = CsdnAirAsyncSpider()
53
- s.go()
_test/demo.py DELETED
@@ -1,33 +0,0 @@
1
- from loguru import logger
2
-
3
- import coocan
4
-
5
-
6
- class DemoSpider(coocan.MiniSpider):
7
- start_urls = ["https://cn.bing.com/"]
8
- max_requests = 5
9
-
10
- def parse(self, response):
11
- print(response.request.headers.get("User-Agent"))
12
- logger.debug('{} {}'.format(response.status_code, len(response.text)))
13
- for i in range(5):
14
- yield coocan.Request('https://cn.bing.com/', self.parse2)
15
-
16
- def parse2(self, response):
17
- logger.info('{} {}'.format(response.status_code, len(response.text)))
18
- for i in range(3):
19
- yield coocan.Request('https://cn.bing.com/', self.parse3)
20
-
21
- for i in range(4):
22
- yield coocan.Request('https://cn.bing.com/', self.parse4)
23
-
24
- def parse3(self, response):
25
- logger.warning('{} {}'.format(response.status_code, len(response.text)))
26
-
27
- def parse4(self, response):
28
- logger.error('{} {}'.format(response.status_code, len(response.text)))
29
-
30
-
31
- if __name__ == '__main__':
32
- my_spider = DemoSpider()
33
- my_spider.go()
_test/err_demo.py DELETED
@@ -1,27 +0,0 @@
1
- from loguru import logger
2
-
3
- import coocan
4
- from coocan.spider import MiniSpider
5
-
6
-
7
- class ErrDemoSpider(MiniSpider):
8
- start_urls = ["https://cn.bing.com/"]
9
- max_requests = 5
10
-
11
- def parse(self, response):
12
- print(response.request.headers.get("User-Agent"))
13
- logger.debug('{} {}'.format(response.status_code, len(response.text)))
14
- yield coocan.Request('https://cn.bing.com/', self.parse2, cb_kwargs={"name": "CLOS"})
15
-
16
- def parse2(self, response, name):
17
- print(name)
18
- logger.debug('{} {}'.format(response.status_code, len(response.text)))
19
- yield coocan.Request('https://cn.bing.com/', self.parse3, cb_kwargs={"a1": 1, "a2": 2})
20
-
21
- def parse3(self, response, a1, a22):
22
- print(a1, a22)
23
-
24
-
25
- if __name__ == '__main__':
26
- my_spider = ErrDemoSpider()
27
- my_spider.go()
_test/test_priority.py DELETED
@@ -1,21 +0,0 @@
1
- from coocan import MiniSpider, Request, Response
2
-
3
-
4
- class TestPrioritySpider(MiniSpider):
5
- headers_extra_field = {"Name": "Coocan"}
6
-
7
- def start_requests(self):
8
- for i in range(100):
9
- url = 'https://www.baidu.com/s?w={}'.format(i)
10
- yield Request(url, callback=self.parse, priority=100 - i)
11
-
12
- def parse(self, response: Response):
13
- print(response.request.url)
14
- print(response.request.headers["User-Agent"])
15
- print(response.request.headers)
16
- print()
17
-
18
-
19
- if __name__ == '__main__':
20
- s = TestPrioritySpider()
21
- s.go()
_test/test_req_delay.py DELETED
@@ -1,19 +0,0 @@
1
- from coocan import MiniSpider, Request, Response
2
-
3
-
4
- class TestReqDelaySpider(MiniSpider):
5
- max_requests = 5
6
- delay = 3
7
-
8
- def start_requests(self):
9
- for i in range(100):
10
- url = 'https://www.baidu.com/s?w={}'.format(i)
11
- yield Request(url, callback=self.parse, priority=100 - i)
12
-
13
- def parse(self, response: Response):
14
- print(response.request.url)
15
-
16
-
17
- if __name__ == '__main__':
18
- s = TestReqDelaySpider()
19
- s.go()
_test/test_req_err.py DELETED
@@ -1,32 +0,0 @@
1
- import random
2
-
3
- from coocan import MiniSpider, Request, Response, IgnoreRequest
4
-
5
-
6
- class TestReqErrSpider(MiniSpider):
7
- def start_requests(self):
8
- for i in range(5):
9
- url = "https://www.google.com/{}".format(i + 1)
10
- yield Request(url, callback=self.parse, timeout=1)
11
-
12
- def handle_request_excetpion(self, e: Exception, request: Request):
13
- v = random.randint(1, 3)
14
- if v == 1:
15
- raise IgnoreRequest("出验证码了")
16
- if v == 2:
17
- 1 / 0
18
- if v == 3:
19
- new_url = "https://www.baidu.com/s?wd={}".format(random.randint(1, 100))
20
- return Request(new_url, callback=self.parse, timeout=1)
21
-
22
- def parse(self, response: Response):
23
- v = random.randint(1, 2)
24
- if v == 1:
25
- print("爬取成功", response.url, len(response.text))
26
- print(response.get_one("//title/text()"))
27
- aaa
28
-
29
-
30
- if __name__ == '__main__':
31
- my_spider = TestReqErrSpider()
32
- my_spider.go()
@@ -1,26 +0,0 @@
1
- _test/crawl_csdn.py,sha256=ap2mOq3ps7KEbqqKWH5uJqIK_IQ8YFSRRAMzpreQvww,1555
2
- _test/demo.py,sha256=ZxfJzWuNVGhDjhUruyVyZ-BoULHMbzgpnxefLSkheZI,1051
3
- _test/err_demo.py,sha256=EWGqb00KyB192qv3uxMr6YgOr2zKJQb6gkeFtknMLv8,845
4
- _test/test_priority.py,sha256=K8JLC-PaVM4ztLZdYFCumDQP5m2hB8qWAIWXTOMMUyM,601
5
- _test/test_req_delay.py,sha256=35afyHcZk3Gmja9xXXjJSHXnU8WVJGph2ZcTQRxRMNk,479
6
- _test/test_req_err.py,sha256=magK1BUConCBj8TEC29rzmDCbI2u2XXVcPowL6ttP9g,1025
7
- coocan/__init__.py,sha256=UqFmE7ucuR_xR3OyyBU8pxqLfCJ5AdH_HsDdTsYPf6g,55
8
- coocan/gen.py,sha256=J6QWXkBVbiCQqey8i0BDqleRNpBswI8AyvrYmkDVQPw,1028
9
- coocan/push_project.py,sha256=X2fjtYk1oI0ElcibA9wChLx0lCc8hwSelhUNfkJal5o,220
10
- coocan/_examples/crawl_csdn_detail.py,sha256=J2hiKHCS7RskQ9UmNMjE8i6braFwGchH6BxtdulV9RM,1892
11
- coocan/_examples/crawl_csdn_list.py,sha256=ZvhFvBbVXQe-qtXf1T_waXuM4tBleBqbpvzP-5z0RCg,1504
12
- coocan/_examples/recv_item.py,sha256=iJqPuHZ2FykeleFl0Xr0yPwq4UhCnNw84lCPlYyGFzM,1007
13
- coocan/_examples/view_local_ip.py,sha256=Sl086xNNuZqFoRM31_gMvcISSa2QoL3OGghECkQktxg,582
14
- coocan/cmd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- coocan/cmd/cli.py,sha256=FRggXqDeLsGs-7u3zhaokfk0GCpItwqudf14W9zUfYE,2480
16
- coocan/spider/__init__.py,sha256=kMDCGeqtN50raCzwfCn18s_W8xV6KO_Ny9Xol4I48Ag,58
17
- coocan/spider/base.py,sha256=9Dgn2920Lb9TZGV0cAZSBMvIWuTMqs9M8ZYspx9W0Io,6342
18
- coocan/templates/spider.txt,sha256=5UEXUzb0ses_4ctn0b3vgbpUJ7tCde91ul6rp-g7Hxw,480
19
- coocan/url/__init__.py,sha256=rEMx66XDy5AIJ9mF_2UVzHW5mRLBAWZEyQ3txrZzuZA,102
20
- coocan/url/request.py,sha256=seZaQXQRvRMIf9WnCp3mAgNA-kxsj9P2JzAvuIt2Dx8,1116
21
- coocan/url/response.py,sha256=AnC0xsF34q68r62EVlcHYmDH6skm9RBwRHITTb4iBbU,1785
22
- coocan-0.5.5.dist-info/METADATA,sha256=aYnL3IygL_Dw7iwvDtK81oVQRWIygXmiiSO94C01Bv4,2579
23
- coocan-0.5.5.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
24
- coocan-0.5.5.dist-info/entry_points.txt,sha256=hNdk42NPboC1o7s7GzMbpII5t2U2jWrtT5bpvliXRcw,47
25
- coocan-0.5.5.dist-info/top_level.txt,sha256=WiN3Gh529qzUs0jVvEReeZsKxFguIQKrFlMOjtxGblM,13
26
- coocan-0.5.5.dist-info/RECORD,,