coocan 0.5.5__tar.gz → 0.5.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32):
  1. {coocan-0.5.5 → coocan-0.5.6}/PKG-INFO +97 -99
  2. {coocan-0.5.5 → coocan-0.5.6}/coocan/__init__.py +2 -2
  3. {coocan-0.5.5 → coocan-0.5.6}/coocan/_examples/crawl_csdn_detail.py +62 -62
  4. {coocan-0.5.5 → coocan-0.5.6}/coocan/_examples/crawl_csdn_list.py +50 -50
  5. {coocan-0.5.5 → coocan-0.5.6}/coocan/_examples/recv_item.py +31 -31
  6. {coocan-0.5.5 → coocan-0.5.6}/coocan/_examples/view_local_ip.py +22 -22
  7. {coocan-0.5.5 → coocan-0.5.6}/coocan/cmd/cli.py +68 -68
  8. {coocan-0.5.5 → coocan-0.5.6}/coocan/gen.py +33 -33
  9. {coocan-0.5.5 → coocan-0.5.6}/coocan/push_project.py +12 -12
  10. {coocan-0.5.5 → coocan-0.5.6}/coocan/spider/__init__.py +1 -1
  11. {coocan-0.5.5 → coocan-0.5.6}/coocan/spider/base.py +177 -177
  12. {coocan-0.5.5 → coocan-0.5.6}/coocan/templates/spider.txt +17 -17
  13. {coocan-0.5.5 → coocan-0.5.6}/coocan/url/__init__.py +2 -2
  14. {coocan-0.5.5 → coocan-0.5.6}/coocan/url/request.py +31 -31
  15. {coocan-0.5.5 → coocan-0.5.6}/coocan/url/response.py +50 -50
  16. {coocan-0.5.5 → coocan-0.5.6}/coocan.egg-info/PKG-INFO +97 -99
  17. {coocan-0.5.5 → coocan-0.5.6}/coocan.egg-info/SOURCES.txt +0 -6
  18. {coocan-0.5.5 → coocan-0.5.6}/coocan.egg-info/top_level.txt +0 -1
  19. {coocan-0.5.5 → coocan-0.5.6}/pyproject.toml +1 -1
  20. {coocan-0.5.5 → coocan-0.5.6}/setup.cfg +4 -4
  21. {coocan-0.5.5 → coocan-0.5.6}/setup.py +1 -2
  22. coocan-0.5.5/_test/crawl_csdn.py +0 -53
  23. coocan-0.5.5/_test/demo.py +0 -33
  24. coocan-0.5.5/_test/err_demo.py +0 -27
  25. coocan-0.5.5/_test/test_priority.py +0 -21
  26. coocan-0.5.5/_test/test_req_delay.py +0 -19
  27. coocan-0.5.5/_test/test_req_err.py +0 -32
  28. {coocan-0.5.5 → coocan-0.5.6}/README.md +0 -0
  29. {coocan-0.5.5 → coocan-0.5.6}/coocan/cmd/__init__.py +0 -0
  30. {coocan-0.5.5 → coocan-0.5.6}/coocan.egg-info/dependency_links.txt +0 -0
  31. {coocan-0.5.5 → coocan-0.5.6}/coocan.egg-info/entry_points.txt +0 -0
  32. {coocan-0.5.5 → coocan-0.5.6}/coocan.egg-info/requires.txt +0 -0
--- coocan-0.5.5/PKG-INFO
+++ coocan-0.5.6/PKG-INFO
@@ -1,99 +1,97 @@
1
- Metadata-Version: 2.4
2
- Name: coocan
3
- Version: 0.5.5
4
- Summary: Air Async Spider Framework
5
- Home-page: https://github.com/markadc/coocan
6
- Author: wauo
7
- Author-email: wauo <markadc@126.com>
8
- License-Expression: MIT
9
- Project-URL: Homepage, https://github.com/markadc/coocan
10
- Requires-Python: >=3.10
11
- Description-Content-Type: text/markdown
12
- Requires-Dist: click>=8.0.0
13
- Requires-Dist: httpx
14
- Requires-Dist: loguru
15
- Dynamic: author
16
- Dynamic: home-page
17
- Dynamic: requires-python
18
-
19
- # 项目说明
20
-
21
- - 一个非常轻量的异步爬虫框架
22
-
23
- # 项目地址
24
-
25
- - https://github.com/markadc/coocan
26
-
27
- ## demo
28
-
29
- - 效果
30
- <br>
31
- ![效果](coocan/demo.gif)
32
-
33
-
34
- - 代码
35
-
36
- ```python
37
- import json
38
-
39
- from loguru import logger
40
-
41
- import coocan
42
- from coocan import Request, MiniSpider
43
-
44
-
45
- class CSDNDetailSpider(MiniSpider):
46
- start_urls = ['http://www.csdn.net']
47
- max_requests = 10
48
-
49
- def middleware(self, request: Request):
50
- request.headers["Referer"] = "http://www.csdn.net/"
51
-
52
- def parse(self, response):
53
- api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
54
- params = {
55
- "page": "1",
56
- "size": "20",
57
- "businessType": "lately",
58
- "noMore": "false",
59
- "username": "markadc"
60
- }
61
- yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
62
-
63
- def parse_page(self, response, api, params):
64
- current_page = params["page"]
65
- data = json.loads(response.text)
66
- some = data["data"]["list"]
67
-
68
- if not some:
69
- logger.warning("没有第 {} 页".format(current_page))
70
- return
71
-
72
- for one in some:
73
- date = one["formatTime"]
74
- name = one["title"]
75
- detail_url = one["url"]
76
- logger.info(
77
- """
78
- {}
79
- {}
80
- {}
81
- """.format(date, name, detail_url)
82
- )
83
- yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
84
-
85
- logger.info("第 {} 页抓取成功".format(params["page"]))
86
-
87
- # 抓取下一页
88
- next_page = int(current_page) + 1
89
- params["page"] = str(next_page)
90
- yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
91
-
92
- def parse_detail(self, response, title):
93
- logger.success("{} 已访问 {}".format(response.status_code, title))
94
-
95
-
96
- if __name__ == '__main__':
97
- s = CSDNDetailSpider()
98
- s.go()
99
- ```
1
+ Metadata-Version: 2.4
2
+ Name: coocan
3
+ Version: 0.5.6
4
+ Summary: Air Async Spider Framework
5
+ Author: wauo
6
+ Author-email: wauo <markadc@126.com>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/markadc/coocan
9
+ Requires-Python: >=3.10
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: click>=8.0.0
12
+ Requires-Dist: httpx
13
+ Requires-Dist: loguru
14
+ Dynamic: author
15
+ Dynamic: requires-python
16
+
17
+ # 项目说明
18
+
19
+ - 一个非常轻量的异步爬虫框架
20
+
21
+ # 项目地址
22
+
23
+ - https://github.com/markadc/coocan
24
+
25
+ ## demo
26
+
27
+ - 效果
28
+ <br>
29
+ ![效果](coocan/demo.gif)
30
+
31
+
32
+ - 代码
33
+
34
+ ```python
35
+ import json
36
+
37
+ from loguru import logger
38
+
39
+ import coocan
40
+ from coocan import Request, MiniSpider
41
+
42
+
43
+ class CSDNDetailSpider(MiniSpider):
44
+ start_urls = ['http://www.csdn.net']
45
+ max_requests = 10
46
+
47
+ def middleware(self, request: Request):
48
+ request.headers["Referer"] = "http://www.csdn.net/"
49
+
50
+ def parse(self, response):
51
+ api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
52
+ params = {
53
+ "page": "1",
54
+ "size": "20",
55
+ "businessType": "lately",
56
+ "noMore": "false",
57
+ "username": "markadc"
58
+ }
59
+ yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
60
+
61
+ def parse_page(self, response, api, params):
62
+ current_page = params["page"]
63
+ data = json.loads(response.text)
64
+ some = data["data"]["list"]
65
+
66
+ if not some:
67
+ logger.warning("没有第 {} 页".format(current_page))
68
+ return
69
+
70
+ for one in some:
71
+ date = one["formatTime"]
72
+ name = one["title"]
73
+ detail_url = one["url"]
74
+ logger.info(
75
+ """
76
+ {}
77
+ {}
78
+ {}
79
+ """.format(date, name, detail_url)
80
+ )
81
+ yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
82
+
83
+ logger.info("第 {} 页抓取成功".format(params["page"]))
84
+
85
+ # 抓取下一页
86
+ next_page = int(current_page) + 1
87
+ params["page"] = str(next_page)
88
+ yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
89
+
90
+ def parse_detail(self, response, title):
91
+ logger.success("{} 已访问 {}".format(response.status_code, title))
92
+
93
+
94
+ if __name__ == '__main__':
95
+ s = CSDNDetailSpider()
96
+ s.go()
97
+ ```
--- coocan-0.5.5/coocan/__init__.py
+++ coocan-0.5.6/coocan/__init__.py
@@ -1,2 +1,2 @@
1
- from coocan.spider import *
2
- from coocan.url import *
1
+ from coocan.spider import *
2
+ from coocan.url import *
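--- coocan-0.5.5/coocan/_examples/crawl_csdn_detail.py
+++ coocan-0.5.6/coocan/_examples/crawl_csdn_detail.py
@@ -1,62 +1,62 @@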
1
- import json
2
-
3
- from loguru import logger
4
-
5
- import coocan
6
- from coocan import Request, MiniSpider
7
-
8
-
9
- class CSDNDetailSpider(MiniSpider):
10
- start_urls = ['http://www.csdn.net']
11
- max_requests = 10
12
-
13
- def middleware(self, request: Request):
14
- request.headers["Referer"] = "http://www.csdn.net/"
15
-
16
- def parse(self, response):
17
- api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
18
- params = {
19
- "page": "1",
20
- "size": "20",
21
- "businessType": "lately",
22
- "noMore": "false",
23
- "username": "markadc"
24
- }
25
- yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
26
-
27
- def parse_page(self, response, api, params):
28
- current_page = params["page"]
29
- data = json.loads(response.text)
30
- some = data["data"]["list"]
31
-
32
- if not some:
33
- logger.warning("没有第 {} 页".format(current_page))
34
- return
35
-
36
- for one in some:
37
- date = one["formatTime"]
38
- name = one["title"]
39
- detail_url = one["url"]
40
- logger.info(
41
- """
42
- {}
43
- {}
44
- {}
45
- """.format(date, name, detail_url)
46
- )
47
- yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
48
-
49
- logger.info("第 {} 页抓取成功".format(params["page"]))
50
-
51
- # 抓取下一页
52
- next_page = int(current_page) + 1
53
- params["page"] = str(next_page)
54
- yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
55
-
56
- def parse_detail(self, response, title):
57
- logger.success("{} 已访问 {}".format(response.status_code, title))
58
-
59
-
60
- if __name__ == '__main__':
61
- s = CSDNDetailSpider()
62
- s.go()
1
+ import json
2
+
3
+ from loguru import logger
4
+
5
+ import coocan
6
+ from coocan import Request, MiniSpider
7
+
8
+
9
+ class CSDNDetailSpider(MiniSpider):
10
+ start_urls = ['http://www.csdn.net']
11
+ max_requests = 10
12
+
13
+ def middleware(self, request: Request):
14
+ request.headers["Referer"] = "http://www.csdn.net/"
15
+
16
+ def parse(self, response):
17
+ api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
18
+ params = {
19
+ "page": "1",
20
+ "size": "20",
21
+ "businessType": "lately",
22
+ "noMore": "false",
23
+ "username": "markadc"
24
+ }
25
+ yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
26
+
27
+ def parse_page(self, response, api, params):
28
+ current_page = params["page"]
29
+ data = json.loads(response.text)
30
+ some = data["data"]["list"]
31
+
32
+ if not some:
33
+ logger.warning("没有第 {} 页".format(current_page))
34
+ return
35
+
36
+ for one in some:
37
+ date = one["formatTime"]
38
+ name = one["title"]
39
+ detail_url = one["url"]
40
+ logger.info(
41
+ """
42
+ {}
43
+ {}
44
+ {}
45
+ """.format(date, name, detail_url)
46
+ )
47
+ yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
48
+
49
+ logger.info("第 {} 页抓取成功".format(params["page"]))
50
+
51
+ # 抓取下一页
52
+ next_page = int(current_page) + 1
53
+ params["page"] = str(next_page)
54
+ yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
55
+
56
+ def parse_detail(self, response, title):
57
+ logger.success("{} 已访问 {}".format(response.status_code, title))
58
+
59
+
60
+ if __name__ == '__main__':
61
+ s = CSDNDetailSpider()
62
+ s.go()
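--- coocan-0.5.5/coocan/_examples/crawl_csdn_list.py
+++ coocan-0.5.6/coocan/_examples/crawl_csdn_list.py
@@ -1,50 +1,50 @@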
1
- import json
2
-
3
- from loguru import logger
4
-
5
- from coocan import Request, MiniSpider
6
-
7
-
8
- class CSDNSpider(MiniSpider):
9
- start_urls = ['http://www.csdn.net']
10
- max_requests = 10
11
-
12
- def middleware(self, request: Request):
13
- request.headers["Referer"] = "http://www.csdn.net/"
14
-
15
- def parse(self, response):
16
- api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
17
- params = {
18
- "page": "1",
19
- "size": "20",
20
- "businessType": "lately",
21
- "noMore": "false",
22
- "username": "markadc"
23
- }
24
- yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
25
-
26
- def parse_page(self, response, api, params):
27
- current_page = params["page"]
28
- data = json.loads(response.text)
29
- some = data["data"]["list"]
30
-
31
- if not some:
32
- logger.warning("没有第 {} 页".format(current_page))
33
- return
34
-
35
- for one in some:
36
- date = one["formatTime"]
37
- name = one["title"]
38
- detail_url = one["url"]
39
- print(date, detail_url, name)
40
- print("第 {} 页抓取成功".format(params["page"]))
41
-
42
- # 抓取下一页
43
- next_page = int(current_page) + 1
44
- params["page"] = str(next_page)
45
- yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
46
-
47
-
48
- if __name__ == '__main__':
49
- s = CSDNSpider()
50
- s.go()
1
+ import json
2
+
3
+ from loguru import logger
4
+
5
+ from coocan import Request, MiniSpider
6
+
7
+
8
+ class CSDNSpider(MiniSpider):
9
+ start_urls = ['http://www.csdn.net']
10
+ max_requests = 10
11
+
12
+ def middleware(self, request: Request):
13
+ request.headers["Referer"] = "http://www.csdn.net/"
14
+
15
+ def parse(self, response):
16
+ api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
17
+ params = {
18
+ "page": "1",
19
+ "size": "20",
20
+ "businessType": "lately",
21
+ "noMore": "false",
22
+ "username": "markadc"
23
+ }
24
+ yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
25
+
26
+ def parse_page(self, response, api, params):
27
+ current_page = params["page"]
28
+ data = json.loads(response.text)
29
+ some = data["data"]["list"]
30
+
31
+ if not some:
32
+ logger.warning("没有第 {} 页".format(current_page))
33
+ return
34
+
35
+ for one in some:
36
+ date = one["formatTime"]
37
+ name = one["title"]
38
+ detail_url = one["url"]
39
+ print(date, detail_url, name)
40
+ print("第 {} 页抓取成功".format(params["page"]))
41
+
42
+ # 抓取下一页
43
+ next_page = int(current_page) + 1
44
+ params["page"] = str(next_page)
45
+ yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
46
+
47
+
48
+ if __name__ == '__main__':
49
+ s = CSDNSpider()
50
+ s.go()
--- coocan-0.5.5/coocan/_examples/recv_item.py
+++ coocan-0.5.6/coocan/_examples/recv_item.py
@@ -1,31 +1,31 @@
1
- import random
2
- import time
3
-
4
- from loguru import logger
5
-
6
- from coocan import MiniSpider, Request, Response
7
-
8
-
9
- class RecvItemSpider(MiniSpider):
10
- start_urls = ["https://cn.bing.com/search?q=1"]
11
- max_requests = 10
12
-
13
- def parse(self, response: Response):
14
- logger.warning("{} {}".format(response.status_code, response.request.url, response.get_one("//title/text()")))
15
- for _ in range(10):
16
- item = {"timestamp": int(time.time() * 1000), "mark": random.randint(1, 10000)} # 假设这里是爬虫的数据
17
- yield item
18
- head, tail = str(response.request.url).split("=")
19
- next_url = "{}={}".format(head, int(tail) + 1)
20
- if next_url.endswith("11"):
21
- yield "coocan" # 出现警告日志
22
- return
23
- yield Request(next_url, callback=self.parse)
24
-
25
- def process_item(self, item: dict):
26
- logger.success("Get => {}".format(item))
27
-
28
-
29
- if __name__ == '__main__':
30
- s = RecvItemSpider()
31
- s.go()
1
+ import random
2
+ import time
3
+
4
+ from loguru import logger
5
+
6
+ from coocan import MiniSpider, Request, Response
7
+
8
+
9
+ class RecvItemSpider(MiniSpider):
10
+ start_urls = ["https://cn.bing.com/search?q=1"]
11
+ max_requests = 10
12
+
13
+ def parse(self, response: Response):
14
+ logger.warning("{} {}".format(response.status_code, response.request.url, response.get_one("//title/text()")))
15
+ for _ in range(10):
16
+ item = {"timestamp": int(time.time() * 1000), "mark": random.randint(1, 10000)} # 假设这里是爬虫的数据
17
+ yield item
18
+ head, tail = str(response.request.url).split("=")
19
+ next_url = "{}={}".format(head, int(tail) + 1)
20
+ if next_url.endswith("11"):
21
+ yield "coocan" # 出现警告日志
22
+ return
23
+ yield Request(next_url, callback=self.parse)
24
+
25
+ def process_item(self, item: dict):
26
+ logger.success("Get => {}".format(item))
27
+
28
+
29
+ if __name__ == '__main__':
30
+ s = RecvItemSpider()
31
+ s.go()
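--- coocan-0.5.5/coocan/_examples/view_local_ip.py
+++ coocan-0.5.6/coocan/_examples/view_local_ip.py
@@ -1,22 +1,22 @@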
1
- from coocan import Request, Response, MiniSpider
2
-
3
-
4
- class ViewLocalIPSpider(MiniSpider):
5
- start_urls = ["https://httpbin.org/ip"]
6
- max_requests = 5
7
- delay = 5
8
-
9
- def start_requests(self):
10
- for _ in range(10):
11
- yield Request(self.start_urls[0], callback=self.parse)
12
-
13
- def middleware(self, request: Request):
14
- request.headers["Referer"] = "https://httpbin.org"
15
-
16
- def parse(self, response: Response):
17
- print(response.status_code, response.json())
18
-
19
-
20
- if __name__ == '__main__':
21
- s = ViewLocalIPSpider()
22
- s.go()
1
+ from coocan import Request, Response, MiniSpider
2
+
3
+
4
+ class ViewLocalIPSpider(MiniSpider):
5
+ start_urls = ["https://httpbin.org/ip"]
6
+ max_requests = 5
7
+ delay = 5
8
+
9
+ def start_requests(self):
10
+ for _ in range(10):
11
+ yield Request(self.start_urls[0], callback=self.parse)
12
+
13
+ def middleware(self, request: Request):
14
+ request.headers["Referer"] = "https://httpbin.org"
15
+
16
+ def parse(self, response: Response):
17
+ print(response.status_code, response.json())
18
+
19
+
20
+ if __name__ == '__main__':
21
+ s = ViewLocalIPSpider()
22
+ s.go()