coocan 0.5.5__tar.gz → 0.5.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {coocan-0.5.5 → coocan-0.5.6}/PKG-INFO +97 -99
- {coocan-0.5.5 → coocan-0.5.6}/coocan/__init__.py +2 -2
- {coocan-0.5.5 → coocan-0.5.6}/coocan/_examples/crawl_csdn_detail.py +62 -62
- {coocan-0.5.5 → coocan-0.5.6}/coocan/_examples/crawl_csdn_list.py +50 -50
- {coocan-0.5.5 → coocan-0.5.6}/coocan/_examples/recv_item.py +31 -31
- {coocan-0.5.5 → coocan-0.5.6}/coocan/_examples/view_local_ip.py +22 -22
- {coocan-0.5.5 → coocan-0.5.6}/coocan/cmd/cli.py +68 -68
- {coocan-0.5.5 → coocan-0.5.6}/coocan/gen.py +33 -33
- {coocan-0.5.5 → coocan-0.5.6}/coocan/push_project.py +12 -12
- {coocan-0.5.5 → coocan-0.5.6}/coocan/spider/__init__.py +1 -1
- {coocan-0.5.5 → coocan-0.5.6}/coocan/spider/base.py +177 -177
- {coocan-0.5.5 → coocan-0.5.6}/coocan/templates/spider.txt +17 -17
- {coocan-0.5.5 → coocan-0.5.6}/coocan/url/__init__.py +2 -2
- {coocan-0.5.5 → coocan-0.5.6}/coocan/url/request.py +31 -31
- {coocan-0.5.5 → coocan-0.5.6}/coocan/url/response.py +50 -50
- {coocan-0.5.5 → coocan-0.5.6}/coocan.egg-info/PKG-INFO +97 -99
- {coocan-0.5.5 → coocan-0.5.6}/coocan.egg-info/SOURCES.txt +0 -6
- {coocan-0.5.5 → coocan-0.5.6}/coocan.egg-info/top_level.txt +0 -1
- {coocan-0.5.5 → coocan-0.5.6}/pyproject.toml +1 -1
- {coocan-0.5.5 → coocan-0.5.6}/setup.cfg +4 -4
- {coocan-0.5.5 → coocan-0.5.6}/setup.py +1 -2
- coocan-0.5.5/_test/crawl_csdn.py +0 -53
- coocan-0.5.5/_test/demo.py +0 -33
- coocan-0.5.5/_test/err_demo.py +0 -27
- coocan-0.5.5/_test/test_priority.py +0 -21
- coocan-0.5.5/_test/test_req_delay.py +0 -19
- coocan-0.5.5/_test/test_req_err.py +0 -32
- {coocan-0.5.5 → coocan-0.5.6}/README.md +0 -0
- {coocan-0.5.5 → coocan-0.5.6}/coocan/cmd/__init__.py +0 -0
- {coocan-0.5.5 → coocan-0.5.6}/coocan.egg-info/dependency_links.txt +0 -0
- {coocan-0.5.5 → coocan-0.5.6}/coocan.egg-info/entry_points.txt +0 -0
- {coocan-0.5.5 → coocan-0.5.6}/coocan.egg-info/requires.txt +0 -0
@@ -1,99 +1,97 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: coocan
|
3
|
-
Version: 0.5.
|
4
|
-
Summary: Air Async Spider Framework
|
5
|
-
|
6
|
-
Author: wauo
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
Requires-Dist:
|
13
|
-
Requires-Dist:
|
14
|
-
|
15
|
-
Dynamic:
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
import
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
"
|
56
|
-
"
|
57
|
-
"
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
{}
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
s.go()
|
99
|
-
```
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: coocan
|
3
|
+
Version: 0.5.6
|
4
|
+
Summary: Air Async Spider Framework
|
5
|
+
Author: wauo
|
6
|
+
Author-email: wauo <markadc@126.com>
|
7
|
+
License-Expression: MIT
|
8
|
+
Project-URL: Homepage, https://github.com/markadc/coocan
|
9
|
+
Requires-Python: >=3.10
|
10
|
+
Description-Content-Type: text/markdown
|
11
|
+
Requires-Dist: click>=8.0.0
|
12
|
+
Requires-Dist: httpx
|
13
|
+
Requires-Dist: loguru
|
14
|
+
Dynamic: author
|
15
|
+
Dynamic: requires-python
|
16
|
+
|
17
|
+
# 项目说明
|
18
|
+
|
19
|
+
- 一个非常轻量的异步爬虫框架
|
20
|
+
|
21
|
+
# 项目地址
|
22
|
+
|
23
|
+
- https://github.com/markadc/coocan
|
24
|
+
|
25
|
+
## demo
|
26
|
+
|
27
|
+
- 效果
|
28
|
+
<br>
|
29
|
+

|
30
|
+
|
31
|
+
|
32
|
+
- 代码
|
33
|
+
|
34
|
+
```python
|
35
|
+
import json
|
36
|
+
|
37
|
+
from loguru import logger
|
38
|
+
|
39
|
+
import coocan
|
40
|
+
from coocan import Request, MiniSpider
|
41
|
+
|
42
|
+
|
43
|
+
class CSDNDetailSpider(MiniSpider):
|
44
|
+
start_urls = ['http://www.csdn.net']
|
45
|
+
max_requests = 10
|
46
|
+
|
47
|
+
def middleware(self, request: Request):
|
48
|
+
request.headers["Referer"] = "http://www.csdn.net/"
|
49
|
+
|
50
|
+
def parse(self, response):
|
51
|
+
api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
|
52
|
+
params = {
|
53
|
+
"page": "1",
|
54
|
+
"size": "20",
|
55
|
+
"businessType": "lately",
|
56
|
+
"noMore": "false",
|
57
|
+
"username": "markadc"
|
58
|
+
}
|
59
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
60
|
+
|
61
|
+
def parse_page(self, response, api, params):
|
62
|
+
current_page = params["page"]
|
63
|
+
data = json.loads(response.text)
|
64
|
+
some = data["data"]["list"]
|
65
|
+
|
66
|
+
if not some:
|
67
|
+
logger.warning("没有第 {} 页".format(current_page))
|
68
|
+
return
|
69
|
+
|
70
|
+
for one in some:
|
71
|
+
date = one["formatTime"]
|
72
|
+
name = one["title"]
|
73
|
+
detail_url = one["url"]
|
74
|
+
logger.info(
|
75
|
+
"""
|
76
|
+
{}
|
77
|
+
{}
|
78
|
+
{}
|
79
|
+
""".format(date, name, detail_url)
|
80
|
+
)
|
81
|
+
yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
|
82
|
+
|
83
|
+
logger.info("第 {} 页抓取成功".format(params["page"]))
|
84
|
+
|
85
|
+
# 抓取下一页
|
86
|
+
next_page = int(current_page) + 1
|
87
|
+
params["page"] = str(next_page)
|
88
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
89
|
+
|
90
|
+
def parse_detail(self, response, title):
|
91
|
+
logger.success("{} 已访问 {}".format(response.status_code, title))
|
92
|
+
|
93
|
+
|
94
|
+
if __name__ == '__main__':
|
95
|
+
s = CSDNDetailSpider()
|
96
|
+
s.go()
|
97
|
+
```
|
@@ -1,2 +1,2 @@
|
|
1
|
-
from coocan.spider import *
|
2
|
-
from coocan.url import *
|
1
|
+
from coocan.spider import *
|
2
|
+
from coocan.url import *
|
@@ -1,62 +1,62 @@
|
|
1
|
-
import json
|
2
|
-
|
3
|
-
from loguru import logger
|
4
|
-
|
5
|
-
import coocan
|
6
|
-
from coocan import Request, MiniSpider
|
7
|
-
|
8
|
-
|
9
|
-
class CSDNDetailSpider(MiniSpider):
|
10
|
-
start_urls = ['http://www.csdn.net']
|
11
|
-
max_requests = 10
|
12
|
-
|
13
|
-
def middleware(self, request: Request):
|
14
|
-
request.headers["Referer"] = "http://www.csdn.net/"
|
15
|
-
|
16
|
-
def parse(self, response):
|
17
|
-
api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
|
18
|
-
params = {
|
19
|
-
"page": "1",
|
20
|
-
"size": "20",
|
21
|
-
"businessType": "lately",
|
22
|
-
"noMore": "false",
|
23
|
-
"username": "markadc"
|
24
|
-
}
|
25
|
-
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
26
|
-
|
27
|
-
def parse_page(self, response, api, params):
|
28
|
-
current_page = params["page"]
|
29
|
-
data = json.loads(response.text)
|
30
|
-
some = data["data"]["list"]
|
31
|
-
|
32
|
-
if not some:
|
33
|
-
logger.warning("没有第 {} 页".format(current_page))
|
34
|
-
return
|
35
|
-
|
36
|
-
for one in some:
|
37
|
-
date = one["formatTime"]
|
38
|
-
name = one["title"]
|
39
|
-
detail_url = one["url"]
|
40
|
-
logger.info(
|
41
|
-
"""
|
42
|
-
{}
|
43
|
-
{}
|
44
|
-
{}
|
45
|
-
""".format(date, name, detail_url)
|
46
|
-
)
|
47
|
-
yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
|
48
|
-
|
49
|
-
logger.info("第 {} 页抓取成功".format(params["page"]))
|
50
|
-
|
51
|
-
# 抓取下一页
|
52
|
-
next_page = int(current_page) + 1
|
53
|
-
params["page"] = str(next_page)
|
54
|
-
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
55
|
-
|
56
|
-
def parse_detail(self, response, title):
|
57
|
-
logger.success("{} 已访问 {}".format(response.status_code, title))
|
58
|
-
|
59
|
-
|
60
|
-
if __name__ == '__main__':
|
61
|
-
s = CSDNDetailSpider()
|
62
|
-
s.go()
|
1
|
+
import json
|
2
|
+
|
3
|
+
from loguru import logger
|
4
|
+
|
5
|
+
import coocan
|
6
|
+
from coocan import Request, MiniSpider
|
7
|
+
|
8
|
+
|
9
|
+
class CSDNDetailSpider(MiniSpider):
|
10
|
+
start_urls = ['http://www.csdn.net']
|
11
|
+
max_requests = 10
|
12
|
+
|
13
|
+
def middleware(self, request: Request):
|
14
|
+
request.headers["Referer"] = "http://www.csdn.net/"
|
15
|
+
|
16
|
+
def parse(self, response):
|
17
|
+
api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
|
18
|
+
params = {
|
19
|
+
"page": "1",
|
20
|
+
"size": "20",
|
21
|
+
"businessType": "lately",
|
22
|
+
"noMore": "false",
|
23
|
+
"username": "markadc"
|
24
|
+
}
|
25
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
26
|
+
|
27
|
+
def parse_page(self, response, api, params):
|
28
|
+
current_page = params["page"]
|
29
|
+
data = json.loads(response.text)
|
30
|
+
some = data["data"]["list"]
|
31
|
+
|
32
|
+
if not some:
|
33
|
+
logger.warning("没有第 {} 页".format(current_page))
|
34
|
+
return
|
35
|
+
|
36
|
+
for one in some:
|
37
|
+
date = one["formatTime"]
|
38
|
+
name = one["title"]
|
39
|
+
detail_url = one["url"]
|
40
|
+
logger.info(
|
41
|
+
"""
|
42
|
+
{}
|
43
|
+
{}
|
44
|
+
{}
|
45
|
+
""".format(date, name, detail_url)
|
46
|
+
)
|
47
|
+
yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
|
48
|
+
|
49
|
+
logger.info("第 {} 页抓取成功".format(params["page"]))
|
50
|
+
|
51
|
+
# 抓取下一页
|
52
|
+
next_page = int(current_page) + 1
|
53
|
+
params["page"] = str(next_page)
|
54
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
55
|
+
|
56
|
+
def parse_detail(self, response, title):
|
57
|
+
logger.success("{} 已访问 {}".format(response.status_code, title))
|
58
|
+
|
59
|
+
|
60
|
+
if __name__ == '__main__':
|
61
|
+
s = CSDNDetailSpider()
|
62
|
+
s.go()
|
@@ -1,50 +1,50 @@
|
|
1
|
-
import json
|
2
|
-
|
3
|
-
from loguru import logger
|
4
|
-
|
5
|
-
from coocan import Request, MiniSpider
|
6
|
-
|
7
|
-
|
8
|
-
class CSDNSpider(MiniSpider):
|
9
|
-
start_urls = ['http://www.csdn.net']
|
10
|
-
max_requests = 10
|
11
|
-
|
12
|
-
def middleware(self, request: Request):
|
13
|
-
request.headers["Referer"] = "http://www.csdn.net/"
|
14
|
-
|
15
|
-
def parse(self, response):
|
16
|
-
api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
|
17
|
-
params = {
|
18
|
-
"page": "1",
|
19
|
-
"size": "20",
|
20
|
-
"businessType": "lately",
|
21
|
-
"noMore": "false",
|
22
|
-
"username": "markadc"
|
23
|
-
}
|
24
|
-
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
25
|
-
|
26
|
-
def parse_page(self, response, api, params):
|
27
|
-
current_page = params["page"]
|
28
|
-
data = json.loads(response.text)
|
29
|
-
some = data["data"]["list"]
|
30
|
-
|
31
|
-
if not some:
|
32
|
-
logger.warning("没有第 {} 页".format(current_page))
|
33
|
-
return
|
34
|
-
|
35
|
-
for one in some:
|
36
|
-
date = one["formatTime"]
|
37
|
-
name = one["title"]
|
38
|
-
detail_url = one["url"]
|
39
|
-
print(date, detail_url, name)
|
40
|
-
print("第 {} 页抓取成功".format(params["page"]))
|
41
|
-
|
42
|
-
# 抓取下一页
|
43
|
-
next_page = int(current_page) + 1
|
44
|
-
params["page"] = str(next_page)
|
45
|
-
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
46
|
-
|
47
|
-
|
48
|
-
if __name__ == '__main__':
|
49
|
-
s = CSDNSpider()
|
50
|
-
s.go()
|
1
|
+
import json
|
2
|
+
|
3
|
+
from loguru import logger
|
4
|
+
|
5
|
+
from coocan import Request, MiniSpider
|
6
|
+
|
7
|
+
|
8
|
+
class CSDNSpider(MiniSpider):
|
9
|
+
start_urls = ['http://www.csdn.net']
|
10
|
+
max_requests = 10
|
11
|
+
|
12
|
+
def middleware(self, request: Request):
|
13
|
+
request.headers["Referer"] = "http://www.csdn.net/"
|
14
|
+
|
15
|
+
def parse(self, response):
|
16
|
+
api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
|
17
|
+
params = {
|
18
|
+
"page": "1",
|
19
|
+
"size": "20",
|
20
|
+
"businessType": "lately",
|
21
|
+
"noMore": "false",
|
22
|
+
"username": "markadc"
|
23
|
+
}
|
24
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
25
|
+
|
26
|
+
def parse_page(self, response, api, params):
|
27
|
+
current_page = params["page"]
|
28
|
+
data = json.loads(response.text)
|
29
|
+
some = data["data"]["list"]
|
30
|
+
|
31
|
+
if not some:
|
32
|
+
logger.warning("没有第 {} 页".format(current_page))
|
33
|
+
return
|
34
|
+
|
35
|
+
for one in some:
|
36
|
+
date = one["formatTime"]
|
37
|
+
name = one["title"]
|
38
|
+
detail_url = one["url"]
|
39
|
+
print(date, detail_url, name)
|
40
|
+
print("第 {} 页抓取成功".format(params["page"]))
|
41
|
+
|
42
|
+
# 抓取下一页
|
43
|
+
next_page = int(current_page) + 1
|
44
|
+
params["page"] = str(next_page)
|
45
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
46
|
+
|
47
|
+
|
48
|
+
if __name__ == '__main__':
|
49
|
+
s = CSDNSpider()
|
50
|
+
s.go()
|
@@ -1,31 +1,31 @@
|
|
1
|
-
import random
|
2
|
-
import time
|
3
|
-
|
4
|
-
from loguru import logger
|
5
|
-
|
6
|
-
from coocan import MiniSpider, Request, Response
|
7
|
-
|
8
|
-
|
9
|
-
class RecvItemSpider(MiniSpider):
|
10
|
-
start_urls = ["https://cn.bing.com/search?q=1"]
|
11
|
-
max_requests = 10
|
12
|
-
|
13
|
-
def parse(self, response: Response):
|
14
|
-
logger.warning("{} {}".format(response.status_code, response.request.url, response.get_one("//title/text()")))
|
15
|
-
for _ in range(10):
|
16
|
-
item = {"timestamp": int(time.time() * 1000), "mark": random.randint(1, 10000)} # 假设这里是爬虫的数据
|
17
|
-
yield item
|
18
|
-
head, tail = str(response.request.url).split("=")
|
19
|
-
next_url = "{}={}".format(head, int(tail) + 1)
|
20
|
-
if next_url.endswith("11"):
|
21
|
-
yield "coocan" # 出现警告日志
|
22
|
-
return
|
23
|
-
yield Request(next_url, callback=self.parse)
|
24
|
-
|
25
|
-
def process_item(self, item: dict):
|
26
|
-
logger.success("Get => {}".format(item))
|
27
|
-
|
28
|
-
|
29
|
-
if __name__ == '__main__':
|
30
|
-
s = RecvItemSpider()
|
31
|
-
s.go()
|
1
|
+
import random
|
2
|
+
import time
|
3
|
+
|
4
|
+
from loguru import logger
|
5
|
+
|
6
|
+
from coocan import MiniSpider, Request, Response
|
7
|
+
|
8
|
+
|
9
|
+
class RecvItemSpider(MiniSpider):
|
10
|
+
start_urls = ["https://cn.bing.com/search?q=1"]
|
11
|
+
max_requests = 10
|
12
|
+
|
13
|
+
def parse(self, response: Response):
|
14
|
+
logger.warning("{} {}".format(response.status_code, response.request.url, response.get_one("//title/text()")))
|
15
|
+
for _ in range(10):
|
16
|
+
item = {"timestamp": int(time.time() * 1000), "mark": random.randint(1, 10000)} # 假设这里是爬虫的数据
|
17
|
+
yield item
|
18
|
+
head, tail = str(response.request.url).split("=")
|
19
|
+
next_url = "{}={}".format(head, int(tail) + 1)
|
20
|
+
if next_url.endswith("11"):
|
21
|
+
yield "coocan" # 出现警告日志
|
22
|
+
return
|
23
|
+
yield Request(next_url, callback=self.parse)
|
24
|
+
|
25
|
+
def process_item(self, item: dict):
|
26
|
+
logger.success("Get => {}".format(item))
|
27
|
+
|
28
|
+
|
29
|
+
if __name__ == '__main__':
|
30
|
+
s = RecvItemSpider()
|
31
|
+
s.go()
|
@@ -1,22 +1,22 @@
|
|
1
|
-
from coocan import Request, Response, MiniSpider
|
2
|
-
|
3
|
-
|
4
|
-
class ViewLocalIPSpider(MiniSpider):
|
5
|
-
start_urls = ["https://httpbin.org/ip"]
|
6
|
-
max_requests = 5
|
7
|
-
delay = 5
|
8
|
-
|
9
|
-
def start_requests(self):
|
10
|
-
for _ in range(10):
|
11
|
-
yield Request(self.start_urls[0], callback=self.parse)
|
12
|
-
|
13
|
-
def middleware(self, request: Request):
|
14
|
-
request.headers["Referer"] = "https://httpbin.org"
|
15
|
-
|
16
|
-
def parse(self, response: Response):
|
17
|
-
print(response.status_code, response.json())
|
18
|
-
|
19
|
-
|
20
|
-
if __name__ == '__main__':
|
21
|
-
s = ViewLocalIPSpider()
|
22
|
-
s.go()
|
1
|
+
from coocan import Request, Response, MiniSpider
|
2
|
+
|
3
|
+
|
4
|
+
class ViewLocalIPSpider(MiniSpider):
|
5
|
+
start_urls = ["https://httpbin.org/ip"]
|
6
|
+
max_requests = 5
|
7
|
+
delay = 5
|
8
|
+
|
9
|
+
def start_requests(self):
|
10
|
+
for _ in range(10):
|
11
|
+
yield Request(self.start_urls[0], callback=self.parse)
|
12
|
+
|
13
|
+
def middleware(self, request: Request):
|
14
|
+
request.headers["Referer"] = "https://httpbin.org"
|
15
|
+
|
16
|
+
def parse(self, response: Response):
|
17
|
+
print(response.status_code, response.json())
|
18
|
+
|
19
|
+
|
20
|
+
if __name__ == '__main__':
|
21
|
+
s = ViewLocalIPSpider()
|
22
|
+
s.go()
|