coocan 0.4.9__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _test/crawl_csdn.py +53 -0
- _test/demo.py +33 -0
- _test/err_demo.py +27 -0
- _test/test_priority.py +21 -0
- _test/test_req_delay.py +19 -0
- _test/test_req_err.py +32 -0
- coocan/_examples/crawl_csdn_detail.py +62 -0
- coocan/_examples/crawl_csdn_list.py +50 -0
- coocan/_examples/recv_item.py +31 -0
- coocan/_examples/view_local_ip.py +22 -0
- coocan/cmd/cli.py +10 -14
- coocan/push_project.py +12 -0
- coocan/spider/base.py +51 -17
- coocan/{cmd/templates → templates}/spider.txt +1 -1
- {coocan-0.4.9.dist-info → coocan-0.5.4.dist-info}/METADATA +10 -4
- coocan-0.5.4.dist-info/RECORD +26 -0
- {coocan-0.4.9.dist-info → coocan-0.5.4.dist-info}/WHEEL +1 -1
- coocan-0.5.4.dist-info/entry_points.txt +2 -0
- {coocan-0.4.9.dist-info → coocan-0.5.4.dist-info}/top_level.txt +1 -0
- coocan-0.4.9.dist-info/RECORD +0 -15
- coocan-0.4.9.dist-info/entry_points.txt +0 -2
_test/crawl_csdn.py
ADDED
@@ -0,0 +1,53 @@
+import json
+
+from loguru import logger
+
+import coocan
+from coocan import Request, MiniSpider
+
+api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
+params = {
+    "page": "1",
+    "size": "20",
+    "businessType": "lately",
+    "noMore": "false",
+    "username": "markadc"
+}
+
+
+class CsdnAirAsyncSpider(MiniSpider):
+    start_urls = ['http://www.csdn.net']
+    max_requests = 10
+
+    def parse(self, response):
+        yield coocan.Request(api, self.parse_page, params=params)
+
+    def middleware(self, request: Request):
+        request.headers["Referer"] = "http://www.csdn.net/"
+
+    def parse_page(self, response):
+        current_page = params["page"]
+        data = json.loads(response.text)
+        some = data["data"]["list"]
+        if not some:
+            logger.warning("没有第 {} 页".format(current_page))
+            return
+        for one in some:
+            date = one["formatTime"]
+            name = one["title"]
+            detail_url = one["url"]
+            yield coocan.Request(detail_url, self.parse_detail)
+            print(date, detail_url, name)
+        logger.info("第 {} 页抓取成功".format(params["page"]))
+
+        next_page = int(current_page) + 1
+        params["page"] = str(next_page)
+        yield coocan.Request(api, self.parse_page, params=params)
+
+    def parse_detail(self, response):
+        logger.success("{} {}".format(response.status_code, response.request.url))
+
+
+if __name__ == '__main__':
+    s = CsdnAirAsyncSpider()
+    s.go()
_test/demo.py
ADDED
@@ -0,0 +1,33 @@
+from loguru import logger
+
+import coocan
+
+
+class DemoSpider(coocan.MiniSpider):
+    start_urls = ["https://cn.bing.com/"]
+    max_requests = 5
+
+    def parse(self, response):
+        print(response.request.headers.get("User-Agent"))
+        logger.debug('{} {}'.format(response.status_code, len(response.text)))
+        for i in range(5):
+            yield coocan.Request('https://cn.bing.com/', self.parse2)
+
+    def parse2(self, response):
+        logger.info('{} {}'.format(response.status_code, len(response.text)))
+        for i in range(3):
+            yield coocan.Request('https://cn.bing.com/', self.parse3)
+
+        for i in range(4):
+            yield coocan.Request('https://cn.bing.com/', self.parse4)
+
+    def parse3(self, response):
+        logger.warning('{} {}'.format(response.status_code, len(response.text)))
+
+    def parse4(self, response):
+        logger.error('{} {}'.format(response.status_code, len(response.text)))
+
+
+if __name__ == '__main__':
+    my_spider = DemoSpider()
+    my_spider.go()
_test/err_demo.py
ADDED
@@ -0,0 +1,27 @@
+from loguru import logger
+
+import coocan
+from coocan.spider import MiniSpider
+
+
+class ErrDemoSpider(MiniSpider):
+    start_urls = ["https://cn.bing.com/"]
+    max_requests = 5
+
+    def parse(self, response):
+        print(response.request.headers.get("User-Agent"))
+        logger.debug('{} {}'.format(response.status_code, len(response.text)))
+        yield coocan.Request('https://cn.bing.com/', self.parse2, cb_kwargs={"name": "CLOS"})
+
+    def parse2(self, response, name):
+        print(name)
+        logger.debug('{} {}'.format(response.status_code, len(response.text)))
+        yield coocan.Request('https://cn.bing.com/', self.parse3, cb_kwargs={"a1": 1, "a2": 2})
+
+    def parse3(self, response, a1, a22):
+        print(a1, a22)
+
+
+if __name__ == '__main__':
+    my_spider = ErrDemoSpider()
+    my_spider.go()
_test/test_priority.py
ADDED
@@ -0,0 +1,21 @@
+from coocan import MiniSpider, Request, Response
+
+
+class TestPrioritySpider(MiniSpider):
+    headers_extra_field = {"Name": "Coocan"}
+
+    def start_requests(self):
+        for i in range(100):
+            url = 'https://www.baidu.com/s?w={}'.format(i)
+            yield Request(url, callback=self.parse, priority=100 - i)
+
+    def parse(self, response: Response):
+        print(response.request.url)
+        print(response.request.headers["User-Agent"])
+        print(response.request.headers)
+        print()
+
+
+if __name__ == '__main__':
+    s = TestPrioritySpider()
+    s.go()
_test/test_req_delay.py
ADDED
@@ -0,0 +1,19 @@
+from coocan import MiniSpider, Request, Response
+
+
+class TestReqDelaySpider(MiniSpider):
+    max_requests = 5
+    delay = 3
+
+    def start_requests(self):
+        for i in range(100):
+            url = 'https://www.baidu.com/s?w={}'.format(i)
+            yield Request(url, callback=self.parse, priority=100 - i)
+
+    def parse(self, response: Response):
+        print(response.request.url)
+
+
+if __name__ == '__main__':
+    s = TestReqDelaySpider()
+    s.go()
_test/test_req_err.py
ADDED
@@ -0,0 +1,32 @@
+import random
+
+from coocan import MiniSpider, Request, Response, IgnoreRequest
+
+
+class TestReqErrSpider(MiniSpider):
+    def start_requests(self):
+        for i in range(5):
+            url = "https://www.google.com/{}".format(i + 1)
+            yield Request(url, callback=self.parse, timeout=1)
+
+    def handle_request_excetpion(self, e: Exception, request: Request):
+        v = random.randint(1, 3)
+        if v == 1:
+            raise IgnoreRequest("出验证码了")
+        if v == 2:
+            1 / 0
+        if v == 3:
+            new_url = "https://www.baidu.com/s?wd={}".format(random.randint(1, 100))
+            return Request(new_url, callback=self.parse, timeout=1)
+
+    def parse(self, response: Response):
+        v = random.randint(1, 2)
+        if v == 1:
+            print("爬取成功", response.url, len(response.text))
+            print(response.get_one("//title/text()"))
+        aaa
+
+
+if __name__ == '__main__':
+    my_spider = TestReqErrSpider()
+    my_spider.go()
coocan/_examples/crawl_csdn_detail.py
ADDED
@@ -0,0 +1,62 @@
+import json
+
+from loguru import logger
+
+import coocan
+from coocan import Request, MiniSpider
+
+
+class CSDNDetailSpider(MiniSpider):
+    start_urls = ['http://www.csdn.net']
+    max_requests = 10
+
+    def middleware(self, request: Request):
+        request.headers["Referer"] = "http://www.csdn.net/"
+
+    def parse(self, response):
+        api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
+        params = {
+            "page": "1",
+            "size": "20",
+            "businessType": "lately",
+            "noMore": "false",
+            "username": "markadc"
+        }
+        yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
+
+    def parse_page(self, response, api, params):
+        current_page = params["page"]
+        data = json.loads(response.text)
+        some = data["data"]["list"]
+
+        if not some:
+            logger.warning("没有第 {} 页".format(current_page))
+            return
+
+        for one in some:
+            date = one["formatTime"]
+            name = one["title"]
+            detail_url = one["url"]
+            logger.info(
+                """
+                {}
+                {}
+                {}
+                """.format(date, name, detail_url)
+            )
+            yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
+
+        logger.info("第 {} 页抓取成功".format(params["page"]))
+
+        # 抓取下一页
+        next_page = int(current_page) + 1
+        params["page"] = str(next_page)
+        yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
+
+    def parse_detail(self, response, title):
+        logger.success("{} 已访问 {}".format(response.status_code, title))
+
+
+if __name__ == '__main__':
+    s = CSDNDetailSpider()
+    s.go()
coocan/_examples/crawl_csdn_list.py
ADDED
@@ -0,0 +1,50 @@
+import json
+
+from loguru import logger
+
+from coocan import Request, MiniSpider
+
+
+class CSDNSpider(MiniSpider):
+    start_urls = ['http://www.csdn.net']
+    max_requests = 10
+
+    def middleware(self, request: Request):
+        request.headers["Referer"] = "http://www.csdn.net/"
+
+    def parse(self, response):
+        api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
+        params = {
+            "page": "1",
+            "size": "20",
+            "businessType": "lately",
+            "noMore": "false",
+            "username": "markadc"
+        }
+        yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
+
+    def parse_page(self, response, api, params):
+        current_page = params["page"]
+        data = json.loads(response.text)
+        some = data["data"]["list"]
+
+        if not some:
+            logger.warning("没有第 {} 页".format(current_page))
+            return
+
+        for one in some:
+            date = one["formatTime"]
+            name = one["title"]
+            detail_url = one["url"]
+            print(date, detail_url, name)
+        print("第 {} 页抓取成功".format(params["page"]))
+
+        # 抓取下一页
+        next_page = int(current_page) + 1
+        params["page"] = str(next_page)
+        yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
+
+
+if __name__ == '__main__':
+    s = CSDNSpider()
+    s.go()
coocan/_examples/recv_item.py
ADDED
@@ -0,0 +1,31 @@
+import random
+import time
+
+from loguru import logger
+
+from coocan import MiniSpider, Request, Response
+
+
+class RecvItemSpider(MiniSpider):
+    start_urls = ["https://cn.bing.com/search?q=1"]
+    max_requests = 10
+
+    def parse(self, response: Response):
+        logger.warning("{} {}".format(response.status_code, response.request.url, response.get_one("//title/text()")))
+        for _ in range(10):
+            item = {"timestamp": int(time.time() * 1000), "mark": random.randint(1, 10000)}  # 假设这里是爬虫的数据
+            yield item
+        head, tail = str(response.request.url).split("=")
+        next_url = "{}={}".format(head, int(tail) + 1)
+        if next_url.endswith("11"):
+            yield "coocan"  # 出现警告日志
+            return
+        yield Request(next_url, callback=self.parse)
+
+    def process_item(self, item: dict):
+        logger.success("Get => {}".format(item))
+
+
+if __name__ == '__main__':
+    s = RecvItemSpider()
+    s.go()
coocan/_examples/view_local_ip.py
ADDED
@@ -0,0 +1,22 @@
+from coocan import Request, Response, MiniSpider
+
+
+class ViewLocalIPSpider(MiniSpider):
+    start_urls = ["https://httpbin.org/ip"]
+    max_requests = 5
+    delay = 5
+
+    def start_requests(self):
+        for _ in range(10):
+            yield Request(self.start_urls[0], callback=self.parse)
+
+    def middleware(self, request: Request):
+        request.headers["Referer"] = "https://httpbin.org"
+
+    def parse(self, response: Response):
+        print(response.status_code, response.json())
+
+
+if __name__ == '__main__':
+    s = ViewLocalIPSpider()
+    s.go()
coocan/cmd/cli.py
CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 
 import click
 
-TEMPLATE_DIR = Path(__file__).parent /
+TEMPLATE_DIR = Path(__file__).parent.parent / 'templates'
 
 help_info = """
 ██████╗ ██████╗ ██████╗ ██████╗ █████╗ ███╗ ██╗
@@ -16,10 +16,6 @@ help_info = """
 """
 
 
-def show_help_info():
-    print(help_info)
-
-
 def snake_to_pascal(snake_str: str):
     """小蛇变成大驼峰"""
     words = snake_str.split('_')
@@ -31,37 +27,37 @@ def snake_to_pascal(snake_str: str):
 @click.pass_context
 def main(ctx):
     if ctx.invoked_subcommand is None:
-
-        click.echo("
+        print(help_info)
+        click.echo("coocan new -s <spider_file_name>")
 
 
 @main.command()
 @click.option('-s', '--spider', required=True, help='爬虫文件名字')
-def new(spider):
+def new(spider: str):
     """新建"""
     if not re.search("^[a-zA-Z0-9_]*$", spider):
         click.echo("只支持字母、数字、下划线")
         return
 
-
-    if not
-
+    spider_class_name = snake_to_pascal(spider)
+    if not spider_class_name.lower().endswith("spider"):
+        spider_class_name += "Spider"
 
     try:
         template_path = TEMPLATE_DIR / "spider.txt"
         with open(template_path, 'r') as f:
             text = f.read()
-        spider_py_text = text.replace("{SpiderClassName}",
+        spider_py_text = text.replace("{SpiderClassName}", spider_class_name)
 
         py_file = "{}.py".format(spider)
         if os.path.exists(py_file):
-            click.echo("
+            click.echo("Failed because file {} already exists".format(py_file))
             return
 
         with open(py_file, 'w') as f:
             f.write(spider_py_text)
 
-        click.echo("Success")
+        click.echo("Success create {}".format(py_file))
 
     except Exception as e:
         click.echo(str(e))
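The `new` command now derives the spider class name from the requested file name before filling in the template. The sketch below, which is not part of the diff, illustrates that naming rule; the body of `snake_to_pascal` is only partially visible in the hunk above, so the helper here is an assumed reimplementation rather than the package's exact code.

def snake_to_pascal(snake_str: str) -> str:
    # Assumed body: the hunk only shows the split('_') line and the docstring.
    return "".join(word.capitalize() for word in snake_str.split("_"))


def spider_class_name(spider: str) -> str:
    # Mirrors the logic added in cli.py: build PascalCase, then append
    # "Spider" only when the name does not already end with it.
    name = snake_to_pascal(spider)
    if not name.lower().endswith("spider"):
        name += "Spider"
    return name


print(spider_class_name("csdn_list"))  # CsdnListSpider
print(spider_class_name("my_spider"))  # MySpider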
coocan/push_project.py
ADDED
coocan/spider/base.py
CHANGED
@@ -24,6 +24,7 @@ class MiniSpider:
     enable_random_ua = True
     headers_extra_field = {}
     delay = 0
+    item_speed = 100
 
     def start_requests(self):
         """初始请求"""
@@ -55,10 +56,10 @@ class MiniSpider:
     def handle_callback_excetpion(self, e: Exception, request: Request, response: Response):
         logger.error("{} `回调`时出现异常 | {} | {} | {}".format(response.status_code, e, request.callback.__name__, request.url))
 
-    async def
+    async def request_task(self, q1: asyncio.PriorityQueue, q2: asyncio.Queue, semaphore: asyncio.Semaphore):
         """工作协程,从队列中获取请求并处理"""
         while True:
-            req: Request = await
+            req: Request = await q1.get()
 
             # 结束信号
             if req.url == "":
@@ -82,7 +83,7 @@ class MiniSpider:
                     try:
                         result = self.handle_request_excetpion(e, req)
                         if isinstance(result, Request):
-                            await
+                            await q1.put(result)
                         break
                     except IgnoreRequest as e:
                         logger.debug("{} 忽略请求 {}".format(e, req.url))
@@ -105,39 +106,72 @@ class MiniSpider:
                 try:
                     cached = req.callback(Response(resp), **req.cb_kwargs)
                     if isinstance(cached, Iterator):
-                    for
-
+                        for c in cached:
+                            if isinstance(c, Request):
+                                await q1.put(c)  # 把后续请求加入队列
+                            elif isinstance(c, dict):
+                                await q2.put(c)
+                            else:
+                                logger.warning("Please yield `Request` or `dict` Not {}".format(c))
                 except Exception as e:
                     self.handle_callback_excetpion(e, req, resp)
                 finally:
                     break
 
-
+            q1.task_done()
+
+    async def item_task(self, q2: asyncio.Queue):
+        while True:
+            item = await q2.get()
+            if item is None:
+                break
+            self.process_item(item)
+            q2.task_done()
+
+    def process_item(self, item: dict):
+        logger.success(item)
 
     async def run(self):
         """爬取入口"""
-
+        request_queue = asyncio.PriorityQueue()
+        item_queue = asyncio.Queue()
         semaphore = asyncio.Semaphore(self.max_requests)
 
-        #
-
-        asyncio.create_task(self.
+        # 处理请求...
+        request_tasks = [
+            asyncio.create_task(self.request_task(request_queue, item_queue, semaphore))
             for _ in range(self.max_requests)
         ]
 
-        #
+        # 处理数据...
+        item_tasks = [
+            asyncio.create_task(self.item_task(item_queue))
+            for _ in range(self.item_speed)
+        ]
+
+        # 发送最开始的请求
         for req in self.start_requests():
-            await
+            await request_queue.put(req)
 
-        #
-        await
+        # 等待所有请求处理完成
+        await request_queue.join()
+        logger.debug("处理请求已结束")
 
-        #
+        # 等待所有数据处理完成
+        await item_queue.join()
+        logger.debug("处理数据已结束")
+
+        # 退出请求任务
         for _ in range(self.max_requests):
-            await
+            await request_queue.put(Request(url=""))
+
+        # 退出数据任务
+        for _ in range(self.item_speed):
+            await item_queue.put(None)
 
         # 等待所有工作协程完成
-        await asyncio.gather(*
+        await asyncio.gather(*request_tasks)
+        await asyncio.gather(*item_tasks)
 
     def go(self):
         asyncio.run(self.run())
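The base.py changes above add an item pipeline next to the request queue: callbacks may now yield `dict` items, which are routed to `process_item()` by a pool of `item_speed` consumer coroutines, while yielded `Request` objects keep feeding the request queue. Below is a minimal usage sketch, not part of the diff, built only from the attributes and methods visible in it; the URL is a placeholder.

from coocan import MiniSpider, Response


class ItemDemoSpider(MiniSpider):
    start_urls = ["https://httpbin.org/get"]  # placeholder URL
    max_requests = 5
    item_speed = 10  # overrides the new default of 100 item workers

    def parse(self, response: Response):
        # Yielding a Request schedules another fetch; yielding a dict sends
        # it to process_item(); anything else triggers the warning log.
        yield {"status": response.status_code}

    def process_item(self, item: dict):
        print("Got item:", item)


if __name__ == "__main__":
    ItemDemoSpider().go()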
{coocan-0.4.9.dist-info → coocan-0.5.4.dist-info}/METADATA
CHANGED
@@ -1,14 +1,20 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: coocan
-Version: 0.4.9
-Summary: Air Spider Framework
+Version: 0.5.4
+Summary: Air Async Spider Framework
+Home-page: https://github.com/markadc/coocan
 Author: wauo
-Author-email: markadc@126.com
+Author-email: wauo <markadc@126.com>
+License: MIT
+Project-URL: Homepage, https://github.com/markadc/coocan
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: click>=8.0.0
 Requires-Dist: httpx
 Requires-Dist: loguru
+Dynamic: author
+Dynamic: home-page
+Dynamic: requires-python
 
 # 项目说明
 
coocan-0.5.4.dist-info/RECORD
ADDED
@@ -0,0 +1,26 @@
+_test/crawl_csdn.py,sha256=ap2mOq3ps7KEbqqKWH5uJqIK_IQ8YFSRRAMzpreQvww,1555
+_test/demo.py,sha256=ZxfJzWuNVGhDjhUruyVyZ-BoULHMbzgpnxefLSkheZI,1051
+_test/err_demo.py,sha256=EWGqb00KyB192qv3uxMr6YgOr2zKJQb6gkeFtknMLv8,845
+_test/test_priority.py,sha256=K8JLC-PaVM4ztLZdYFCumDQP5m2hB8qWAIWXTOMMUyM,601
+_test/test_req_delay.py,sha256=35afyHcZk3Gmja9xXXjJSHXnU8WVJGph2ZcTQRxRMNk,479
+_test/test_req_err.py,sha256=magK1BUConCBj8TEC29rzmDCbI2u2XXVcPowL6ttP9g,1025
+coocan/__init__.py,sha256=UqFmE7ucuR_xR3OyyBU8pxqLfCJ5AdH_HsDdTsYPf6g,55
+coocan/gen.py,sha256=J6QWXkBVbiCQqey8i0BDqleRNpBswI8AyvrYmkDVQPw,1028
+coocan/push_project.py,sha256=X2fjtYk1oI0ElcibA9wChLx0lCc8hwSelhUNfkJal5o,220
+coocan/_examples/crawl_csdn_detail.py,sha256=J2hiKHCS7RskQ9UmNMjE8i6braFwGchH6BxtdulV9RM,1892
+coocan/_examples/crawl_csdn_list.py,sha256=ZvhFvBbVXQe-qtXf1T_waXuM4tBleBqbpvzP-5z0RCg,1504
+coocan/_examples/recv_item.py,sha256=iJqPuHZ2FykeleFl0Xr0yPwq4UhCnNw84lCPlYyGFzM,1007
+coocan/_examples/view_local_ip.py,sha256=Sl086xNNuZqFoRM31_gMvcISSa2QoL3OGghECkQktxg,582
+coocan/cmd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+coocan/cmd/cli.py,sha256=FRggXqDeLsGs-7u3zhaokfk0GCpItwqudf14W9zUfYE,2480
+coocan/spider/__init__.py,sha256=kMDCGeqtN50raCzwfCn18s_W8xV6KO_Ny9Xol4I48Ag,58
+coocan/spider/base.py,sha256=9Dgn2920Lb9TZGV0cAZSBMvIWuTMqs9M8ZYspx9W0Io,6342
+coocan/templates/spider.txt,sha256=5UEXUzb0ses_4ctn0b3vgbpUJ7tCde91ul6rp-g7Hxw,480
+coocan/url/__init__.py,sha256=rEMx66XDy5AIJ9mF_2UVzHW5mRLBAWZEyQ3txrZzuZA,102
+coocan/url/request.py,sha256=seZaQXQRvRMIf9WnCp3mAgNA-kxsj9P2JzAvuIt2Dx8,1116
+coocan/url/response.py,sha256=AnC0xsF34q68r62EVlcHYmDH6skm9RBwRHITTb4iBbU,1785
+coocan-0.5.4.dist-info/METADATA,sha256=2tOjNlxrqZ9NUZtBvu6MfsuOlKVGhGl-Bu7nuUUW4Ps,2568
+coocan-0.5.4.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
+coocan-0.5.4.dist-info/entry_points.txt,sha256=hNdk42NPboC1o7s7GzMbpII5t2U2jWrtT5bpvliXRcw,47
+coocan-0.5.4.dist-info/top_level.txt,sha256=WiN3Gh529qzUs0jVvEReeZsKxFguIQKrFlMOjtxGblM,13
+coocan-0.5.4.dist-info/RECORD,,
coocan-0.4.9.dist-info/RECORD
DELETED
@@ -1,15 +0,0 @@
-coocan/__init__.py,sha256=UqFmE7ucuR_xR3OyyBU8pxqLfCJ5AdH_HsDdTsYPf6g,55
-coocan/gen.py,sha256=J6QWXkBVbiCQqey8i0BDqleRNpBswI8AyvrYmkDVQPw,1028
-coocan/cmd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-coocan/cmd/cli.py,sha256=d_sG63wz8RHHa5u7HabOz36yai_D8S3bSr7VprVpTck,2420
-coocan/cmd/templates/spider.txt,sha256=Htd7nOs1EeKbc8LNRUX7xyHkLWz3S0kaTPRW0M3NuUw,480
-coocan/spider/__init__.py,sha256=kMDCGeqtN50raCzwfCn18s_W8xV6KO_Ny9Xol4I48Ag,58
-coocan/spider/base.py,sha256=WMTnMQd7Dnv2aC7rnmkAo_WJu33p9g3GN07A2DnbdLI,5104
-coocan/url/__init__.py,sha256=rEMx66XDy5AIJ9mF_2UVzHW5mRLBAWZEyQ3txrZzuZA,102
-coocan/url/request.py,sha256=seZaQXQRvRMIf9WnCp3mAgNA-kxsj9P2JzAvuIt2Dx8,1116
-coocan/url/response.py,sha256=AnC0xsF34q68r62EVlcHYmDH6skm9RBwRHITTb4iBbU,1785
-coocan-0.4.9.dist-info/METADATA,sha256=WZhIMdcypGrPcE1_bAMxMUa3puvkngstEU4PSTR5UXo,2374
-coocan-0.4.9.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-coocan-0.4.9.dist-info/entry_points.txt,sha256=tOLQN_TVhl_9f2YBASTGBE_ClmG-iQ4rKmyhE2WAOY0,43
-coocan-0.4.9.dist-info/top_level.txt,sha256=VwB-Q4zEljgb9v1Ms1E59B-1pBYORXuhKjgZb-LHOhk,7
-coocan-0.4.9.dist-info/RECORD,,