coocan 0.4.6__tar.gz → 0.4.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coocan-0.4.7/PKG-INFO +90 -0
- coocan-0.4.7/README.md +81 -0
- coocan-0.4.7/coocan.egg-info/PKG-INFO +90 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan.egg-info/SOURCES.txt +1 -0
- {coocan-0.4.6 → coocan-0.4.7}/setup.py +8 -1
- coocan-0.4.6/PKG-INFO +0 -7
- coocan-0.4.6/coocan.egg-info/PKG-INFO +0 -7
- {coocan-0.4.6 → coocan-0.4.7}/coocan/__init__.py +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan/cmd/__init__.py +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan/cmd/cli.py +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan/cmd/templates/spider.txt +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan/gen.py +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan/spider/__init__.py +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan/spider/base.py +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan/url/__init__.py +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan/url/request.py +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan/url/response.py +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan.egg-info/dependency_links.txt +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan.egg-info/entry_points.txt +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan.egg-info/requires.txt +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/coocan.egg-info/top_level.txt +0 -0
- {coocan-0.4.6 → coocan-0.4.7}/setup.cfg +0 -0
coocan-0.4.7/PKG-INFO
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: coocan
|
3
|
+
Version: 0.4.7
|
4
|
+
Summary: Air Spider Framework
|
5
|
+
Author: wauo
|
6
|
+
Author-email: markadc@126.com
|
7
|
+
Requires-Python: >=3.10
|
8
|
+
Description-Content-Type: text/markdown
|
9
|
+
|
10
|
+
# 项目说明
|
11
|
+
|
12
|
+
- 一个非常轻量的异步爬虫框架
|
13
|
+
|
14
|
+
# 项目地址
|
15
|
+
|
16
|
+
- https://github.com/markadc/coocan
|
17
|
+
|
18
|
+
## demo
|
19
|
+
|
20
|
+
- 效果
|
21
|
+
<br>
|
22
|
+

|
23
|
+
|
24
|
+
|
25
|
+
- 代码
|
26
|
+
|
27
|
+
```python
|
28
|
+
import json
|
29
|
+
|
30
|
+
from loguru import logger
|
31
|
+
|
32
|
+
import coocan
|
33
|
+
from coocan import Request, MiniSpider
|
34
|
+
|
35
|
+
|
36
|
+
class CSDNDetailSpider(MiniSpider):
|
37
|
+
start_urls = ['http://www.csdn.net']
|
38
|
+
max_requests = 10
|
39
|
+
|
40
|
+
def middleware(self, request: Request):
|
41
|
+
request.headers["Referer"] = "http://www.csdn.net/"
|
42
|
+
|
43
|
+
def parse(self, response):
|
44
|
+
api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
|
45
|
+
params = {
|
46
|
+
"page": "1",
|
47
|
+
"size": "20",
|
48
|
+
"businessType": "lately",
|
49
|
+
"noMore": "false",
|
50
|
+
"username": "markadc"
|
51
|
+
}
|
52
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
53
|
+
|
54
|
+
def parse_page(self, response, api, params):
|
55
|
+
current_page = params["page"]
|
56
|
+
data = json.loads(response.text)
|
57
|
+
some = data["data"]["list"]
|
58
|
+
|
59
|
+
if not some:
|
60
|
+
logger.warning("没有第 {} 页".format(current_page))
|
61
|
+
return
|
62
|
+
|
63
|
+
for one in some:
|
64
|
+
date = one["formatTime"]
|
65
|
+
name = one["title"]
|
66
|
+
detail_url = one["url"]
|
67
|
+
logger.info(
|
68
|
+
"""
|
69
|
+
{}
|
70
|
+
{}
|
71
|
+
{}
|
72
|
+
""".format(date, name, detail_url)
|
73
|
+
)
|
74
|
+
yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
|
75
|
+
|
76
|
+
logger.info("第 {} 页抓取成功".format(params["page"]))
|
77
|
+
|
78
|
+
# 抓取下一页
|
79
|
+
next_page = int(current_page) + 1
|
80
|
+
params["page"] = str(next_page)
|
81
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
82
|
+
|
83
|
+
def parse_detail(self, response, title):
|
84
|
+
logger.success("{} 已访问 {}".format(response.status_code, title))
|
85
|
+
|
86
|
+
|
87
|
+
if __name__ == '__main__':
|
88
|
+
s = CSDNDetailSpider()
|
89
|
+
s.go()
|
90
|
+
```
|
coocan-0.4.7/README.md
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
# 项目说明
|
2
|
+
|
3
|
+
- 一个非常轻量的异步爬虫框架
|
4
|
+
|
5
|
+
# 项目地址
|
6
|
+
|
7
|
+
- https://github.com/markadc/coocan
|
8
|
+
|
9
|
+
## demo
|
10
|
+
|
11
|
+
- 效果
|
12
|
+
<br>
|
13
|
+

|
14
|
+
|
15
|
+
|
16
|
+
- 代码
|
17
|
+
|
18
|
+
```python
|
19
|
+
import json
|
20
|
+
|
21
|
+
from loguru import logger
|
22
|
+
|
23
|
+
import coocan
|
24
|
+
from coocan import Request, MiniSpider
|
25
|
+
|
26
|
+
|
27
|
+
class CSDNDetailSpider(MiniSpider):
|
28
|
+
start_urls = ['http://www.csdn.net']
|
29
|
+
max_requests = 10
|
30
|
+
|
31
|
+
def middleware(self, request: Request):
|
32
|
+
request.headers["Referer"] = "http://www.csdn.net/"
|
33
|
+
|
34
|
+
def parse(self, response):
|
35
|
+
api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
|
36
|
+
params = {
|
37
|
+
"page": "1",
|
38
|
+
"size": "20",
|
39
|
+
"businessType": "lately",
|
40
|
+
"noMore": "false",
|
41
|
+
"username": "markadc"
|
42
|
+
}
|
43
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
44
|
+
|
45
|
+
def parse_page(self, response, api, params):
|
46
|
+
current_page = params["page"]
|
47
|
+
data = json.loads(response.text)
|
48
|
+
some = data["data"]["list"]
|
49
|
+
|
50
|
+
if not some:
|
51
|
+
logger.warning("没有第 {} 页".format(current_page))
|
52
|
+
return
|
53
|
+
|
54
|
+
for one in some:
|
55
|
+
date = one["formatTime"]
|
56
|
+
name = one["title"]
|
57
|
+
detail_url = one["url"]
|
58
|
+
logger.info(
|
59
|
+
"""
|
60
|
+
{}
|
61
|
+
{}
|
62
|
+
{}
|
63
|
+
""".format(date, name, detail_url)
|
64
|
+
)
|
65
|
+
yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
|
66
|
+
|
67
|
+
logger.info("第 {} 页抓取成功".format(params["page"]))
|
68
|
+
|
69
|
+
# 抓取下一页
|
70
|
+
next_page = int(current_page) + 1
|
71
|
+
params["page"] = str(next_page)
|
72
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
73
|
+
|
74
|
+
def parse_detail(self, response, title):
|
75
|
+
logger.success("{} 已访问 {}".format(response.status_code, title))
|
76
|
+
|
77
|
+
|
78
|
+
if __name__ == '__main__':
|
79
|
+
s = CSDNDetailSpider()
|
80
|
+
s.go()
|
81
|
+
```
|
coocan-0.4.7/coocan.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: coocan
|
3
|
+
Version: 0.4.7
|
4
|
+
Summary: Air Spider Framework
|
5
|
+
Author: wauo
|
6
|
+
Author-email: markadc@126.com
|
7
|
+
Requires-Python: >=3.10
|
8
|
+
Description-Content-Type: text/markdown
|
9
|
+
|
10
|
+
# 项目说明
|
11
|
+
|
12
|
+
- 一个非常轻量的异步爬虫框架
|
13
|
+
|
14
|
+
# 项目地址
|
15
|
+
|
16
|
+
- https://github.com/markadc/coocan
|
17
|
+
|
18
|
+
## demo
|
19
|
+
|
20
|
+
- 效果
|
21
|
+
<br>
|
22
|
+

|
23
|
+
|
24
|
+
|
25
|
+
- 代码
|
26
|
+
|
27
|
+
```python
|
28
|
+
import json
|
29
|
+
|
30
|
+
from loguru import logger
|
31
|
+
|
32
|
+
import coocan
|
33
|
+
from coocan import Request, MiniSpider
|
34
|
+
|
35
|
+
|
36
|
+
class CSDNDetailSpider(MiniSpider):
|
37
|
+
start_urls = ['http://www.csdn.net']
|
38
|
+
max_requests = 10
|
39
|
+
|
40
|
+
def middleware(self, request: Request):
|
41
|
+
request.headers["Referer"] = "http://www.csdn.net/"
|
42
|
+
|
43
|
+
def parse(self, response):
|
44
|
+
api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
|
45
|
+
params = {
|
46
|
+
"page": "1",
|
47
|
+
"size": "20",
|
48
|
+
"businessType": "lately",
|
49
|
+
"noMore": "false",
|
50
|
+
"username": "markadc"
|
51
|
+
}
|
52
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
53
|
+
|
54
|
+
def parse_page(self, response, api, params):
|
55
|
+
current_page = params["page"]
|
56
|
+
data = json.loads(response.text)
|
57
|
+
some = data["data"]["list"]
|
58
|
+
|
59
|
+
if not some:
|
60
|
+
logger.warning("没有第 {} 页".format(current_page))
|
61
|
+
return
|
62
|
+
|
63
|
+
for one in some:
|
64
|
+
date = one["formatTime"]
|
65
|
+
name = one["title"]
|
66
|
+
detail_url = one["url"]
|
67
|
+
logger.info(
|
68
|
+
"""
|
69
|
+
{}
|
70
|
+
{}
|
71
|
+
{}
|
72
|
+
""".format(date, name, detail_url)
|
73
|
+
)
|
74
|
+
yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
|
75
|
+
|
76
|
+
logger.info("第 {} 页抓取成功".format(params["page"]))
|
77
|
+
|
78
|
+
# 抓取下一页
|
79
|
+
next_page = int(current_page) + 1
|
80
|
+
params["page"] = str(next_page)
|
81
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
82
|
+
|
83
|
+
def parse_detail(self, response, title):
|
84
|
+
logger.success("{} 已访问 {}".format(response.status_code, title))
|
85
|
+
|
86
|
+
|
87
|
+
if __name__ == '__main__':
|
88
|
+
s = CSDNDetailSpider()
|
89
|
+
s.go()
|
90
|
+
```
|
{coocan-0.4.6 → coocan-0.4.7}/setup.py
@@ -1,13 +1,20 @@
|
|
1
1
|
from setuptools import setup, find_packages
|
2
2
|
|
3
|
+
with open("README.md", "r", encoding="utf-8") as f:
|
4
|
+
long_description = f.read()
|
5
|
+
|
3
6
|
setup(
|
4
7
|
name="coocan",
|
5
|
-
version="0.4.6",
|
8
|
+
version="0.4.7",
|
6
9
|
author="wauo",
|
7
10
|
author_email="markadc@126.com",
|
8
11
|
description="Air Spider Framework",
|
9
12
|
packages=find_packages(),
|
10
13
|
python_requires=">=3.10",
|
14
|
+
|
15
|
+
long_description=long_description,
|
16
|
+
long_description_content_type="text/markdown",
|
17
|
+
|
11
18
|
install_requires=[
|
12
19
|
'click>=8.0.0', 'httpx', 'loguru'
|
13
20
|
],
|
coocan-0.4.6/PKG-INFO
DELETED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|