crawlo 1.0.4__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +25 -9
- crawlo/__version__.py +1 -1
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +158 -158
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +57 -57
- crawlo/crawler.py +424 -242
- crawlo/downloader/__init__.py +78 -78
- crawlo/downloader/aiohttp_downloader.py +200 -259
- crawlo/downloader/cffi_downloader.py +277 -0
- crawlo/downloader/httpx_downloader.py +246 -187
- crawlo/event.py +11 -11
- crawlo/exceptions.py +73 -64
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/extension/logging_extension.py +35 -0
- crawlo/filters/__init__.py +37 -37
- crawlo/filters/aioredis_filter.py +150 -150
- crawlo/filters/memory_filter.py +202 -202
- crawlo/items/__init__.py +62 -62
- crawlo/items/items.py +115 -119
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -140
- crawlo/middleware/proxy.py +246 -0
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +90 -90
- crawlo/network/__init__.py +7 -7
- crawlo/network/request.py +203 -204
- crawlo/network/response.py +166 -166
- crawlo/pipelines/__init__.py +13 -13
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_batch_pipline.py +273 -134
- crawlo/pipelines/mysql_pipeline.py +195 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +169 -94
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +41 -36
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +106 -106
- crawlo/task_manager.py +27 -27
- crawlo/templates/item_template.tmpl +21 -21
- crawlo/templates/project_template/main.py +32 -32
- crawlo/templates/project_template/setting.py +189 -189
- crawlo/templates/spider_template.tmpl +30 -30
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/concurrency_manager.py +124 -124
- crawlo/utils/date_tools.py +233 -177
- crawlo/utils/db_helper.py +344 -0
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/log.py +129 -39
- crawlo/utils/pqueue.py +173 -173
- crawlo/utils/project.py +59 -59
- crawlo/utils/request.py +267 -122
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +5 -303
- crawlo/utils/url.py +39 -39
- {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/METADATA +49 -48
- crawlo-1.0.5.dist-info/RECORD +84 -0
- {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/top_level.txt +1 -0
- examples/__init__.py +0 -0
- examples/gxb/__init__.py +0 -0
- examples/gxb/items.py +36 -0
- examples/gxb/run.py +15 -0
- examples/gxb/settings.py +71 -0
- examples/gxb/spider/__init__.py +0 -0
- examples/gxb/spider/miit_spider.py +180 -0
- examples/gxb/spider/telecom_device_licenses.py +129 -0
- tests/__init__.py +7 -7
- tests/test_proxy_health_check.py +33 -0
- tests/test_proxy_middleware_integration.py +137 -0
- tests/test_proxy_providers.py +57 -0
- tests/test_proxy_stats.py +20 -0
- tests/test_proxy_strategies.py +60 -0
- crawlo/downloader/playwright_downloader.py +0 -161
- crawlo-1.0.4.dist-info/RECORD +0 -79
- tests/baidu_spider/__init__.py +0 -7
- tests/baidu_spider/demo.py +0 -94
- tests/baidu_spider/items.py +0 -25
- tests/baidu_spider/middleware.py +0 -49
- tests/baidu_spider/pipeline.py +0 -55
- tests/baidu_spider/request_fingerprints.txt +0 -9
- tests/baidu_spider/run.py +0 -27
- tests/baidu_spider/settings.py +0 -80
- tests/baidu_spider/spiders/__init__.py +0 -7
- tests/baidu_spider/spiders/bai_du.py +0 -61
- tests/baidu_spider/spiders/sina.py +0 -79
- {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/WHEEL +0 -0
- {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/entry_points.txt +0 -0
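Version 1.0.5 adds several new modules — a curl-cffi based downloader (crawlo/downloader/cffi_downloader.py), a proxy middleware (crawlo/middleware/proxy.py) with accompanying tests, and a database helper (crawlo/utils/db_helper.py) — while the Playwright downloader is removed. The diff only lists the files, so the snippet below is a minimal, hypothetical sketch of how a project settings.py might opt into the new components, following the DOWNLOADER / MIDDLEWARES conventions visible in the deleted baidu_spider settings further down. The class names CurlCffiDownloader and ProxyMiddleware are assumptions inferred from the file names and are not confirmed by this diff.

# Hypothetical settings.py sketch -- module paths come from the file list above,
# but the class names below are guesses based on the file names, not verified APIs.
DOWNLOADER = "crawlo.downloader.cffi_downloader.CurlCffiDownloader"  # assumed class name

MIDDLEWARES = [
    'crawlo.middleware.proxy.ProxyMiddleware',  # assumed class name, new in 1.0.5
    'crawlo.middleware.retry.RetryMiddleware',
    'crawlo.middleware.default_header.DefaultHeaderMiddleware',
]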
tests/baidu_spider/settings.py
DELETED

@@ -1,80 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
-
- PROJECT_NAME = 'baidu_spider'
-
- CONCURRENCY = 4
-
- USE_SESSION = True
-
- # Download delay
- DOWNLOAD_DELAY = 0.5
- RANDOMNESS = False
-
- # --------------------------------------------------- Shared MySQL configuration ---------------------------------------------------
- MYSQL_HOST = '43.139.14.225'
- MYSQL_PORT = 3306
- MYSQL_USER = 'picker'
- MYSQL_PASSWORD = 'kmcNbbz6TbSihttZ'
- MYSQL_DB = 'stock_pro'
- MYSQL_TABLE = 'articles'  # Optional; defaults to the spider name
- MYSQL_BATCH_SIZE = 500
-
- # asyncmy-specific settings
- MYSQL_POOL_MIN = 5   # Minimum number of pooled connections
- MYSQL_POOL_MAX = 20  # Maximum number of pooled connections
-
- # Downloader selection
- # DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
-
- MIDDLEWARES = [
-     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
-     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
-     'crawlo.middleware.response_filter.ResponseFilterMiddleware',
-     'crawlo.middleware.retry.RetryMiddleware',
-     'crawlo.middleware.response_code.ResponseCodeMiddleware',
-     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
-     # 'baidu_spider.middleware.TestMiddleWare',
-     # 'baidu_spider.middleware.TestMiddleWare2'
- ]
-
- EXTENSIONS = [
-     'crawlo.extension.log_interval.LogIntervalExtension',
-     'crawlo.extension.log_stats.LogStats',
- ]
-
- PIPELINES = [
-     'crawlo.pipelines.console_pipeline.ConsolePipeline',
-     'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # or AiomysqlMySQLPipeline
-     # 'crawlo.pipelines.mysql_batch_pipline.AsyncmyMySQLPipeline',  # or AiomysqlMySQLPipeline
-     # 'baidu_spider.pipeline.TestPipeline',
-     # 'baidu_spider.pipeline.MongoPipeline',
- ]
-
- USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36'
- DEFAULT_HEADERS = {
-     "accept": "application/json, text/javascript, */*; q=0.01",
-     "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
-     "cache-control": "no-cache",
-     "pragma": "no-cache",
-     "priority": "u=1, i",
-     "sec-ch-ua": "\"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
-     "sec-ch-ua-mobile": "?0",
-     "sec-ch-ua-platform": "\"macOS\"",
-     "sec-fetch-dest": "empty",
-     "sec-fetch-mode": "cors",
-     "sec-fetch-site": "same-origin",
-     # "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
-     "x-requested-with": "XMLHttpRequest"
- }
-
- # -------------------------------------- DB ---------------------------------------------
- Mongo_Params = ''
- MONGODB_DB = 'news'
-
- REDIS_TTL = 0
- CLEANUP_FP = False
-
- FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
- # FILTER_CLASS = 'crawlo.filters.redis_filter.RedisFilter'
- # FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFileFilter'
tests/baidu_spider/spiders/bai_du.py
DELETED

@@ -1,61 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time   : 2025-02-05 13:05
- # @Author : oscar
- # @Desc   : None
- """
- import asyncio
- from crawlo import Request
- from crawlo.spider import Spider
-
- from items import BauDuItem
-
-
- class BaiDuSpider(Spider):
-     start_urls = ["https://www.baidu.com/", "https://www.baidu.com/"]
-
-     custom_settings = {
-         'CONCURRENCY': 1
-     }
-
-     name = "bai_du"
-
-     # headers = {
-     #     "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
-     # }
-     #
-     user_gent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
-
-     async def parse(self, response):
-         for i in range(5):
-             url = f"https://www.baidu.com"
-             # url = f"https://www.httpbin.org/404"
-             r = Request(url=url, callback=self.parse_page, dont_filter=True)
-             yield r
-
-     async def parse_page(self, response):
-         for i in range(5):
-             url = f"https://www.baidu.com"
-             meta = {'test': 'hhhh'}
-             r = Request(url=url, callback=self.parse_detail, meta=meta, dont_filter=False)
-             yield r
-
-     def parse_detail(self, response):
-         item = BauDuItem()
-         item['title'] = response.xpath('//title/text()').get()
-
-         item['url'] = response.url
-
-         yield item
-
-     async def spider_opened(self):
-         pass
-
-     async def spider_closed(self):
-         pass
-
-
- if __name__ == '__main__':
-     b = BaiDuSpider()
-     b.start_requests()
tests/baidu_spider/spiders/sina.py
DELETED

@@ -1,79 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time   : 2025-02-05 13:05
- # @Author : oscar
- # @Desc   : None
- """
- import time
-
- from crawlo import Request
- from crawlo.spider import Spider
- from crawlo.utils.date_tools import timestamp_to_datetime, format_datetime
-
- from tests.baidu_spider.items import ArticleItem
-
-
- class SinaSpider(Spider):
-     # Take the current timestamp and subtract 10 minutes (600 seconds)
-     current_time_minus_10min = int(time.time()) - 6000
-     # Build the URL
-     url = f'https://news.10jqka.com.cn/tapp/news/push/stock/?page=1&tag=&track=website&ctime={current_time_minus_10min}'
-
-     start_urls = [url]
-     name = 'sina'
-     # mysql_table = 'news_10jqka'
-
-     allowed_domains = ['*']
-
-     def start_requests(self):
-         for url in self.start_urls:
-             yield Request(url=url, callback=self.parse, dont_filter=True)
-
-     async def parse(self, response):
-         jsonp_str = response.json()
-         rows = jsonp_str.get('data', {}).get('list', [])
-         for row in rows:
-             article_id = row.get('id')
-             title = row.get('title')
-             digest = row.get('digest')
-             short = row.get('short')
-             detail_url = row.get('url')
-             tag = row.get('tag')
-             ctime = row.get('ctime')
-             source = row.get('source')
-             meta = {
-                 'article_id': article_id,
-                 'title': title,
-                 'digest': digest,
-                 'short': short,
-                 'detail_url': detail_url,
-                 'source': source,
-                 'tag': tag,
-                 'ctime': timestamp_to_datetime(int(ctime))
-             }
-
-             yield Request(url=detail_url, callback=self.parse_detail, encoding='gbk', meta=meta)
-
-     @staticmethod
-     async def parse_detail(response):
-         item = ArticleItem()
-         meta = response.meta
-         content = ''.join(response.xpath('//*[@id="contentApp"]/p/text()').extract()).strip()
-         ctime = meta.get('ctime')
-         item['article_id'] = meta.get('article_id')
-         item['title'] = meta.get('title')
-         item['digest'] = content
-         item['short'] = meta.get('short')
-         item['url'] = meta.get('detail_url')
-         item['tag'] = meta.get('tag').strip()
-         item['ctime'] = format_datetime(ctime)
-         item['source'] = meta.get('source')
-
-         yield item
-
-     async def spider_opened(self):
-         pass
-
-     async def spider_closed(self):
-         pass
{crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/WHEEL
File without changes

{crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/entry_points.txt
File without changes