crawlo 1.0.2-py3-none-any.whl → 1.0.4-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +9 -6
- crawlo/__version__.py +1 -2
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +158 -158
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +57 -59
- crawlo/crawler.py +242 -222
- crawlo/downloader/__init__.py +78 -78
- crawlo/downloader/aiohttp_downloader.py +259 -96
- crawlo/downloader/httpx_downloader.py +187 -48
- crawlo/downloader/playwright_downloader.py +160 -160
- crawlo/event.py +11 -11
- crawlo/exceptions.py +64 -64
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/filters/__init__.py +37 -37
- crawlo/filters/aioredis_filter.py +150 -130
- crawlo/filters/memory_filter.py +202 -203
- crawlo/items/__init__.py +62 -62
- crawlo/items/items.py +118 -118
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +140 -140
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +90 -90
- crawlo/network/__init__.py +7 -7
- crawlo/network/request.py +204 -233
- crawlo/network/response.py +166 -162
- crawlo/pipelines/__init__.py +13 -13
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_batch_pipline.py +133 -133
- crawlo/pipelines/mysql_pipeline.py +195 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +94 -89
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +36 -36
- crawlo/stats_collector.py +59 -47
- crawlo/subscriber.py +106 -106
- crawlo/task_manager.py +27 -27
- crawlo/templates/item_template.tmpl +21 -21
- crawlo/templates/project_template/main.py +32 -32
- crawlo/templates/project_template/setting.py +189 -189
- crawlo/templates/spider_template.tmpl +30 -30
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/concurrency_manager.py +124 -124
- crawlo/utils/date_tools.py +177 -177
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/log.py +39 -39
- crawlo/utils/pqueue.py +173 -173
- crawlo/utils/project.py +59 -59
- crawlo/utils/request.py +122 -85
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +302 -302
- crawlo/utils/url.py +39 -39
- {crawlo-1.0.2.dist-info → crawlo-1.0.4.dist-info}/METADATA +48 -48
- crawlo-1.0.4.dist-info/RECORD +79 -0
- {crawlo-1.0.2.dist-info → crawlo-1.0.4.dist-info}/top_level.txt +1 -0
- tests/__init__.py +7 -0
- tests/baidu_spider/__init__.py +7 -0
- tests/baidu_spider/demo.py +94 -0
- tests/baidu_spider/items.py +25 -0
- tests/baidu_spider/middleware.py +49 -0
- tests/baidu_spider/pipeline.py +55 -0
- tests/baidu_spider/request_fingerprints.txt +9 -0
- tests/baidu_spider/run.py +27 -0
- tests/baidu_spider/settings.py +80 -0
- tests/baidu_spider/spiders/__init__.py +7 -0
- tests/baidu_spider/spiders/bai_du.py +61 -0
- tests/baidu_spider/spiders/sina.py +79 -0
- crawlo/filters/redis_filter.py +0 -120
- crawlo-1.0.2.dist-info/RECORD +0 -68
- {crawlo-1.0.2.dist-info → crawlo-1.0.4.dist-info}/WHEEL +0 -0
- {crawlo-1.0.2.dist-info → crawlo-1.0.4.dist-info}/entry_points.txt +0 -0
tests/baidu_spider/settings.py
ADDED
@@ -0,0 +1,80 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+
+PROJECT_NAME = 'baidu_spider'
+
+CONCURRENCY = 4
+
+USE_SESSION = True
+
+# Download delay
+DOWNLOAD_DELAY = 0.5
+RANDOMNESS = False
+
+# --------------------------------------------------- Shared MySQL configuration -----------------------------------------------------
+MYSQL_HOST = '43.139.14.225'
+MYSQL_PORT = 3306
+MYSQL_USER = 'picker'
+MYSQL_PASSWORD = 'kmcNbbz6TbSihttZ'
+MYSQL_DB = 'stock_pro'
+MYSQL_TABLE = 'articles'  # optional; defaults to the spider name
+MYSQL_BATCH_SIZE = 500
+
+# asyncmy-specific settings
+MYSQL_POOL_MIN = 5   # minimum connections in the pool
+MYSQL_POOL_MAX = 20  # maximum connections in the pool
+
+# Downloader selection
+# DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
+
+MIDDLEWARES = [
+    'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+    'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+    'crawlo.middleware.response_filter.ResponseFilterMiddleware',
+    'crawlo.middleware.retry.RetryMiddleware',
+    'crawlo.middleware.response_code.ResponseCodeMiddleware',
+    'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+    # 'baidu_spider.middleware.TestMiddleWare',
+    # 'baidu_spider.middleware.TestMiddleWare2'
+]
+
+EXTENSIONS = [
+    'crawlo.extension.log_interval.LogIntervalExtension',
+    'crawlo.extension.log_stats.LogStats',
+]
+
+PIPELINES = [
+    'crawlo.pipelines.console_pipeline.ConsolePipeline',
+    'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # or AiomysqlMySQLPipeline
+    # 'crawlo.pipelines.mysql_batch_pipline.AsyncmyMySQLPipeline',  # or AiomysqlMySQLPipeline
+    # 'baidu_spider.pipeline.TestPipeline',
+    # 'baidu_spider.pipeline.MongoPipeline',
+]
+
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36'
+DEFAULT_HEADERS = {
+    "accept": "application/json, text/javascript, */*; q=0.01",
+    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "cache-control": "no-cache",
+    "pragma": "no-cache",
+    "priority": "u=1, i",
+    "sec-ch-ua": "\"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": "\"macOS\"",
+    "sec-fetch-dest": "empty",
+    "sec-fetch-mode": "cors",
+    "sec-fetch-site": "same-origin",
+    # "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
+    "x-requested-with": "XMLHttpRequest"
+}
+
+# -------------------------------------- DB ---------------------------------------------
+Mongo_Params = ''
+MONGODB_DB = 'news'
+
+REDIS_TTL = 0
+CLEANUP_FP = False
+
+FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
+# FILTER_CLASS = 'crawlo.filters.redis_filter.RedisFilter'
+# FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFileFilter'
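The settings module above is an ordinary Python file of uppercase constants, and the spider in the next hunk narrows it with custom_settings = {'CONCURRENCY': 1}. As a rough, hypothetical sketch of how such a module is commonly flattened into a dict and overridden per spider (this is an illustration, not crawlo's actual SettingManager):

# Hypothetical sketch: flatten an uppercase-constant settings module into a
# dict and let spider-level custom_settings override it. Not crawlo code.
import types

def settings_to_dict(module: types.ModuleType) -> dict:
    """Collect the uppercase names (CONCURRENCY, PIPELINES, ...) into a dict."""
    return {name: value for name, value in vars(module).items() if name.isupper()}

def apply_overrides(base: dict, custom_settings: dict) -> dict:
    """Spider-level custom_settings win over the project-wide settings."""
    merged = dict(base)
    merged.update(custom_settings)
    return merged

# Usage sketch (import path assumed):
# import tests.baidu_spider.settings as settings_module
# settings = apply_overrides(settings_to_dict(settings_module), {'CONCURRENCY': 1})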
tests/baidu_spider/spiders/bai_du.py
ADDED
@@ -0,0 +1,61 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-02-05 13:05
+# @Author : oscar
+# @Desc : None
+"""
+import asyncio
+from crawlo import Request
+from crawlo.spider import Spider
+
+from items import BauDuItem
+
+
+class BaiDuSpider(Spider):
+    start_urls = ["https://www.baidu.com/", "https://www.baidu.com/"]
+
+    custom_settings = {
+        'CONCURRENCY': 1
+    }
+
+    name = "bai_du"
+
+    # headers = {
+    #     "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
+    # }
+    #
+    user_gent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
+
+    async def parse(self, response):
+        for i in range(5):
+            url = f"https://www.baidu.com"
+            # url = f"https://www.httpbin.org/404"
+            r = Request(url=url, callback=self.parse_page, dont_filter=True)
+            yield r
+
+    async def parse_page(self, response):
+        for i in range(5):
+            url = f"https://www.baidu.com"
+            meta = {'test': 'hhhh'}
+            r = Request(url=url, callback=self.parse_detail, meta=meta, dont_filter=False)
+            yield r
+
+    def parse_detail(self, response):
+        item = BauDuItem()
+        item['title'] = response.xpath('//title/text()').get()
+
+        item['url'] = response.url
+
+        yield item
+
+    async def spider_opened(self):
+        pass
+
+    async def spider_closed(self):
+        pass
+
+
+if __name__ == '__main__':
+    b = BaiDuSpider()
+    b.start_requests()
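The parse callbacks in bai_du.py are declared as async generators (async def with yield). A minimal, framework-independent sketch of how such a callback is driven with async for, assuming nothing about crawlo's engine internals:

# Illustration only: consuming an "async def ... yield" callback with async for.
# fake_parse mimics the shape of BaiDuSpider.parse; it is not crawlo code.
import asyncio

async def fake_parse(response):
    # An async generator that yields follow-up work derived from a response.
    for i in range(3):
        yield f"follow-up request {i} derived from {response}"

async def main():
    async for request in fake_parse("https://www.baidu.com/"):
        print(request)

asyncio.run(main())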
tests/baidu_spider/spiders/sina.py
ADDED
@@ -0,0 +1,79 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-02-05 13:05
+# @Author : oscar
+# @Desc : None
+"""
+import time
+
+from crawlo import Request
+from crawlo.spider import Spider
+from crawlo.utils.date_tools import timestamp_to_datetime, format_datetime
+
+from tests.baidu_spider.items import ArticleItem
+
+
+class SinaSpider(Spider):
+    # Take the current timestamp and subtract 10 minutes (600 seconds)
+    current_time_minus_10min = int(time.time()) - 6000
+    # Build the URL
+    url = f'https://news.10jqka.com.cn/tapp/news/push/stock/?page=1&tag=&track=website&ctime={current_time_minus_10min}'
+
+    start_urls = [url]
+    name = 'sina'
+    # mysql_table = 'news_10jqka'
+
+    allowed_domains = ['*']
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield Request(url=url, callback=self.parse, dont_filter=True)
+
+    async def parse(self, response):
+        jsonp_str = response.json()
+        rows = jsonp_str.get('data', {}).get('list', [])
+        for row in rows:
+            article_id = row.get('id')
+            title = row.get('title')
+            digest = row.get('digest')
+            short = row.get('short')
+            detail_url = row.get('url')
+            tag = row.get('tag')
+            ctime = row.get('ctime')
+            source = row.get('source')
+            meta = {
+                'article_id': article_id,
+                'title': title,
+                'digest': digest,
+                'short': short,
+                'detail_url': detail_url,
+                'source': source,
+                'tag': tag,
+                'ctime': timestamp_to_datetime(int(ctime))
+            }
+
+            yield Request(url=detail_url, callback=self.parse_detail, encoding='gbk', meta=meta)
+
+    @staticmethod
+    async def parse_detail(response):
+        item = ArticleItem()
+        meta = response.meta
+        content = ''.join(response.xpath('//*[@id="contentApp"]/p/text()').extract()).strip()
+        ctime = meta.get('ctime')
+        item['article_id'] = meta.get('article_id')
+        item['title'] = meta.get('title')
+        item['digest'] = content
+        item['short'] = meta.get('short')
+        item['url'] = meta.get('detail_url')
+        item['tag'] = meta.get('tag').strip()
+        item['ctime'] = format_datetime(ctime)
+        item['source'] = meta.get('source')
+
+        yield item
+
+    async def spider_opened(self):
+        pass
+
+    async def spider_closed(self):
+        pass
crawlo/filters/redis_filter.py
DELETED
@@ -1,120 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-import redis
-
-from crawlo import Request
-from crawlo.filters import BaseFilter
-from crawlo.utils.log import get_logger
-from crawlo.utils.request import request_fingerprint
-
-
-class RedisFilter(BaseFilter):
-    """Synchronous request-deduplication filter backed by a Redis set"""
-
-    def __init__(
-        self,
-        redis_key: str,
-        client: redis.Redis,
-        stats: dict,
-        debug: bool,
-        log_level: str,
-        save_fp: bool
-    ):
-        """
-        Initialize the filter
-
-        :param redis_key: Redis key used for storage
-        :param client: Redis client instance
-        :param stats: stats dictionary
-        :param debug: whether debug mode is enabled
-        :param log_level: log level
-        :param save_fp: whether to keep fingerprint data
-        """
-        self.logger = get_logger(self.__class__.__name__, log_level)
-        super().__init__(self.logger, stats, debug)
-
-        self.redis_key = redis_key
-        self.redis = client
-        self.save_fp = save_fp
-
-    @classmethod
-    def create_instance(cls, crawler) -> 'BaseFilter':
-        """Factory method for creating an instance"""
-        redis_url = crawler.settings.get('REDIS_URL', 'redis://localhost:6379')
-        decode_responses = crawler.settings.get_bool('DECODE_RESPONSES', True)
-
-        try:
-            # Connection-pool configuration
-            redis_client = redis.from_url(
-                redis_url,
-                decode_responses=decode_responses,
-                socket_timeout=5,  # timeout settings
-                socket_connect_timeout=5,
-                max_connections=20  # pool size
-            )
-            # Verify the connection works
-            redis_client.ping()
-        except redis.RedisError as e:
-            raise RuntimeError(f"Redis connection failed: {str(e)}")
-
-        return cls(
-            redis_key=f"{crawler.settings.get('PROJECT_NAME')}:{crawler.settings.get('REDIS_KEY', 'request_fingerprints')}",
-            client=redis_client,
-            stats=crawler.stats,
-            save_fp=crawler.settings.get_bool('SAVE_FP', False),
-            debug=crawler.settings.get_bool('FILTER_DEBUG', False),
-            log_level=crawler.settings.get('LOG_LEVEL', 'INFO')
-        )
-
-    def requested(self, request: Request) -> bool:
-        """
-        Check whether the request has been seen before
-
-        :param request: request object
-        :return: whether it is a duplicate
-        """
-        fp = request_fingerprint(request)
-        try:
-            if self.redis.sismember(self.redis_key, fp):
-                self.logger.debug(f"Duplicate request: {fp}")
-                return True
-
-            self.add_fingerprint(fp)
-            return False
-        except redis.RedisError as e:
-            self.logger.error(f"Redis operation failed: {str(e)}")
-            raise
-
-    def add_fingerprint(self, fp: str) -> None:
-        """Add a fingerprint to the Redis set"""
-        try:
-            self.redis.sadd(self.redis_key, fp)
-            self.logger.debug(f"New fingerprint: {fp}")
-        except redis.RedisError as e:
-            self.logger.error(f"Failed to add fingerprint: {str(e)}")
-            raise
-
-    def __contains__(self, item) -> bool:
-        """Support the `in` operator (must return a bool)"""
-        try:
-            # Explicitly convert Redis's 0/1 return value to bool
-            return bool(self.redis.sismember(self.redis_key, item))
-        except redis.RedisError as e:
-            self.logger.error(f"Redis query failed: {str(e)}")
-            raise
-
-    def close(self) -> None:
-        """Synchronous cleanup (note: not the async `closed`)"""
-        if not self.save_fp:
-            try:
-                count = self.redis.delete(self.redis_key)
-                self.logger.info(f"Cleared Redis key {self.redis_key}, entries removed: {count}")
-            except redis.RedisError as e:
-                self.logger.error(f"Cleanup failed: {str(e)}")
-            finally:
-                # The synchronous client's connection pool must be closed manually
-                self.redis.close()
-
-    async def closed(self):
-        """Synchronous implementation kept for the async interface"""
-        self.close()
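The deleted RedisFilter above implemented the classic Redis SET dedup pattern: SISMEMBER to test a request fingerprint, SADD to record it, and DELETE on close when fingerprints are not kept. A standalone redis-py sketch of that pattern, with placeholder key name and URL (not crawlo code):

# Illustration of SET-based request dedup; assumes a reachable Redis server.
import redis

client = redis.from_url("redis://localhost:6379", decode_responses=True)
FINGERPRINT_KEY = "example:request_fingerprints"  # placeholder key

def seen_before(fp: str) -> bool:
    """Return True if the fingerprint is already recorded, otherwise record it."""
    if client.sismember(FINGERPRINT_KEY, fp):
        return True
    client.sadd(FINGERPRINT_KEY, fp)
    return False

def cleanup() -> None:
    """Drop the fingerprint set, as RedisFilter.close() did when save_fp was False."""
    client.delete(FINGERPRINT_KEY)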
crawlo-1.0.2.dist-info/RECORD
DELETED
@@ -1,68 +0,0 @@
-crawlo/__init__.py,sha256=BoRtaB19VFlByP3JKzXQbmg4Jb6i6yVnpTR3jCSrCig,208
-crawlo/__version__.py,sha256=FlR3yFykEvKzEITpEQq_qx6Uq29lYlhxcnSHeRP1LgI,23
-crawlo/crawler.py,sha256=XLiDyFyoiJr8BzhLbCB15zbccVHQiiLGDM1zFUkdrAI,8544
-crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
-crawlo/exceptions.py,sha256=trxM2c0jw50QsGSoFAKC2RrKpapOFHQDq0wQuLWqmKE,980
-crawlo/stats_collector.py,sha256=jhAW8k0SzjqelkpiWpfGmMw2DBkgTjpwnObqTNDOp6A,1286
-crawlo/subscriber.py,sha256=udlHeTR0ymGQhCDxVUGwUzeeeR4TYCEJrJwFnkgr0cU,3836
-crawlo/task_manager.py,sha256=D9m-nqnGj-FZPtGk4CdwZX3Gw7IWyYvTS7CHpRGWc_w,748
-crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
-crawlo/core/engine.py,sha256=JFHooPp-5cfHSyxEh87nOOR5NMaPLVDfNSqAsbtx4PM,6030
-crawlo/core/processor.py,sha256=oHLs-cno0bJGTNc9NGD2S7_2-grI3ruvggO0SY2mf3Q,1180
-crawlo/core/scheduler.py,sha256=ZwPoU_QRjs9wwrxdt-MGPwsSmKhvvhgmcnBllDrXnhg,2014
-crawlo/downloader/__init__.py,sha256=72u2Hef4HaMfs9VCqEjbMtiaRXbaXmgNiJn6qy09LHs,2384
-crawlo/downloader/aiohttp_downloader.py,sha256=4C2BDloKzwss16kfD7tH0WPugPbSSFxl-5-_DLWB0vM,3676
-crawlo/downloader/httpx_downloader.py,sha256=ra6Ae_lv8pNyvLzPQYBgTNuBdMVBYi86kNt2OdZlcSo,1704
-crawlo/downloader/playwright_downloader.py,sha256=mEGlSd6A6sN0Wyq-TDkownIElOgxnwVfY3rS5wtLoYY,6726
-crawlo/extension/__init__.py,sha256=O2BVK1U3WwmurZb-PaYVz3g1tZ_iYUjCwilmUKf6844,1170
-crawlo/extension/log_interval.py,sha256=FOWeTOuWtOpCz2UPV5F_--QIa8yomltSpjxbw3F7bkU,1971
-crawlo/extension/log_stats.py,sha256=-V7ipdIfYMQdp1ZDc4kvNEAIHIR74U1ZHV5FhlLyGGU,1786
-crawlo/filters/__init__.py,sha256=9fJQRVkxWWPChajYbAGe1O6UYB639xWt0hiLUGBs4hQ,1014
-crawlo/filters/aioredis_filter.py,sha256=H_HAFfE9mHnPrzQcjyXXCseQ77iN4nT9a3lukHiu8M8,4874
-crawlo/filters/memory_filter.py,sha256=pk2o0kbX0zrGLJ6poKhSa-cfOmsp68fA65hXEGQCQ5M,6895
-crawlo/filters/redis_filter.py,sha256=m1nRsf_3slnWSb80RtTEURi5kwjdI0xotoFxnscx974,4211
-crawlo/items/__init__.py,sha256=o5BSpS1Byivr-bpdfFgc9GCoGi8ThNuPJiTW7lz85-I,2125
-crawlo/items/items.py,sha256=myOOjWaSByKW0r8SxIQ0bxS3PXjUDE1c-Pe38z6fSF8,4108
-crawlo/middleware/__init__.py,sha256=PSwpRLdBUopaQzBp1S0zK_TZbrRagQ4yzvgyLy4tBk8,570
-crawlo/middleware/default_header.py,sha256=OVW4vpRPp3Y6qYXtiEYlGqVjCYcbuv1Iecc7zEgwCsI,1099
-crawlo/middleware/download_delay.py,sha256=P2eyAJXwdLdC4yYuLhvKZVa1b5YQvQD0GpsR8aDW8-8,994
-crawlo/middleware/middleware_manager.py,sha256=T4axTY89Z0BOwaWDWcUTABeDNTvyPFiyrbwj-H4sbSA,6629
-crawlo/middleware/request_ignore.py,sha256=jdybWFVXuA5YsAPfZJFzLTWkYhEAewNgxuhFqczPW9M,1027
-crawlo/middleware/response_code.py,sha256=vgXWv3mMu_v9URvhKA9myIFH4u6L4EwNme80wL4DCGc,677
-crawlo/middleware/response_filter.py,sha256=O2gkV_Yjart8kmmXTGzrtZnb_Uuefap4uL2Cu01iRs4,863
-crawlo/middleware/retry.py,sha256=9SnE7l3Nhh143AqCiL0gfE6dl_gF1Kr6CjoNxvMH_Ps,3465
-crawlo/network/__init__.py,sha256=DVz1JpasjxCgOlXvm76gz-S18OXr4emG_J39yi5iVuA,130
-crawlo/network/request.py,sha256=yUAL6oecm1TniD9dsBmRedEaEHzh3rtr_6p3dMK2EfQ,8195
-crawlo/network/response.py,sha256=amnLEExKq11dLh6m_YdSqdKLl0srh1eeY5uPz15fwyo,6055
-crawlo/pipelines/__init__.py,sha256=IbXJ6B8LqxVVjeLNgL_12AxV6zbV8hNRQxAfMLjjSaw,273
-crawlo/pipelines/console_pipeline.py,sha256=bwe5hZgaVSWmh3R8XpOaaeAjJme-Ttrpo6G6f1cnLIg,1287
-crawlo/pipelines/mongo_pipeline.py,sha256=Yr48D0T61-_Y-EpgWXf7BUn9w8e-Pj5P07QDSPZ0pYU,4558
-crawlo/pipelines/mysql_batch_pipline.py,sha256=7KXd0IUV0h3IViD8R0iruyWv5XdZR1pANB8EY9z6iMI,5022
-crawlo/pipelines/mysql_pipeline.py,sha256=TzyaBg1oBj9pgzuVUrb5VVtkiwZH6gqP_6IYM2QCkLs,8052
-crawlo/pipelines/pipeline_manager.py,sha256=k-Rg0os0Havrov99D-Jn3ROpnz154K30tf7aARE5W3k,2174
-crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
-crawlo/settings/default_settings.py,sha256=JS1QKYe7jkdFlOjqZ-eOeOcVS3AXCZynoNH95GuEnds,2556
-crawlo/settings/setting_manager.py,sha256=4xXOzKwZCgAp8ybwvVcs2R--CsOD7c6dBIkj6DJHB3c,2998
-crawlo/spider/__init__.py,sha256=1tmKkr2-oJi0w9r2ho9nn6Z_VDn18pjXHXU0Hv2eheY,941
-crawlo/templates/item_template.tmpl,sha256=0bGFnlwJRqstxMNEj1H_pEICybwoueRhs31QaDPXrS0,372
-crawlo/templates/spider_template.tmpl,sha256=JzphuA87Yl_F1xR9zOIi_ZSazyT8eSNPxYYPMv3Uiko,835
-crawlo/templates/project_template/main.py,sha256=BcCP294ycCPsHi_AMN7OAJtcrLvQdf91meH93PqbQgs,626
-crawlo/templates/project_template/setting.py,sha256=Ce4nMbrdhL1ioRdTcB0vV_vK_50cfnwVqSvt49QsNkA,9395
-crawlo/templates/project_template/items/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlo/templates/project_template/spiders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlo/utils/__init__.py,sha256=XCYumI8wJ1jU_Myn_K0LT-LVygPDUCdETCbXM3EWvlo,130
-crawlo/utils/concurrency_manager.py,sha256=o-_cfeUHdlBOM3eAXF857MtekSrRcVTBJ2jWZvY6weQ,5230
-crawlo/utils/date_tools.py,sha256=Y7pnGNn1-5vkiHtydAgmQ-qR3pSO30k5WEYigOPifPQ,5496
-crawlo/utils/func_tools.py,sha256=y-TYP9H3X67MS_foWy9Z2LIS6GP7Y4Cy3T168ulq3Jc,2451
-crawlo/utils/log.py,sha256=LU0J3boPCL-Kynx3wR_CAryRgScNmPPn4pBitLrrsX4,1028
-crawlo/utils/pqueue.py,sha256=HDgX4HAkc7RqYUtX6q51tzI1ZRTACf8P_4jLqC4-uC0,5559
-crawlo/utils/project.py,sha256=FfBaMfxcau4yL59O-DfD7FAii8k6gXWQmQ1YU6aaUCE,1544
-crawlo/utils/request.py,sha256=BEBtxwejvQw5euEiSclHCbqkNcBwUMY7KPGGkvj8BjE,2936
-crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
-crawlo/utils/tools.py,sha256=U7xGKgXe2PmMyvNxZ1vlJYcv4Ei1WhIBMf8XcJZ7XCY,9764
-crawlo/utils/url.py,sha256=RKe_iqdjafsNcp-P2GVLYpsL1qbxiuZLiFc-SqOQkcs,1521
-crawlo-1.0.2.dist-info/METADATA,sha256=kzNv4kckIbSZCADon5g2Ik5QJhubNxWra0-aXX6bMug,1784
-crawlo-1.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-crawlo-1.0.2.dist-info/entry_points.txt,sha256=GD9PBhKQN83EaxPYtz7NhcGeZeh3bdr2jWbTixOs-lw,59
-crawlo-1.0.2.dist-info/top_level.txt,sha256=Dwuv-Y1aGSJD3mjFrCdNGQ8EHroMj7RgVcxDdcczx4k,7
-crawlo-1.0.2.dist-info/RECORD,,
{crawlo-1.0.2.dist-info → crawlo-1.0.4.dist-info}/WHEEL
File without changes

{crawlo-1.0.2.dist-info → crawlo-1.0.4.dist-info}/entry_points.txt
File without changes