crawlo 1.3.1__py3-none-any.whl → 1.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic; consult the registry's advisory page for this release for more details.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '1.3.1'
1
+ __version__ = '1.3.2'
@@ -28,8 +28,16 @@ class OffsiteMiddleware:
28
28
  创建中间件实例
29
29
  从爬虫设置中获取允许的域名列表
30
30
  """
31
- # 从爬虫设置中获取允许的域名
32
- allowed_domains = crawler.settings.get_list('ALLOWED_DOMAINS')
31
+ # 优先使用 Spider 实例的 allowed_domains,回退到全局设置中的 ALLOWED_DOMAINS
32
+ allowed_domains = []
33
+
34
+ # 检查当前爬虫实例是否有 allowed_domains 属性
35
+ if hasattr(crawler, 'spider') and crawler.spider and hasattr(crawler.spider, 'allowed_domains'):
36
+ allowed_domains = getattr(crawler.spider, 'allowed_domains', [])
37
+
38
+ # 如果 Spider 实例没有设置 allowed_domains,则从全局设置中获取
39
+ if not allowed_domains:
40
+ allowed_domains = crawler.settings.get_list('ALLOWED_DOMAINS')
33
41
 
34
42
  # 如果没有配置允许的域名,则禁用此中间件
35
43
  if not allowed_domains:
crawlo/mode_manager.py CHANGED
@@ -40,6 +40,45 @@ class ModeManager:
40
40
  'DOWNLOAD_DELAY': 1.0,
41
41
  }
42
42
 
43
+ @staticmethod
44
+ def get_distributed_settings(
45
+ redis_host: str = '127.0.0.1',
46
+ redis_port: int = 6379,
47
+ redis_password: Optional[str] = None,
48
+ redis_db: int = 0,
49
+ project_name: str = 'crawlo'
50
+ ) -> Dict[str, Any]:
51
+ """获取分布式模式配置"""
52
+ # 构建 Redis URL
53
+ if redis_password:
54
+ redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
55
+ else:
56
+ redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
57
+
58
+ return {
59
+ 'QUEUE_TYPE': 'redis',
60
+ 'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
61
+ 'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline',
62
+ 'REDIS_HOST': redis_host,
63
+ 'REDIS_PORT': redis_port,
64
+ 'REDIS_PASSWORD': redis_password,
65
+ 'REDIS_DB': redis_db,
66
+ 'REDIS_URL': redis_url,
67
+ 'PROJECT_NAME': project_name,
68
+ 'SCHEDULER_QUEUE_NAME': f'crawlo:{project_name}:queue:requests',
69
+ 'CONCURRENCY': 16,
70
+ 'MAX_RUNNING_SPIDERS': 10,
71
+ 'DOWNLOAD_DELAY': 1.0,
72
+ }
73
+
74
+ @staticmethod
75
+ def get_auto_settings() -> Dict[str, Any]:
76
+ """获取自动检测模式配置"""
77
+ # 默认使用内存队列和过滤器
78
+ settings = ModeManager.get_standalone_settings()
79
+ settings['QUEUE_TYPE'] = 'auto'
80
+ return settings
81
+
43
82
  def resolve_mode_settings(
44
83
  self,
45
84
  mode: str = 'standalone',
@@ -156,6 +156,11 @@ MONGO_USE_BATCH = False # 是否启用批量插入
156
156
  # FILTER_DEBUG = True
157
157
  # DECODE_RESPONSES = True
158
158
 
159
+ # ============================== 域名过滤配置 ==============================
160
+ # OffsiteMiddleware 配置,用于限制爬虫只爬取指定域名的页面
161
+ # 如需启用域名过滤功能,请取消注释并配置允许的域名列表
162
+ # ALLOWED_DOMAINS = ['example.com', 'www.example.com']
163
+
159
164
  # ============================== 用户自定义中间件配置 ==============================
160
165
  # 注意:框架默认中间件已自动加载,此处可添加或覆盖默认中间件
161
166
  # 如需启用代理功能,请取消注释 ProxyMiddleware 并配置代理相关参数
@@ -121,13 +121,18 @@ MONGO_USE_BATCH = True
121
121
  # 注意:框架已提供默认的去重配置,以下配置项通常无需修改
122
122
  # 如需自定义,请取消注释并修改相应值
123
123
 
124
- # 明确指定分布式模式下使用Redis去重管道
124
+ # 明确指定分布式模式下使用Redis去重管道和过滤器
125
125
  # DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
126
126
  # FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
127
127
  # REDIS_TTL = 0
128
128
  # CLEANUP_FP = 0
129
129
  # FILTER_DEBUG = True
130
130
 
131
+ # ============================== 域名过滤配置 ==============================
132
+ # OffsiteMiddleware 配置,用于限制爬虫只爬取指定域名的页面
133
+ # 如需启用域名过滤功能,请取消注释并配置允许的域名列表
134
+ # ALLOWED_DOMAINS = ['example.com', 'www.example.com']
135
+
131
136
  # ============================== 用户自定义中间件配置 ==============================
132
137
  # 注意:框架默认中间件已自动加载,此处可添加或覆盖默认中间件
133
138
 
@@ -1,43 +1,3 @@
1
- # -*- coding: UTF-8 -*-
2
- """
3
- 温和模式配置模板
4
- 低负载配置,对目标网站友好
5
- """
6
-
7
- # ============================== 项目基本信息 ==============================
8
- PROJECT_NAME = '{{project_name}}'
9
-
10
- # ============================== 温和运行模式 ==============================
11
- # 运行模式:'standalone'(单机), 'distributed'(分布式), 'auto'(自动检测)
12
- RUN_MODE = 'standalone' # 单机模式 - 适用于开发和小规模数据采集
13
-
14
- # 并发配置
15
- CONCURRENCY = 2 # 极低并发数以减少目标网站压力
16
- DOWNLOAD_DELAY = 3.0 # 增加延迟以降低目标网站压力
17
- RANDOMNESS = True # 启用随机延迟
18
- RANDOM_RANGE = (0.5, 2.0) # 随机延迟范围
19
-
20
- # ============================== 队列配置 ==============================
21
-
22
- # 队列类型:'auto'(自动选择), 'memory'(内存队列), 'redis'(分布式队列)
23
- QUEUE_TYPE = 'auto' # 自动检测,如果Redis可用则使用Redis队列
24
- SCHEDULER_MAX_QUEUE_SIZE = 500
25
- SCHEDULER_QUEUE_NAME = f'crawlo:{{project_name}}:queue:requests'
26
- QUEUE_MAX_RETRIES = 3
27
- QUEUE_TIMEOUT = 300
28
-
29
- # ============================== 去重过滤配置 ==============================
30
-
31
- # 温和模式下使用内存去重管道和过滤器
32
- DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
33
- FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
34
-
35
- # --- Redis 配置(用于分布式去重和队列) ---
36
- REDIS_HOST = '127.0.0.1'
37
- REDIS_PORT = 6379
38
- REDIS_PASSWORD = '' # 如果有密码,请填写
39
-
40
- # 根据是否有密码生成 URL
41
1
  if REDIS_PASSWORD:
42
2
  REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/0'
43
3
  else:
@@ -55,6 +15,11 @@ CLEANUP_FP = 0
55
15
  FILTER_DEBUG = True
56
16
  DECODE_RESPONSES = True
57
17
 
18
+ # ============================== 域名过滤配置 ==============================
19
+ # OffsiteMiddleware 配置,用于限制爬虫只爬取指定域名的页面
20
+ # 如需启用域名过滤功能,请取消注释并配置允许的域名列表
21
+ # ALLOWED_DOMAINS = ['example.com', 'www.example.com']
22
+
58
23
  # ============================== 用户自定义中间件配置 ==============================
59
24
  # 注意:框架默认中间件已自动加载,此处可添加或覆盖默认中间件
60
25
 
@@ -86,6 +86,11 @@ DECODE_RESPONSES = True
86
86
  # 'crawlo.extension.health_check.HealthCheckExtension', # 健康检查
87
87
  # ]
88
88
 
89
+ # ============================== 域名过滤配置 ==============================
90
+ # OffsiteMiddleware 配置,用于限制爬虫只爬取指定域名的页面
91
+ # 如需启用域名过滤功能,请取消注释并配置允许的域名列表
92
+ # ALLOWED_DOMAINS = ['example.com', 'www.example.com']
93
+
89
94
  # ============================== 日志配置 ==============================
90
95
 
91
96
  LOG_LEVEL = 'INFO'
@@ -21,6 +21,11 @@ PIPELINES = [
21
21
  'crawlo.pipelines.json_pipeline.JsonPipeline',
22
22
  ]
23
23
 
24
+ # ============================== 域名过滤配置 ==============================
25
+ # OffsiteMiddleware 配置,用于限制爬虫只爬取指定域名的页面
26
+ # 如需启用域名过滤功能,请取消注释并配置允许的域名列表
27
+ # ALLOWED_DOMAINS = ['example.com', 'www.example.com']
28
+
24
29
  # ============================== 日志配置 ==============================
25
30
  LOG_LEVEL = 'INFO'
26
31
  LOG_FILE = f'logs/{{project_name}}.log'
@@ -59,6 +59,11 @@ DOWNLOAD_DELAY = 1.0 # 增加延迟以降低目标网站压力
59
59
  # FILTER_DEBUG = True
60
60
  # DECODE_RESPONSES = True
61
61
 
62
+ # ============================== 域名过滤配置 ==============================
63
+ # OffsiteMiddleware 配置,用于限制爬虫只爬取指定域名的页面
64
+ # 如需启用域名过滤功能,请取消注释并配置允许的域名列表
65
+ # ALLOWED_DOMAINS = ['example.com', 'www.example.com']
66
+
62
67
  # ============================== 用户自定义中间件配置 ==============================
63
68
  # 注意:框架默认中间件已自动加载,此处可添加或覆盖默认中间件
64
69
 
@@ -5,18 +5,9 @@
5
5
  ============================
6
6
  基于 Crawlo 框架的简化爬虫启动器。
7
7
  """
8
-
9
8
  import sys
10
- import os
11
9
  import asyncio
12
10
 
13
- # 添加项目根目录到 Python 路径
14
- project_root = os.path.dirname(os.path.abspath(__file__))
15
- sys.path.insert(0, project_root)
16
-
17
- # 切换到项目根目录
18
- os.chdir(project_root)
19
-
20
11
  from crawlo.crawler import CrawlerProcess
21
12
 
22
13
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crawlo
3
- Version: 1.3.1
3
+ Version: 1.3.2
4
4
  Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
5
5
  Home-page: https://github.com/crawl-coder/Crawlo.git
6
6
  Author: crawl-coder
@@ -1,12 +1,12 @@
1
1
  crawlo/__init__.py,sha256=qZzTmb7hw5h_qcP2EYGUZcoSScxlKZFJ76CjSeS7UfA,1381
2
- crawlo/__version__.py,sha256=P2DFKJQEJRlJhF0IW0Lwt4G4uMYyJJ5ymhv-XrCcPGo,22
2
+ crawlo/__version__.py,sha256=FVIvqGrcsQXkMjh8L0_Hc4T61ShpSr0KRWouUASp_pM,22
3
3
  crawlo/cli.py,sha256=OXprmcTUbFK02ptw_Gq8Gk4-ZCU-WEMJgzU1ztgP6Bk,2327
4
4
  crawlo/config.py,sha256=dNoNyTkXLe2msQ7bZx3YTQItk1m49nIg5-g89FQDNwE,9486
5
5
  crawlo/config_validator.py,sha256=gsiLqf5swWd9ISDvoLqCdG7iSXr-ZdBPD4iT6ug1ua4,11239
6
6
  crawlo/crawler.py,sha256=Fiu9O_eFHKCfzgzFe0O9gpzWGyneY-imI8-9O4hiWqU,42608
7
7
  crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
8
8
  crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
9
- crawlo/mode_manager.py,sha256=IN3CsWW1pzKnzHxvULxJTTx3Vw2kxJilfeLuDWfxm4Q,4890
9
+ crawlo/mode_manager.py,sha256=QPVFZmsreysYAVxFRdtuknPqkwXM5mLtuLJxpPeI-sQ,6386
10
10
  crawlo/project.py,sha256=swSTcan4Ky7sYfCatpNLKsVxztmPkIVwjdo3u6dgcpI,11128
11
11
  crawlo/stats_collector.py,sha256=NkO09CB-220qz5rxFcD_dedGfr2VPFrDo4hya0Zh8Qc,1577
12
12
  crawlo/subscriber.py,sha256=D3hzE7Pc_zJjc-zR7lct5pt32bz6LsDYeC8uHlS4Hso,4986
@@ -52,7 +52,7 @@ crawlo/middleware/__init__.py,sha256=ldaGFNbiJnK9Fx12Vdf9fDNfzXxoETtShp5r-vodtw0
52
52
  crawlo/middleware/default_header.py,sha256=wQ7BrUHd-hRosFoKsReV9hwNNr_jwK6V0ZfxL6MOGrk,5032
53
53
  crawlo/middleware/download_delay.py,sha256=zt9R5g2HWErWA_MAOnGcw_D8l6HD769Kyaw-Hv-vcTc,3438
54
54
  crawlo/middleware/middleware_manager.py,sha256=9Sj9rrWK6R9NZq9eT38sWRGuBKLKfjSgEAxu-5NCWgU,6278
55
- crawlo/middleware/offsite.py,sha256=cR0nVAygxAGbkmyI8yqiRtrZWTRLWddUiToMYGmhrfs,4084
55
+ crawlo/middleware/offsite.py,sha256=R9e5haPpCs2Uw9Hm5MW93G4usRZ-DqTqa33eVMoaK-4,4557
56
56
  crawlo/middleware/proxy.py,sha256=NquB6tqHAgHs3-2_1_5220kJYfjNG5JyHRJyo_2j4wo,15636
57
57
  crawlo/middleware/request_ignore.py,sha256=xcyZ1c7r_HhbzR3r9pfjsLGW7L7FBVeYvlNt8cpP2wY,2577
58
58
  crawlo/middleware/response_code.py,sha256=-Aa9Mm9nJN-WdddN7iTanJRMA83_LYYgSEz3XLQGvMo,4934
@@ -82,17 +82,17 @@ crawlo/settings/default_settings.py,sha256=VKaCb8JnHx_B-Zi3hN6Mt0QIdv0YRhFlhVEZ2
82
82
  crawlo/settings/setting_manager.py,sha256=4uuMpGVYzxjmQjvlGqfZ8hDaoSh34OAoL0LCATsMCkI,7512
83
83
  crawlo/spider/__init__.py,sha256=ZnSAL9PXLZSIH-Jdv-P6RuWmQUdukr8KPLQK6SXZZaU,20435
84
84
  crawlo/templates/crawlo.cfg.tmpl,sha256=9BAmwEibS5Tvy6HIcGXPb0BGeuesmibebmTW0iAEkmo,230
85
- crawlo/templates/run.py.tmpl,sha256=v_g-LQMYJ6pC8TZgyWj0yB2yTTKrwy9lEJufAYCXyxY,1228
85
+ crawlo/templates/run.py.tmpl,sha256=iICBXZAEkQnn2Z-72vBwnvYghBHK02u6I8uYncY-WPY,1033
86
86
  crawlo/templates/project/__init__.py.tmpl,sha256=f3ETIXw_O6K-lkL6lXM5znMPJW1FZYGFrwDs2BnHcnQ,58
87
87
  crawlo/templates/project/items.py.tmpl,sha256=mt1Mm--H2Ouos3r7JPkYh0r33rgYJf1YOMz0OZy8TYs,297
88
88
  crawlo/templates/project/middlewares.py.tmpl,sha256=T67p8j0laL4NJJ_3xzPM9yivgZRjTEMiEtEWLPwbkmw,4160
89
89
  crawlo/templates/project/pipelines.py.tmpl,sha256=GBHYU0Jx8sKDCdGJp44FMSH7u2slxoFg6a-R9Uwg_-I,2608
90
- crawlo/templates/project/settings.py.tmpl,sha256=K0WOyCJsiykbZjoZRhzmTVssoahETkYS2zb2q3Ai5Ts,9998
91
- crawlo/templates/project/settings_distributed.py.tmpl,sha256=BsSqtYl69NtFUTq-lDkXTzU7cwhqZhkna3x__pQx7oc,6692
92
- crawlo/templates/project/settings_gentle.py.tmpl,sha256=9S6l-v0yJOPDy3oxsCIpSDieXjxtHNPHhBqfGP28CG4,3975
93
- crawlo/templates/project/settings_high_performance.py.tmpl,sha256=zjlGESHvt3m_vsvAYDc5oE2Eui3eI9QEn3-uKFyHXpc,4706
94
- crawlo/templates/project/settings_minimal.py.tmpl,sha256=dFoz39BGkSzLDTwBN-mQh242SLzcP6g8MhI8Zk49jvw,909
95
- crawlo/templates/project/settings_simple.py.tmpl,sha256=iWQFaw1WxTJA2QF-kXH4nspDGuXvnGEAuqpGQHCfuew,4101
90
+ crawlo/templates/project/settings.py.tmpl,sha256=LBILzRqj5oOOCZdaoWvHBoc_0ZFgcbvHaJ58N2S0d28,10296
91
+ crawlo/templates/project/settings_distributed.py.tmpl,sha256=WrIA5k8Je-o4BX2sbv2YffzdskSwNeptKAywNOFMNTM,7002
92
+ crawlo/templates/project/settings_gentle.py.tmpl,sha256=C2OB_dVLz2u1et5YNudOgl7PpF2lLgKiYoU-fzfvB3I,2710
93
+ crawlo/templates/project/settings_high_performance.py.tmpl,sha256=cjS-0Q6yx7nPhGr2uaFVTKcQQdsoBcDEkwtkbtNFOAQ,5004
94
+ crawlo/templates/project/settings_minimal.py.tmpl,sha256=sVs8eCeB2zGed1ttTWFKJOLzxWE9Gk3KV5Nx5e73e-0,1207
95
+ crawlo/templates/project/settings_simple.py.tmpl,sha256=p6FWlTgLkiYiMbFolTWwS3BAAuR3rfCQrVB-55ve-Sk,4399
96
96
  crawlo/templates/project/spiders/__init__.py.tmpl,sha256=j_YKsw6HQMJyqlk3WUouP3bsr-XVxshRoSNakHBc00g,106
97
97
  crawlo/templates/spider/spider.py.tmpl,sha256=jMhzyxpIpV_KigB-pmN-5mGMiYtu4mfQIOvpZcCGGJI,5055
98
98
  crawlo/tools/__init__.py,sha256=8igeUXLD0vJ5ta2X91QyTvna6dOioKCn0z7EF4oHvHI,3942
@@ -212,8 +212,8 @@ tests/test_tools.py,sha256=9t9FXZ61MfdB70nck9NYzCq97yd3SLVlLiMybEAlClk,5345
212
212
  tests/test_user_agents.py,sha256=rUotyuE2iJDi2LQBrUh980U-dAMTs4ARPMJxICOoQFY,3231
213
213
  tests/tools_example.py,sha256=MtIypR-OFiWwi-skurwmq4fM0cGTt-GUX4hSekYs7BY,7739
214
214
  tests/verify_distributed.py,sha256=krnYYA5Qx9xXDMWc9YF5DxPSplGvawDg2n0l-3CAqoM,3928
215
- crawlo-1.3.1.dist-info/METADATA,sha256=6m0OV78Iso8DwQFiIB0_mWkYSDymuhjw69F9aVKwtCo,26813
216
- crawlo-1.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
217
- crawlo-1.3.1.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
218
- crawlo-1.3.1.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
219
- crawlo-1.3.1.dist-info/RECORD,,
215
+ crawlo-1.3.2.dist-info/METADATA,sha256=VOvsyZ5e2RkIdzSCj9NPKOpqKG6SJ9UXleWwUSQqyxE,26813
216
+ crawlo-1.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
217
+ crawlo-1.3.2.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
218
+ crawlo-1.3.2.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
219
+ crawlo-1.3.2.dist-info/RECORD,,
File without changes