crawlo-1.4.3-py3-none-any.whl → crawlo-1.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crawlo has been flagged as potentially problematic.

Files changed (30):
  1. crawlo/__version__.py +1 -1
  2. crawlo/commands/genspider.py +52 -17
  3. crawlo/crawler.py +6 -0
  4. crawlo/queue/pqueue.py +2 -6
  5. crawlo/queue/queue_manager.py +1 -2
  6. crawlo/settings/default_settings.py +11 -30
  7. crawlo/templates/project/settings.py.tmpl +51 -65
  8. crawlo/templates/project/settings_distributed.py.tmpl +59 -67
  9. crawlo/templates/project/settings_gentle.py.tmpl +45 -40
  10. crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
  11. crawlo/templates/project/settings_minimal.py.tmpl +37 -26
  12. crawlo/templates/project/settings_simple.py.tmpl +45 -40
  13. crawlo/templates/run.py.tmpl +3 -7
  14. {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/METADATA +1 -1
  15. {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/RECORD +21 -27
  16. tests/test_multi_directory.py +68 -0
  17. tests/test_multiple_spider_modules.py +81 -0
  18. tests/test_spider_modules.py +85 -0
  19. examples/test_project/__init__.py +0 -7
  20. examples/test_project/run.py +0 -35
  21. examples/test_project/test_project/__init__.py +0 -4
  22. examples/test_project/test_project/items.py +0 -18
  23. examples/test_project/test_project/middlewares.py +0 -119
  24. examples/test_project/test_project/pipelines.py +0 -97
  25. examples/test_project/test_project/settings.py +0 -170
  26. examples/test_project/test_project/spiders/__init__.py +0 -10
  27. examples/test_project/test_project/spiders/of_week_dis.py +0 -144
  28. {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/WHEEL +0 -0
  29. {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/entry_points.txt +0 -0
  30. {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/top_level.txt +0 -0
@@ -6,75 +6,79 @@
 Suitable for quick starts and simple projects.
 """

-# ============================== Basic project info ==============================
+# =================================== Basic configuration ===================================
+
+# Basic project info
 PROJECT_NAME = '{{project_name}}'

-# ============================== Run mode ==============================
+# Run mode
 RUN_MODE = 'standalone'

-# ============================== Concurrency settings ==============================
+# Concurrency settings
 CONCURRENCY = 4
 MAX_RUNNING_SPIDERS = 1
 DOWNLOAD_DELAY = 1.0

-# ============================== Downloader settings ==============================
-# Available downloaders:
-# DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
-# DOWNLOADER = 'crawlo.downloader.httpx_downloader.HttpXDownloader'
-# DOWNLOADER = 'crawlo.downloader.cffi_downloader.CurlCffiDownloader'
+# =================================== Core component configuration ===================================
+
+# Downloader settings
 DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'

-# ============================== Queue settings ==============================
+# Queue settings
 QUEUE_TYPE = 'memory'

-# ============================== Deduplication filter ==============================
+# Deduplication filter
 FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'

-# ============================== Default deduplication pipeline ==============================
+# Default deduplication pipeline
 DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'

-# ============================== Spider module settings ==============================
-SPIDER_MODULES = ['{{project_name}}.spiders']
+# =================================== Spider configuration ===================================

-# ============================== Middleware ==============================
-# MIDDLEWARES = [
-# 'crawlo.middleware.simple_proxy.SimpleProxyMiddleware',
-# ]
+# Spider module settings
+SPIDER_MODULES = ['{{project_name}}.spiders']

-# ============================== Default request header settings ==============================
+# Default request header settings
 # Configure default request headers for DefaultHeaderMiddleware
-DEFAULT_REQUEST_HEADERS = {
-    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
-    'Accept-Encoding': 'gzip, deflate, br',
-}
+# DEFAULT_REQUEST_HEADERS = {}

-# ============================== Allowed domains ==============================
+# Allowed domains
 # Configure allowed domains for OffsiteMiddleware
-# ALLOWED_DOMAINS = ['example.com']
+# ALLOWED_DOMAINS = []

-# ============================== Data pipelines ==============================
+# Data pipelines
+# To add custom pipelines, uncomment and extend the list below
 # PIPELINES = [
-# 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage (uses the asyncmy async library)
+# 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage (uses the asyncmy async library)
+# # '{{project_name}}.pipelines.CustomPipeline',  # Example user-defined pipeline
 # ]

-# ============================== Extensions ==============================
+# =================================== System configuration ===================================
+
+# Extensions
+# To add custom extensions, uncomment and extend the list below
 # EXTENSIONS = [
-# 'crawlo.extension.log_interval.LogIntervalExtension',
-# 'crawlo.extension.log_stats.LogStats',
-# 'crawlo.extension.logging_extension.CustomLoggerExtension',
+# # '{{project_name}}.extensions.CustomExtension',  # Example user-defined extension
+# ]
+
+# Middleware
+# To add custom middleware, uncomment and extend the list below
+# MIDDLEWARES = [
+# # '{{project_name}}.middlewares.CustomMiddleware',  # Example user-defined middleware
 # ]

-# ============================== Logging settings ==============================
+# Logging settings
 LOG_LEVEL = 'INFO'
 LOG_FILE = 'logs/{{project_name}}.log'
 LOG_ENCODING = 'utf-8'  # Explicitly set the log file encoding
 STATS_DUMP = True

-# ============================== Output settings ==============================
+# Output settings
 OUTPUT_DIR = 'output'

-# ============================== Redis settings ==============================
+# =================================== Database configuration ===================================
+
+# Redis settings
 REDIS_HOST = '127.0.0.1'
 REDIS_PORT = 6379
 REDIS_PASSWORD = ''
@@ -86,7 +90,7 @@ if REDIS_PASSWORD:
 else:
     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'

-# ============================== MySQL settings ==============================
+# MySQL settings
 MYSQL_HOST = '127.0.0.1'
 MYSQL_PORT = 3306
 MYSQL_USER = 'root'
@@ -96,7 +100,7 @@ MYSQL_TABLE = '{{project_name}}_data'
 MYSQL_BATCH_SIZE = 100
 MYSQL_USE_BATCH = False  # Whether to enable batch inserts

-# ============================== MongoDB settings ==============================
+# MongoDB settings
 MONGO_URI = 'mongodb://localhost:27017'
 MONGO_DATABASE = '{{project_name}}_db'
 MONGO_COLLECTION = '{{project_name}}_items'
@@ -105,7 +109,9 @@ MONGO_MIN_POOL_SIZE = 20
 MONGO_BATCH_SIZE = 100  # Number of documents per batch insert
 MONGO_USE_BATCH = False  # Whether to enable batch inserts

-# ============================== Proxy settings ==============================
+# =================================== Network configuration ===================================
+
+# Proxy settings
 # Proxy support is disabled by default; enable and configure it in the project settings if needed
 PROXY_ENABLED = False  # Whether to enable the proxy

@@ -124,7 +130,6 @@ PROXY_EXTRACTOR = "proxy"
 PROXY_REFRESH_INTERVAL = 60  # Proxy refresh interval (seconds)
 PROXY_API_TIMEOUT = 10  # Timeout for proxy API requests

-# ============================== Curl-Cffi specific settings ==============================
 # Browser fingerprint emulation (only effective with the CurlCffi downloader)
 CURL_BROWSER_TYPE = "chrome"  # Options: chrome, edge, safari, firefox, or a specific version such as chrome136

@@ -136,7 +141,7 @@ CURL_BROWSER_VERSION_MAP = {
     "firefox": "firefox135",
 }

-# ============================== Downloader optimization settings ==============================
+# Downloader optimization settings
 # Downloader health check
 DOWNLOADER_HEALTH_CHECK = True  # Whether to enable downloader health checks
 HEALTH_CHECK_INTERVAL = 60  # Health check interval (seconds)
@@ -157,7 +162,7 @@ AIOHTTP_FORCE_CLOSE = False  # Whether to force-close connections
 CONNECTION_TTL_DNS_CACHE = 300  # DNS cache TTL (seconds)
 CONNECTION_KEEPALIVE_TIMEOUT = 15  # Keep-Alive timeout (seconds)

-# ============================== Memory monitoring settings ==============================
+# Memory monitoring settings
 # The memory monitoring extension is disabled by default; enable it in the project settings if needed
 MEMORY_MONITOR_ENABLED = False  # Whether to enable memory monitoring
 MEMORY_MONITOR_INTERVAL = 60  # Memory monitoring check interval (seconds)
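
Taken together, the template hunks above fold the old one-banner-per-setting layout into a few broad sections (basic, core components, spider, system, database, network) and leave user-specific headers, domains, pipelines, middleware, and extensions as commented-out placeholders. SPIDER_MODULES stays a plain settings key that the framework now picks up by itself. A minimal, non-authoritative sketch of how a generated settings module gets loaded, using only the calls that appear in the new tests later in this diff ('myproject' is a placeholder package name):

    # Sketch only; 'myproject' stands in for a real generated project package.
    import myproject.settings as settings_module

    from crawlo.settings.setting_manager import SettingManager
    from crawlo.crawler import CrawlerProcess

    settings = SettingManager()
    settings.set_settings(settings_module)       # load the generated settings.py

    process = CrawlerProcess(settings=settings)  # SPIDER_MODULES is read automatically
    print(process.get_spider_names())            # spiders discovered under 'myproject.spiders'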
@@ -6,7 +6,7 @@
 A simplified spider launcher based on the Crawlo framework.

 The framework automatically imports and registers spider modules; no manual imports are needed.
-Just specify the spider_modules argument and the framework will scan and import all spiders.
+The framework automatically reads the SPIDER_MODULES setting from settings.py.
 """
 import sys
 import asyncio
@@ -17,12 +17,8 @@ from crawlo.crawler import CrawlerProcess
 def main():
     """Main entry point: run the spider"""
     try:
-        # Specify the spider module paths; the framework imports and registers all spiders automatically
-        spider_modules = ['{{project_name}}.spiders']
-        process = CrawlerProcess(spider_modules=spider_modules)
-
-        # TODO: run the specified spider
-        asyncio.run(process.crawl('spider_name'))
+        # TODO: replace 'spider_name' with the name of the spider to run
+        asyncio.run(CrawlerProcess().crawl('spider_name'))

     except Exception as e:
         print(f"❌ Run failed: {e}")
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.4.3
+Version: 1.4.4
 Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder
@@ -1,9 +1,9 @@
 crawlo/__init__.py,sha256=2Io5P9qJghOAjjD3YWdaiIq5laPLyLWVkEqgiVfUa3o,2381
-crawlo/__version__.py,sha256=adPldP4cMp2T8pbTFGYTXV50hu_smS3hxWkk5kLXpZE,23
+crawlo/__version__.py,sha256=2ik6wvURqg571WApVvR_ELhg_eclmC_WvbDLEPmoO4Q,23
 crawlo/cli.py,sha256=AQnAB5NMI-Ic1VPw_Jjng8L4AI4-wMozOwzE6CfXkZU,2402
 crawlo/config.py,sha256=EQIT7WpkXAlr2ocd5SYJYOKTSWUlQx2AkTHX7ErEWxw,9798
 crawlo/config_validator.py,sha256=oY4-2bwXUlwHAnGgkI-EznviDfML_dcxbWSGXNSxC2k,11516
-crawlo/crawler.py,sha256=i4rc9beEOilKGK633nRh5UxCNgciil9Lyfj38xgIauU,26998
+crawlo/crawler.py,sha256=E-fgYVtx6v2xEPixlQeWfNYVbW1oeWE0fQFZTQ6_K-I,27305
 crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
 crawlo/exceptions.py,sha256=YVIDnC1bKSMv3fXH_6tinWMuD9HmKHIaUfO4_fkX5sY,1247
 crawlo/framework.py,sha256=9gP6VN4MHqutGXaxnwpNMSULfVYbNp906UdZiJGywlQ,9458
@@ -14,7 +14,7 @@ crawlo/subscriber.py,sha256=h8fx69NJZeWem0ZkCmfHAi2kgfDGFObHpwN0aGNUM6Y,5115
 crawlo/task_manager.py,sha256=I9h3Rl0VRAfwqp24CHT3TuEAapNdTbVghkmuJhtM7jg,5966
 crawlo/commands/__init__.py,sha256=orvY6wLOBwGUEJKeF3h_T1fxj8AaQLjngBDd-3xKOE4,392
 crawlo/commands/check.py,sha256=TKDhI_sj7kErgiJpt2vCZ9QL-g6yWjrrPWKbgh8pgEU,23199
-crawlo/commands/genspider.py,sha256=7YGZdv12G341SWmkGbyDeMde2RgqGYxYXRExFy7KKNc,5088
+crawlo/commands/genspider.py,sha256=JB4ZuFpKsYwtjx3DSsxugH7e3kqxhDWPG5ZKfvM0isI,6041
 crawlo/commands/help.py,sha256=8xPC0iNCg1rRBoK2bb6noAEANc1JwrdM35eF-j6yeZM,5111
 crawlo/commands/list.py,sha256=trzcd3kG6DhkOqYZADcl3yR7M8iJBgRw5fE-g9e0gVM,5877
 crawlo/commands/run.py,sha256=EjpIilgCTkXGVSV4rEISbJubdhqrok9nNe5-xDfDK5E,13169
@@ -92,26 +92,26 @@ crawlo/pipelines/mysql_pipeline.py,sha256=Kjgu6cks1KD4FPXwlTnFaos2LG-N8LLaBDyKZ_
 crawlo/pipelines/pipeline_manager.py,sha256=R6MRb5d-caOit7PZoglJLHa3qQ68U5YAQlwt8KcjRxo,4393
 crawlo/pipelines/redis_dedup_pipeline.py,sha256=RB1kXLr8ZuWNrgZKYwt--tlmnWsQTbuwTsSt3pafol8,6077
 crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlo/queue/pqueue.py,sha256=Q4H2ePag7-pBjzJ0a4S1P4z_IT_G08T6l1uZBkuxO5A,1262
-crawlo/queue/queue_manager.py,sha256=Y2K4P-il5ACdP72X5jgTT-65Fu94_kQZ7H4c7AdRt-c,21547
+crawlo/queue/pqueue.py,sha256=bbgd3l1VfqYXfz-4VFaiWLmJit1LdB3qHalCtNqyrqI,1210
+crawlo/queue/queue_manager.py,sha256=8rKygMxr6DgSjnGsKFmvlTI5XAARvQIN_ENkAruHGXs,21532
 crawlo/queue/redis_priority_queue.py,sha256=vLvg2toKaRrXD1QyEdu1ZjTmANv7clFaBF7mCtstBmI,15995
 crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
-crawlo/settings/default_settings.py,sha256=DdoMvB7M_OyGKlBOr0lY-T5TN4V1bo6PlPvu4ZcQXnY,12836
+crawlo/settings/default_settings.py,sha256=IKh2eZ9WWXkAbHx5K5KX0whNtumATRpZ7ifFPZJFfBk,11827
 crawlo/settings/setting_manager.py,sha256=yI1tGaludevxKGGZO3Pn4aYofrg2cwYwvMZCFC5PPZw,8595
 crawlo/spider/__init__.py,sha256=QGhe_yNsnfnCF3G9nSoWEw23b8SkP5oSFU5W79C5DzI,21881
 crawlo/templates/crawlo.cfg.tmpl,sha256=lwiUVe5sFixJgHFEjn1OtbAeyWsECOrz37uheuVtulk,240
-crawlo/templates/run.py.tmpl,sha256=Mt4gcw7RJu9ri4eBRnsAzf0EDt2giFWzpX36OTqEUpQ,961
+crawlo/templates/run.py.tmpl,sha256=g8yst2hkqhKGNotR33fDxwmEsX6aEvhrXY_cfYos_vc,788
 crawlo/templates/spiders_init.py.tmpl,sha256=p6UK8KWr8FDydNxiAh6Iz29MY5WmgXIkf2z-buOGhOM,354
 crawlo/templates/project/__init__.py.tmpl,sha256=aQnHaOjMSkTviOC8COUX0fKymuyf8lx2tGduxkMkXEE,61
 crawlo/templates/project/items.py.tmpl,sha256=8_3DBA8HrS2XbfHzsMZNJiZbFY6fDJUUMFoFti_obJk,314
 crawlo/templates/project/middlewares.py.tmpl,sha256=fxHqi-Sjec5GiHJciprOU-6SAUTzM728NlZckIqf9hM,4278
 crawlo/templates/project/pipelines.py.tmpl,sha256=j9oqEhCezmmHlBhMWgYtlgup4jhWnMlv6AEiAOHODkg,2704
-crawlo/templates/project/settings.py.tmpl,sha256=gGYwPf-RCuw1pWyBZRUvL1BHzgoLYcIQ1sXUmWDGp1k,7050
-crawlo/templates/project/settings_distributed.py.tmpl,sha256=YSN7YcA05MT2yO5mg5MpryVyeEDhcaJEp9TU2NAUPS8,7147
-crawlo/templates/project/settings_gentle.py.tmpl,sha256=HS1qumqDciXFBTZYm-RS__ldphDVbuo6Poz8pTCHNEg,6797
-crawlo/templates/project/settings_high_performance.py.tmpl,sha256=u6n7S0KzSgJU23pkRifXnsS7AdQplJXefRyeG51v_QI,6887
-crawlo/templates/project/settings_minimal.py.tmpl,sha256=GJG44CJlop0_kDwtqhbKJlG_42VThHfICGVUNijkkRs,2550
-crawlo/templates/project/settings_simple.py.tmpl,sha256=fmy5PgrS_uVlXnIwXQFQ1Q28Ls121lbVg21Dbxrn6B4,6649
+crawlo/templates/project/settings.py.tmpl,sha256=mL9_JAyz8R35r-ywRHi4T-dtal7oczU5kodEWxldw40,5265
+crawlo/templates/project/settings_distributed.py.tmpl,sha256=RHzfWZITv-0ErCR9OYEswAZHpA5d9fYil0ZoGCtFt8g,5459
+crawlo/templates/project/settings_gentle.py.tmpl,sha256=pmjrBLjnpGcR90RkcJrM5O8PsTrRhUB92QR3R4TJyko,5733
+crawlo/templates/project/settings_high_performance.py.tmpl,sha256=9QhXSzfxIsMPyq0kZY9h2YBllyXGpGE37bMEbSrs_Ag,5823
+crawlo/templates/project/settings_minimal.py.tmpl,sha256=1qUPhSdHtvLSHTpytUJ8K63sMROhTwkz8e4tVg1fYoM,2222
+crawlo/templates/project/settings_simple.py.tmpl,sha256=sIyrCIVXsHSKl8Yjj8HkGs-ppMFH26a5yp6egVNlT2Q,5585
 crawlo/templates/project/spiders/__init__.py.tmpl,sha256=llhcIItXpm0TlEeumeLwp4fcYv2NHl8Iru7tLhDhxiE,216
 crawlo/templates/spider/spider.py.tmpl,sha256=KvU-9YpN6MifDE7XzejjyyQS7RUjLDLZ8zqJcLwSsu0,5198
 crawlo/tools/__init__.py,sha256=tOYfYPvZlrO8cmvnMWBjTma6UTLTFZN3qdC8pJwHrzI,4142
@@ -151,15 +151,6 @@ crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
 crawlo/utils/tools.py,sha256=uy7qw5Z1BIhyEgiHENvtM7WoGCJxlS8EX3PmOA7ouCo,275
 crawlo/utils/url.py,sha256=RKe_iqdjafsNcp-P2GVLYpsL1qbxiuZLiFc-SqOQkcs,1521
 examples/__init__.py,sha256=NkRbV8_S1tb8S2AW6BE2U6P2-eGOPwMR1k0YQAwQpSE,130
-examples/test_project/__init__.py,sha256=BQ6FVVDjB00-Vyib8h2I3P-LV5tjsFzTSLC2rFDe7Gw,136
-examples/test_project/run.py,sha256=HHMFVY8D9ouWytp-xNvFDKB8NphrZGuFbVrah6-afG8,953
-examples/test_project/test_project/__init__.py,sha256=RU5IUwU1oeDABAj3ZsLdMo53r3XG4saYIYgO1SXo13g,57
-examples/test_project/test_project/items.py,sha256=L_bnTuJLW7xtuja4AaqYuSsQvVdaqMunZWdh1OTz4HY,310
-examples/test_project/test_project/middlewares.py,sha256=fE7AUI2Yb9_ZwA4vC4ngn-s231FtEYeYWjXOwd5mkzE,4270
-examples/test_project/test_project/pipelines.py,sha256=7EDGrvuNH6JY0XINRwzgPmB-X4Ax6J790jtxbkL2O_U,2696
-examples/test_project/test_project/settings.py,sha256=imfrkrwSJ1V3jINe2ZhynlT4_w5bUBkeI9DFK7DS2g0,7115
-examples/test_project/test_project/spiders/__init__.py,sha256=Lx_To88ShpAR6Pyd9PUMjct690MZH0gETxz6knRqPRY,212
-examples/test_project/test_project/spiders/of_week_dis.py,sha256=-gDLaKGoF5birxCoLL_CX82bYopXjo4QmOV6a7I-Ci0,5178
 tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
 tests/advanced_tools_example.py,sha256=1_iitECKCuWUYMNNGo61l3lmwMRrWdA8F_Xw56UaGZY,9340
 tests/authenticated_proxy_example.py,sha256=fKmHXXxIxCJXjEplttCWRh7PZhbxkBSxJF91Bx-qOME,3019
@@ -260,6 +251,8 @@ tests/test_logging_system.py,sha256=LGfK14ZEWzRtl3_VkBGz-AaVa_dDtuC5zu40m8FvmMo,
 tests/test_middleware_debug.py,sha256=gtiaWCxBSTcaNkdqXirM7CsThr_HfiCueBdQCpp7rqg,4572
 tests/test_mode_change.py,sha256=GT53CBdxcG3-evcKz_OOfH4PBiq_oqQyuDjRXrvv1UU,2665
 tests/test_mode_consistency.py,sha256=t72WX0etC_AayaL2AT6e2lIgbfP-zxTgYAiTARSN2Jk,1276
+tests/test_multi_directory.py,sha256=sH9Y3B-fuESlc7J1aICa-AlBcCW8HFR-Q5j2anUr8l0,2196
+tests/test_multiple_spider_modules.py,sha256=M0wPyQW7HMasbMIgn_R78wjZEj4A_DgqaGHp0qF9Y0c,2567
 tests/test_offsite_middleware.py,sha256=njpXTdngOqBs60Wj6xgo5EEXlJnMHd7vtYGi9dVauW0,10602
 tests/test_offsite_middleware_simple.py,sha256=4MfDKSXGHcoFLYnnxCH2rmnzztWyN0xByYLoHtepyiA,7918
 tests/test_parsel.py,sha256=wuZqRFIm9xx1tt6o3Xi_OjvwhT_MPmHiUEj2ax06zlo,701
@@ -306,6 +299,7 @@ tests/test_retry_middleware_realistic.py,sha256=Sam5y4jCN8oeElU4xxeS5zjAyzS-P8si
 tests/test_scheduler.py,sha256=1fCu35QgK5gzgrhD0aUZj5lxL0QbokzPav-yEJxz9Ig,8182
 tests/test_scheduler_config_update.py,sha256=LuxjEbt20QrPyVkjSFxvTnFtUxwMaHB6TcqjFyo8bow,4261
 tests/test_simple_response.py,sha256=_ui2PuVZvJcAuLY7HZ8xcsy_tDBimgBqX0ukj3kE5J0,1549
+tests/test_spider_modules.py,sha256=wxPs28FtpGnQTemMY6r7WxVrwYo3bHnAd5dq94qj1K4,2797
 tests/test_telecom_spider_redis_key.py,sha256=c-gfixPul2VlYMQJGf0H5ZgYJ461fQgSKbCPrbAU45M,7625
 tests/test_template_content.py,sha256=2RgCdOA3pMUSOqC_JbTGeW7KonbTqJ0ySYJNWegU-v0,2903
 tests/test_template_redis_key.py,sha256=99-s0_-8MFJbIvGG_X__sH0qkXWTtJv8fdTdlftsq4I,4876
@@ -319,8 +313,8 @@ tests/verify_distributed.py,sha256=0IolM4ymuPOz_uTfHSWFO3Vxzp7Lo6i0zhSbzJhHFtI,4
 tests/verify_log_fix.py,sha256=7reyVl3MXTDASyChgU5BAYuzuxvFjSLG9HywAHso0qg,4336
 tests/scrapy_comparison/ofweek_scrapy.py,sha256=rhVds_WjYum1bLuWWe90HtXE51fZXEqhhPSc822ZasQ,5790
 tests/scrapy_comparison/scrapy_test.py,sha256=-IsGUHPBgEL0TmXjeLZl-TUA01B7Dsc2nRo4JZbFwZA,5599
-crawlo-1.4.3.dist-info/METADATA,sha256=PTbW7GP9xgBjFEX1adwjxeJHqM_x9VRU_ZLZsezaydU,4848
-crawlo-1.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-crawlo-1.4.3.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
-crawlo-1.4.3.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
-crawlo-1.4.3.dist-info/RECORD,,
+crawlo-1.4.4.dist-info/METADATA,sha256=LAg9xmMfxLUwVUGPqw_p48hGJYZqsRC9Mc4KqDroAUQ,4848
+crawlo-1.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+crawlo-1.4.4.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+crawlo-1.4.4.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+crawlo-1.4.4.dist-info/RECORD,,
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+"""
+Test support for multiple spider directories
+"""
+import sys
+import os
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.dirname(__file__))
+
+# Add ofweek_standalone to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'examples', 'ofweek_standalone'))
+
+def test_multiple_spider_directories():
+    """Test support for multiple spider directories"""
+    print("Testing support for multiple spider directories...")
+
+    # Import the settings
+    import examples.ofweek_standalone.ofweek_standalone.settings as settings_module
+
+    # Create the settings manager
+    from crawlo.settings.setting_manager import SettingManager
+    settings = SettingManager()
+    settings.set_settings(settings_module)
+
+    # Check the SPIDER_MODULES setting
+    spider_modules = settings.get('SPIDER_MODULES')
+    print(f"SPIDER_MODULES setting: {spider_modules}")
+
+    # Create a CrawlerProcess instance
+    from crawlo.crawler import CrawlerProcess
+    process = CrawlerProcess(settings=settings)
+
+    # Check whether spiders were registered
+    spider_names = process.get_spider_names()
+    print(f"Registered spiders: {spider_names}")
+
+    # Verify that the expected spiders are registered
+    expected_spiders = ['of_week_standalone', 'test_spider']
+    registered_spiders = []
+
+    for spider_name in expected_spiders:
+        if spider_name in spider_names:
+            print(f"✅ Success: spider '{spider_name}' is registered")
+            registered_spiders.append(spider_name)
+        else:
+            print(f"❌ Failure: spider '{spider_name}' was not found")
+
+    if len(registered_spiders) == len(expected_spiders):
+        print(f"🎉 All spiders were registered successfully!")
+        return True
+    else:
+        print(f"⚠️ Some spiders were not registered: {set(expected_spiders) - set(registered_spiders)}")
+        return False
+
+
+if __name__ == '__main__':
+    print("Starting tests for multiple spider directory support...\n")
+
+    success = test_multiple_spider_directories()
+
+    if success:
+        print("\n🎉 Test passed!")
+        sys.exit(0)
+    else:
+        print("\n❌ Test failed!")
+        sys.exit(1)
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+"""
+Test support for multiple SPIDER_MODULES directories
+"""
+import sys
+import os
+import asyncio
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.dirname(__file__))
+
+# Add ofweek_standalone to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'examples', 'ofweek_standalone'))
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.spider import get_spider_names
+
+
+def test_multiple_spider_modules():
+    """Test support for multiple SPIDER_MODULES directories"""
+    print("Testing support for multiple SPIDER_MODULES directories...")
+
+    # Simulate a SPIDER_MODULES setting containing multiple directories
+    spider_modules = ['ofweek_standalone.spiders', 'ofweek_standalone.new_spiders']
+
+    # Create a CrawlerProcess instance
+    process = CrawlerProcess(spider_modules=spider_modules)
+
+    # Check whether spiders were registered
+    spider_names = process.get_spider_names()
+    print(f"Registered spiders: {spider_names}")
+
+    # Verify that the expected spider is registered
+    expected_spider = 'of_week_standalone'
+    if expected_spider in spider_names:
+        print(f"✅ Success: spider '{expected_spider}' is registered")
+        return True
+    else:
+        print(f"❌ Failure: spider '{expected_spider}' was not found")
+        return False
+
+
+def test_settings_with_multiple_spider_modules():
+    """Test configuring multiple SPIDER_MODULES directories in settings"""
+    print("\nTesting multiple SPIDER_MODULES directories configured in settings...")
+
+    # Create a mock settings object
+    class MockSettings:
+        def get(self, key, default=None):
+            if key == 'SPIDER_MODULES':
+                return ['ofweek_standalone.spiders', 'ofweek_standalone.new_spiders']
+            return default
+
+    settings = MockSettings()
+
+    # Create a CrawlerProcess instance
+    process = CrawlerProcess(settings=settings)
+
+    # Check whether spiders were registered
+    spider_names = process.get_spider_names()
+    print(f"Registered spiders: {spider_names}")
+
+    return True
+
+
+if __name__ == '__main__':
+    print("Starting tests for multiple SPIDER_MODULES directory support...\n")
+
+    # Test passing multiple spider_modules explicitly
+    success1 = test_multiple_spider_modules()
+
+    # Test reading multiple spider_modules from settings
+    success2 = test_settings_with_multiple_spider_modules()
+
+    if success1 and success2:
+        print("\n🎉 All tests passed!")
+        sys.exit(0)
+    else:
+        print("\n❌ Some tests failed!")
+        sys.exit(1)
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+"""
+Test automatic reading of the SPIDER_MODULES setting
+"""
+import sys
+import os
+import asyncio
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.dirname(__file__))
+
+# Add ofweek_standalone to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'examples', 'ofweek_standalone'))
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.spider import get_spider_names
+
+
+def test_spider_modules_auto_discovery():
+    """Test automatic reading of the SPIDER_MODULES setting"""
+    print("Testing automatic reading of the SPIDER_MODULES setting...")
+
+    # Import the settings
+    import examples.ofweek_standalone.ofweek_standalone.settings as settings_module
+
+    # Create the settings manager
+    from crawlo.settings.setting_manager import SettingManager
+    settings = SettingManager()
+    settings.set_settings(settings_module)
+
+    # Create a CrawlerProcess instance without passing spider_modules explicitly
+    process = CrawlerProcess(settings=settings)
+
+    # Check whether spiders were registered automatically
+    spider_names = process.get_spider_names()
+    print(f"Registered spiders: {spider_names}")
+
+    # Verify that the expected spider is registered
+    expected_spider = 'of_week_standalone'
+    if expected_spider in spider_names:
+        print(f"✅ Success: spider '{expected_spider}' was registered automatically")
+        return True
+    else:
+        print(f"❌ Failure: spider '{expected_spider}' was not found")
+        return False
+
+
+def test_crawler_process_with_explicit_spider_modules():
+    """Test passing the spider_modules argument explicitly"""
+    print("\nTesting passing the spider_modules argument explicitly...")
+
+    # Pass the spider_modules argument explicitly
+    spider_modules = ['ofweek_standalone.spiders']
+    process = CrawlerProcess(spider_modules=spider_modules)
+
+    # Check whether spiders were registered
+    spider_names = process.get_spider_names()
+    print(f"Registered spiders: {spider_names}")
+
+    # Verify that the expected spider is registered
+    expected_spider = 'of_week_standalone'
+    if expected_spider in spider_names:
+        print(f"✅ Success: spider '{expected_spider}' is registered")
+        return True
+    else:
+        print(f"❌ Failure: spider '{expected_spider}' was not found")
+        return False
+
+
+if __name__ == '__main__':
+    print("Starting SPIDER_MODULES configuration tests...\n")
+
+    # Test auto-discovery
+    success1 = test_spider_modules_auto_discovery()
+
+    # Test passing the argument explicitly
+    success2 = test_crawler_process_with_explicit_spider_modules()
+
+    if success1 and success2:
+        print("\n🎉 All tests passed!")
+        sys.exit(0)
+    else:
+        print("\n❌ Some tests failed!")
+        sys.exit(1)
@@ -1,7 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time : 2025-09-27 23:36
-# @Author : crawl-coder
-# @Desc : None
-"""
@@ -1,35 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: UTF-8 -*-
-"""
-test_project project run script
-============================
-A simplified spider launcher based on the Crawlo framework.
-
-The framework automatically imports and registers spider modules; no manual imports are needed.
-Just specify the spider_modules argument and the framework will scan and import all spiders.
-"""
-import sys
-import asyncio
-
-from crawlo.crawler import CrawlerProcess
-
-
-def main():
-    """Main entry point: run the spider"""
-    try:
-        # Specify the spider module paths; the framework imports and registers all spiders automatically
-        spider_modules = ['test_project.spiders']
-        process = CrawlerProcess(spider_modules=spider_modules)
-
-        # TODO: run the specified spider
-        asyncio.run(process.crawl('of_week_dis'))
-
-    except Exception as e:
-        print(f"❌ Run failed: {e}")
-        import traceback
-        traceback.print_exc()
-        sys.exit(1)
-
-
-if __name__ == '__main__':
-    main()
@@ -1,4 +0,0 @@
-# -*- coding: UTF-8 -*-
-"""
-test_project project package
-"""
@@ -1,18 +0,0 @@
-# -*- coding: UTF-8 -*-
-"""
-test_project.items
-======================
-Define the data structures you scrape here.
-"""
-
-from crawlo.items import Item, Field
-
-
-class ExampleItem(Item):
-    """
-    An example item.
-    """
-    id = Field()
-    # price = Field()
-    # description = Field()
-    pass