crawlo 1.2.9__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__version__.py +1 -1
- crawlo/commands/run.py +26 -35
- crawlo/core/engine.py +1 -2
- crawlo/crawler.py +48 -53
- crawlo/extension/logging_extension.py +4 -2
- crawlo/middleware/middleware_manager.py +1 -1
- crawlo/mode_manager.py +37 -36
- crawlo/pipelines/pipeline_manager.py +13 -1
- crawlo/project.py +28 -34
- crawlo/settings/setting_manager.py +31 -19
- crawlo/utils/log.py +20 -61
- {crawlo-1.2.9.dist-info → crawlo-1.3.0.dist-info}/METADATA +1 -1
- {crawlo-1.2.9.dist-info → crawlo-1.3.0.dist-info}/RECORD +16 -16
- {crawlo-1.2.9.dist-info → crawlo-1.3.0.dist-info}/WHEEL +0 -0
- {crawlo-1.2.9.dist-info → crawlo-1.3.0.dist-info}/entry_points.txt +0 -0
- {crawlo-1.2.9.dist-info → crawlo-1.3.0.dist-info}/top_level.txt +0 -0
crawlo/__version__.py
CHANGED

@@ -1 +1 @@
- __version__ = '1.2.9'
+ __version__ = '1.3.0'

crawlo/commands/run.py
CHANGED

@@ -5,26 +5,27 @@
  # @Author : crawl-coder
  # @Desc : 命令行入口:crawlo run <spider_name>|all,用于运行指定爬虫。
  """
+ import os
  import sys
  import asyncio
  import configparser
- import os
- from pathlib import Path
  from importlib import import_module

+ from rich import box
  from rich.console import Console
  from rich.panel import Panel
+ from rich.progress import Progress, SpinnerColumn, TextColumn
  from rich.table import Table
  from rich.text import Text
- from rich import box
- from rich.progress import Progress, SpinnerColumn, TextColumn

+ from crawlo.commands.stats import record_stats
  from crawlo.crawler import CrawlerProcess
- from crawlo.utils.log import get_logger
  from crawlo.project import get_settings, _find_project_root
-
+ # 使用自定义日志系统
+ from crawlo.utils.log import get_logger

  logger = get_logger(__name__)
+
  console = Console()


@@ -77,6 +78,9 @@ def main(args):
  用法:
      crawlo run <spider_name>|all [--json] [--no-stats]
  """
+ # 添加调试信息
+ logger.debug("DEBUG: 进入main函数")
+
  if len(args) < 1:
      console.print("[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
      console.print("示例:")

@@ -187,21 +191,7 @@ def main(args):
  return 1

  # 显示即将运行的爬虫列表
-
- title=f"启动全部 {len(spider_names)} 个爬虫",
- box=box.ROUNDED,
- show_header=True,
- header_style="bold magenta"
- )
- table.add_column("名称", style="cyan")
- table.add_column("类名", style="green")
-
- for name in sorted(spider_names):
- cls = process.get_spider_class(name)
- table.add_row(name, cls.__name__)
-
- console.print(table)
- console.print()
+ # 根据用户要求,不再显示详细的爬虫列表信息

  # 注册 stats 记录(除非 --no-stats)
  if not no_stats:

@@ -260,20 +250,21 @@ def main(args):
  spider_class = process.get_spider_class(spider_name)

  # 显示启动信息
- (14 removed lines not captured in the diff view)
+ # 根据用户要求,不再显示项目启动信息
+ # if not show_json:
+ # info_table = Table(
+ # title=f"启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
+ # box=box.SIMPLE,
+ # show_header=False,
+ # title_style="bold green"
+ # )
+ # info_table.add_column("Key", style="yellow")
+ # info_table.add_column("Value", style="cyan")
+ # info_table.add_row("Project", project_package)
+ # info_table.add_row("Class", spider_class.__name__)
+ # info_table.add_row("Module", spider_class.__module__)
+ # console.print(info_table)
+ # console.print()

  # 注册 stats 记录
  if not no_stats:

crawlo/core/engine.py
CHANGED

@@ -75,8 +75,7 @@ class Engine(object):
  version = '1.0.0'
  # Change INFO level log to DEBUG level to avoid duplication with CrawlerProcess startup log
  self.logger.debug(
- f"Crawlo Started version {version}
- # f"(project name : {self.settings.get('PROJECT_NAME')})"
+ f"Crawlo Started version {version}"
  )

  async def start_spider(self, spider):

crawlo/crawler.py
CHANGED

@@ -21,7 +21,7 @@ Example Usage:
  # Single crawler run
  crawler = Crawler(MySpider, settings)
  await crawler.crawl()
-
+
  # Multi-crawler concurrent management
  process = CrawlerProcess()
  await process.crawl([Spider1, Spider2])

@@ -34,7 +34,6 @@ import threading
  from typing import Type, Optional, Set, List, Union, Dict, Any
  from .spider import Spider, get_global_spider_registry
  from .core.engine import Engine
- from .utils.log import get_logger
  from .subscriber import Subscriber
  from .extension import ExtensionManager
  from .stats_collector import StatsCollector

@@ -42,16 +41,9 @@ from .event import spider_opened, spider_closed
  from .settings.setting_manager import SettingManager
  from crawlo.project import merge_settings, get_settings

- #
-
-
-
- def _get_logger():
- """延迟获取logger实例,确保在配置加载后创建"""
- global logger
- if logger is None:
- logger = get_logger(__name__)
- return logger
+ # 使用自定义日志系统
+ from crawlo.utils.log import get_logger
+ logger = get_logger(__name__)


  class CrawlerContext:

@@ -110,7 +102,7 @@ class CrawlerContext:
  class Crawler:
  """
  Single crawler runtime instance, managing Spider and engine lifecycle
-
+
  Provides functionality:
  - Spider lifecycle management (initialization, running, closing)
  - Engine component coordination management

@@ -148,7 +140,7 @@ class Crawler:
  async def crawl(self):
  """
  Start the crawler core process
-
+
  Includes the following stages:
  1. Initialization stage: Create all components
  2. Validation stage: Check configuration and state

@@ -190,12 +182,12 @@
  # Update context status
  self.context.increment_completed()

-
+ logger.info(f"Spider {self.spider.name} completed, took {self._get_total_duration():.2f} seconds")

  except Exception as e:
  self._performance_metrics['error_count'] += 1
  self.context.increment_failed(str(e))
-
+ logger.error(f"Spider {getattr(self.spider, 'name', 'Unknown')} failed to run: {e}", exc_info=True)
  raise
  finally:
  self.context.decrement_active()

@@ -213,7 +205,7 @@
  else:
  spider_name = 'Unknown'

-
+ logger.info(f"Starting running {spider_name}")

  def _validate_crawler_state(self):
  """

@@ -233,7 +225,7 @@
  if not self.spider.name:
  raise ValueError("Spider name cannot be empty")

-
+ logger.debug(f"Spider {self.spider.name} state validation passed")

  def _get_total_duration(self) -> float:
  """Get total runtime"""

@@ -247,7 +239,7 @@
  if not self._closed:
  await self.close()
  except Exception as e:
-
+ logger.warning(f"Error cleaning up resources: {e}")

  def get_performance_metrics(self) -> Dict[str, Any]:
  """Get performance metrics"""

@@ -267,7 +259,7 @@
  def _create_spider(self) -> Spider:
  """
  Create and validate spider instance (enhanced version)
-
+
  Performs the following validations:
  - Spider name must exist
  - start_requests method must be callable

@@ -300,7 +292,7 @@

  # parse method check (warning instead of error)
  if not callable(getattr(spider, 'parse', None)):
-
+ logger.warning(
  f"Spider '{spider.name}' does not define 'parse' method.\n"
  f"Ensure all Requests specify a callback function, otherwise responses will be ignored."
  )

@@ -308,27 +300,29 @@
  # Set spider configuration
  self._set_spider(spider)

-
+ logger.debug(f"Spider '{spider.name}' initialized successfully")
  return spider

  def _create_engine(self) -> Engine:
  """Create and initialize engine"""
  engine = Engine(self)
  engine.engine_start()
-
+ logger.debug(f"Engine initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
  return engine

  def _create_stats(self) -> StatsCollector:
  """Create stats collector"""
  stats = StatsCollector(self)
-
+ logger.debug(
+ f"Stats collector initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
  return stats

  def _create_extension(self) -> ExtensionManager:
  """Create extension manager"""
  # Modify extension manager creation method, delay initialization until needed
  extension = ExtensionManager.create_instance(self)
-
+ logger.debug(
+ f"Extension manager initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
  return extension

  def _set_spider(self, spider: Spider):

@@ -343,12 +337,12 @@
  # Merge spider custom configuration
  merge_settings(spider, self.settings)

-
+ logger.debug(f"Spider '{spider.name}' configuration merged successfully")

  async def close(self, reason='finished') -> None:
  """
  Close crawler and clean up resources (enhanced version)
-
+
  Ensure closing only once and handle all cleanup operations
  """
  async with self._close_lock:

@@ -371,15 +365,15 @@
  from crawlo.commands.stats import record_stats
  record_stats(self)
  except ImportError:
-
+ logger.debug("Statistics recording module does not exist, skipping statistics recording")

-
+ logger.info(
  f"Spider '{getattr(self.spider, 'name', 'Unknown')}' closed, "
  f"reason: {reason}, took: {self._get_total_duration():.2f} seconds"
  )

  except Exception as e:
-
+ logger.error(f"Error closing crawler: {e}", exc_info=True)
  finally:
  # Ensure resource cleanup
  await self._cleanup_resources()

@@ -413,13 +407,13 @@
  if cleanup_tasks:
  await asyncio.gather(*cleanup_tasks, return_exceptions=True)

-
+ logger.debug("Resource cleanup completed")


  class CrawlerProcess:
  """
  Crawler process manager
-
+
  Supported features:
  - Multi-crawler concurrent scheduling and resource management
  - Automatic module discovery and spider registration

@@ -428,15 +422,15 @@ class CrawlerProcess:
  - Real-time status monitoring and statistics
  - Error recovery and retry mechanism
  - Large-scale crawler optimization support
-
+
  Usage example:
  # Basic usage
  process = CrawlerProcess()
  await process.crawl(MySpider)
-
+
  # Multi-crawler concurrency
  await process.crawl([Spider1, Spider2, 'spider_name'])
-
+
  # Custom concurrency
  process = CrawlerProcess(max_concurrency=8)
  """

@@ -563,7 +557,7 @@
  def auto_discover(modules: List[str]):
  """
  Automatically import modules, trigger Spider class definition and registration (enhanced version)
-
+
  Supports recursive scanning and error recovery
  """
  import importlib

@@ -617,7 +611,7 @@
  async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
  """
  Start one or more crawlers
-
+
  Enhanced features:
  - Intelligent concurrency control
  - Real-time monitoring and statistics

@@ -639,7 +633,7 @@
  await self.start_monitoring()

  try:
- # Phase 3:
+ # Phase 3: Initialize context and monitoring
  spider_classes_to_run.sort(key=lambda cls: cls.__name__.lower())

  logger.debug(

@@ -738,7 +732,7 @@
  ) -> List[Type[Spider]]:
  """
  Resolve input to spider class list
-
+
  Supports various input formats and validates uniqueness
  """
  inputs = self._normalize_inputs(spiders_input)

@@ -762,7 +756,8 @@
  seen_spider_names.add(spider_name)
  spider_classes.append(spider_cls)

- logger.debug(
+ logger.debug(
+ f"Spider resolved successfully: {item} -> {spider_cls.__name__} (name='{spider_name}')")

  except Exception as e:
  logger.error(f"Failed to resolve spider: {item} - {e}")

@@ -774,7 +769,7 @@
  def _normalize_inputs(spiders_input) -> List[Union[Type[Spider], str]]:
  """
  Normalize input to list
-
+
  Supports more input types and provides better error information
  """
  if isinstance(spiders_input, (type, str)):

@@ -793,7 +788,7 @@
  def _resolve_spider_class(self, item: Union[Type[Spider], str]) -> Type[Spider]:
  """
  Resolve single input item to spider class
-
+
  Provides better error prompts and debugging information
  """
  if isinstance(item, type) and issubclass(item, Spider):

@@ -820,7 +815,7 @@
  async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
  """
  Spider running function limited by semaphore
-
+
  Includes enhanced error handling and monitoring functionality
  """
  task = asyncio.current_task()

@@ -888,7 +883,7 @@
  def _shutdown(self, _signum, _frame):
  """
  Graceful shutdown signal handling
-
+
  Provides better shutdown experience and resource cleanup
  """
  signal_name = {signal.SIGINT: 'SIGINT', signal.SIGTERM: 'SIGTERM'}.get(_signum, str(_signum))

@@ -913,7 +908,7 @@
  async def _wait_for_shutdown(self):
  """
  Wait for all active tasks to complete
-
+
  Provides better shutdown time control and progress feedback
  """
  try:

@@ -967,15 +962,15 @@
  def _get_default_settings(cls) -> SettingManager:
  """
  Load default configuration
-
+
  Provides better error handling and fallback strategy
  """
  try:
  settings = get_settings()
-
+ logger.debug("Default configuration loaded successfully")
  return settings
  except Exception as e:
-
+ logger.warning(f"Unable to load default configuration: {e}, using empty configuration")
  return SettingManager()

  def _log_startup_info(self):

@@ -990,7 +985,7 @@
  # Build startup info log
  startup_info = [
- f"Crawlo Framework Started
+ f"Crawlo Framework Started {version}"
  ]

  # Get actual queue type

@@ -1018,7 +1013,7 @@
  else:
  startup_info.append(f"Run Mode: {run_mode}")

- # Print startup information
+ # Print startup information at INFO level
  for info in startup_info:
  logger.info(info)

@@ -1032,7 +1027,7 @@ def create_crawler_with_optimizations(
  ) -> Crawler:
  """
  Create an optimized crawler instance
-
+
  :param spider_cls: Spider class
  :param settings: Settings manager
  :param optimization_kwargs: Optimization parameters

@@ -1056,7 +1051,7 @@ def create_process_with_large_scale_config(
  ) -> CrawlerProcess:
  """
  Create a process manager that supports large-scale optimization
-
+
  :param config_type: Configuration type ('conservative', 'balanced', 'aggressive', 'memory_optimized')
  :param concurrency: Concurrency count
  :param kwargs: Other parameters

@@ -1100,4 +1095,4 @@ __all__ = [
  'CrawlerContext',
  'create_crawler_with_optimizations',
  'create_process_with_large_scale_config'
- ]
+ ]

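The docstring updated above keeps its usage examples for driving multiple crawlers. Below is a minimal, hedged sketch of calling CrawlerProcess from a run script, based only on that docstring; MySpider and the spider name string are hypothetical stand-ins for real project spiders.

    # Sketch based on the CrawlerProcess usage example in the docstring above.
    import asyncio

    from crawlo.crawler import CrawlerProcess
    from myproject.spiders.my_spider import MySpider  # hypothetical project spider


    async def main():
        # The docstring shows crawl() accepting classes, registered name strings,
        # or mixed lists, and CrawlerProcess taking max_concurrency.
        process = CrawlerProcess(max_concurrency=8)
        await process.crawl([MySpider, "another_spider_name"])


    if __name__ == "__main__":
        asyncio.run(main())
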
crawlo/extension/logging_extension.py
CHANGED

@@ -1,8 +1,10 @@
  from typing import Any
  from crawlo.exceptions import NotConfigured
- from crawlo.utils.log import get_logger
  from crawlo.utils.log import LoggerManager

+ # 使用全局logger,避免每个模块都创建自己的延迟初始化函数
+ logger = LoggerManager.get_logger(__name__)
+

  class CustomLoggerExtension:
  """

@@ -32,7 +34,7 @@ class CustomLoggerExtension:
  return cls(crawler.settings)

  def spider_opened(self, spider: Any) -> None:
- logger = get_logger(__name__)
+ logger = LoggerManager.get_logger(__name__)
  try:
  logger.info(
  f"CustomLoggerExtension: Logging initialized. "

crawlo/middleware/middleware_manager.py
CHANGED

@@ -133,4 +133,4 @@ class MiddlewareManager:
  def _validate_middleware_method(method_name, middleware) -> bool:
  method = getattr(type(middleware), method_name)
  base_method = getattr(BaseMiddleware, method_name)
- return False if method == base_method else True
+ return False if method == base_method else True

crawlo/mode_manager.py
CHANGED

@@ -19,36 +19,37 @@ from crawlo.utils.log import get_logger

  class RunMode(Enum):
  """运行模式枚举"""
- STANDALONE = "standalone"
+ STANDALONE = "standalone"  # 单机模式
  DISTRIBUTED = "distributed"  # 分布式模式
- AUTO = "auto"
+ AUTO = "auto"  # 自动检测模式


  class ModeManager:
  """运行模式管理器"""
-
+
  def __init__(self):
  self.logger = get_logger(self.__class__.__name__)
-
+
  @staticmethod
  def get_standalone_settings() -> Dict[str, Any]:
  """获取单机模式配置"""
  return {
  'QUEUE_TYPE': 'memory',
  'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
+ 'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
  'CONCURRENCY': 8,
  'MAX_RUNNING_SPIDERS': 1,
  'DOWNLOAD_DELAY': 1.0,
  'LOG_LEVEL': 'INFO',
  }
-
+
  @staticmethod
  def get_distributed_settings(
- (5 removed lines not captured in the diff view)
+ redis_host: str = '127.0.0.1',
+ redis_port: int = 6379,
+ redis_password: Optional[str] = None,
+ redis_db: int = 0,  # 添加 redis_db 参数
+ project_name: str = 'crawlo'
  ) -> Dict[str, Any]:
  """获取分布式模式配置"""
  # 构建 Redis URL,使用传入的 redis_db 参数

@@ -56,7 +57,7 @@ class ModeManager:
  redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
  else:
  redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
-
+
  return {
  'PROJECT_NAME': project_name,  # 添加项目名称到配置中
  'QUEUE_TYPE': 'redis',

@@ -74,7 +75,7 @@ class ModeManager:
  'DOWNLOAD_DELAY': 1.0,
  'LOG_LEVEL': 'INFO',
  }
-
+
  @staticmethod
  def get_auto_settings() -> Dict[str, Any]:
  """获取自动检测模式配置"""

@@ -86,28 +87,28 @@ class ModeManager:
  'DOWNLOAD_DELAY': 1.0,
  'LOG_LEVEL': 'INFO',
  }
-
+
  def resolve_mode_settings(
- (3 removed lines not captured in the diff view)
+ self,
+ mode: str = 'standalone',
+ **kwargs
  ) -> Dict[str, Any]:
  """
  解析运行模式并返回对应配置
-
+
  Args:
  mode: 运行模式 ('standalone', 'distributed', 'auto')
  **kwargs: 额外配置参数
-
+
  Returns:
  Dict[str, Any]: 配置字典
  """
  mode = RunMode(mode.lower())
-
+
  if mode == RunMode.STANDALONE:
  self.logger.info("使用单机模式 - 简单快速,适合开发和中小规模爬取")
  settings = self.get_standalone_settings()
-
+
  elif mode == RunMode.DISTRIBUTED:
  self.logger.info("使用分布式模式 - 支持多节点扩展,适合大规模爬取")
  settings = self.get_distributed_settings(

@@ -117,25 +118,25 @@ class ModeManager:
  redis_db=kwargs.get('redis_db', 0),  # 添加 redis_db 参数
  project_name=kwargs.get('project_name', 'crawlo')
  )
-
+
  elif mode == RunMode.AUTO:
  self.logger.info("使用自动检测模式 - 智能选择最佳运行方式")
  settings = self.get_auto_settings()
-
+
  else:
  raise ValueError(f"不支持的运行模式: {mode}")
-
+
  # 合并用户自定义配置
- user_settings = {k: v for k, v in kwargs.items()
-
+ user_settings = {k: v for k, v in kwargs.items()
+ if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
  settings.update(user_settings)
-
+
  return settings
-
+
  def from_environment(self) -> Dict[str, Any]:
  """从环境变量构建配置"""
  config = {}
-
+
  # 扫描 CRAWLO_ 前缀的环境变量
  for key, value in os.environ.items():
  if key.startswith('CRAWLO_'):

@@ -150,7 +151,7 @@ class ModeManager:
  config[config_key] = float(value)
  except ValueError:
  config[config_key] = value
-
+
  return config


@@ -161,12 +162,12 @@ def standalone_mode(**kwargs) -> Dict[str, Any]:


  def distributed_mode(
- (6 removed lines not captured in the diff view)
+ redis_host: str = '127.0.0.1',
+ redis_port: int = 6379,
+ redis_password: Optional[str] = None,
+ redis_db: int = 0,  # 添加 redis_db 参数
+ project_name: str = 'crawlo',
+ **kwargs
  ) -> Dict[str, Any]:
  """快速创建分布式模式配置"""
  return ModeManager().resolve_mode_settings(

@@ -190,7 +191,7 @@ def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
  """从环境变量创建配置"""
  # 移除直接使用 os.getenv(),要求通过 settings 配置
  raise RuntimeError("环境变量配置已移除,请在 settings 中配置相关参数")
-
+
  # 保留原有代码作为参考
  # mode = os.getenv('CRAWLO_MODE', default_mode).lower()
  #

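The hunks above add a redis_db parameter to the distributed-mode helpers and a DEFAULT_DEDUP_PIPELINE entry to the standalone defaults. A hedged sketch of calling the module-level distributed_mode() helper after this change follows; the values are placeholders, and the kwargs merge is assumed from the resolve_mode_settings() hunk.

    # Illustrative use of distributed_mode() as declared above (not framework docs).
    from crawlo.mode_manager import distributed_mode

    settings = distributed_mode(
        redis_host="127.0.0.1",
        redis_port=6379,
        redis_password=None,
        redis_db=3,                 # new in 1.3.0: selects the Redis database
        project_name="my_project",
    )

    # Per get_distributed_settings(), the Redis URL becomes redis://127.0.0.1:6379/3
    # (or redis://:password@host:port/db when a password is set), and the returned
    # dict carries QUEUE_TYPE='redis' plus PROJECT_NAME.
    print(settings.get("QUEUE_TYPE"), settings.get("PROJECT_NAME"))
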
crawlo/pipelines/pipeline_manager.py
CHANGED

@@ -4,7 +4,6 @@ from typing import List
  from pprint import pformat
  from asyncio import create_task

-
  from crawlo.utils.log import get_logger
  from crawlo.event import item_successful, item_discard
  from crawlo.project import load_class, common_call

@@ -20,6 +19,19 @@ class PipelineManager:

  self.logger = get_logger(self.__class__.__name__, self.crawler.settings.get('LOG_LEVEL'))
  pipelines = self.crawler.settings.get_list('PIPELINES')
+ dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE')
+
+ # 添加调试信息
+ self.logger.debug(f"PIPELINES from settings: {pipelines}")
+ self.logger.debug(f"DEFAULT_DEDUP_PIPELINE from settings: {dedup_pipeline}")
+
+ # 确保DEFAULT_DEDUP_PIPELINE被添加到管道列表开头
+ if dedup_pipeline:
+ # 移除所有去重管道实例(如果存在)
+ pipelines = [item for item in pipelines if item != dedup_pipeline]
+ # 在开头插入去重管道
+ pipelines.insert(0, dedup_pipeline)
+
  self._add_pipelines(pipelines)
  self._add_methods()

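The new constructor logic promotes whatever DEFAULT_DEDUP_PIPELINE names to the front of PIPELINES, so deduplication runs before any storage pipeline. A small standalone sketch of that reordering rule (mirroring the added lines, not the actual PipelineManager code):

    # Standalone sketch of the reordering added in PipelineManager.__init__ above.
    def put_dedup_first(pipelines, dedup_pipeline):
        if not dedup_pipeline:
            return list(pipelines)
        reordered = [p for p in pipelines if p != dedup_pipeline]  # drop duplicates of it
        reordered.insert(0, dedup_pipeline)                        # force it to index 0
        return reordered


    pipelines = [
        "myproject.pipelines.MySQLPipeline",  # hypothetical user pipeline
        "crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline",
    ]
    print(put_dedup_first(pipelines, "crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline"))
    # -> ['crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline', 'myproject.pipelines.MySQLPipeline']
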
crawlo/project.py
CHANGED

@@ -7,18 +7,10 @@ from inspect import iscoroutinefunction
  from typing import Callable, Optional, Any

  from crawlo.settings.setting_manager import SettingManager
- from crawlo.utils.log import get_logger
+ from crawlo.utils.log import get_logger, LoggerManager

- #
- logger =
-
-
- def _get_logger():
- """延迟获取logger实例,确保在配置加载后创建"""
- global logger
- if logger is None:
- logger = get_logger(__name__)
- return logger
+ # 使用全局logger,避免每个模块都创建自己的延迟初始化函数
+ logger = get_logger(__name__)


  def load_class(path: str) -> Any:

@@ -50,7 +42,7 @@ def merge_settings(spider, settings):
  spider_name = getattr(spider, 'name', 'UnknownSpider')
  # 检查 settings 是否为 SettingManager 实例
  if not hasattr(settings, 'update_attributes'):
-
+ logger.error(f"merge_settings 接收到的 settings 不是 SettingManager 实例: {type(settings)}")
  # 如果是字典,创建一个新的 SettingManager 实例
  if isinstance(settings, dict):
  from crawlo.settings.setting_manager import SettingManager

@@ -58,14 +50,14 @@ def merge_settings(spider, settings):
  new_settings.update_attributes(settings)
  settings = new_settings
  else:
-
+ logger.error("无法处理的 settings 类型")
  return

  if hasattr(spider, 'custom_settings'):
  custom_settings = getattr(spider, 'custom_settings')
  settings.update_attributes(custom_settings)
  else:
-
+ logger.debug(f"爬虫 '{spider_name}' 无 custom_settings,跳过合并")


  async def common_call(func: Callable, *args, **kwargs):

@@ -93,7 +85,7 @@ def _get_settings_module_from_cfg(cfg_path: str) -> str:
  config.read(cfg_path, encoding="utf-8")
  if config.has_section("settings") and config.has_option("settings", "default"):
  module_path = config.get("settings", "default")
-
+ logger.debug(f"📄 从 crawlo.cfg 加载 settings 模块: {module_path}")
  return module_path
  else:
  raise RuntimeError(f"配置文件缺少 [settings] 或 default 选项: {cfg_path}")

@@ -114,7 +106,7 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
  for root, dirs, files in os.walk(path):
  if "crawlo.cfg" in files:
  cfg_path = os.path.join(root, "crawlo.cfg")
-
+ logger.debug(f"✅ 找到项目配置文件: {cfg_path}")
  return root

  # 向上查找直到找到 crawlo.cfg 或包含 settings.py 和 __init__.py 的目录

@@ -130,20 +122,20 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
  # 检查 crawlo.cfg
  cfg_file = os.path.join(path, "crawlo.cfg")
  if os.path.isfile(cfg_file):
-
+ logger.debug(f"✅ 找到项目配置文件: {cfg_file}")
  return path

  # 检查 settings.py 和 __init__.py
  settings_file = os.path.join(path, "settings.py")
  init_file = os.path.join(path, "__init__.py")
  if os.path.isfile(settings_file) and os.path.isfile(init_file):
-
+ logger.debug(f"✅ 找到项目模块: {path}")
  # 即使找到了项目模块,也继续向上查找是否有 crawlo.cfg
  parent = os.path.dirname(path)
  if parent != path:
  parent_cfg = os.path.join(parent, "crawlo.cfg")
  if os.path.isfile(parent_cfg):
-
+ logger.debug(f"✅ 在上层目录找到项目配置文件: {parent_cfg}")
  return parent
  return path

@@ -167,19 +159,19 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:

  cfg_file = os.path.join(path, "crawlo.cfg")
  if os.path.isfile(cfg_file):
-
+ logger.debug(f"✅ 找到项目配置文件: {cfg_file}")
  return path

  settings_file = os.path.join(path, "settings.py")
  init_file = os.path.join(path, "__init__.py")
  if os.path.isfile(settings_file) and os.path.isfile(init_file):
-
+ logger.debug(f"✅ 找到项目模块: {path}")
  # 即使找到了项目模块,也继续向上查找是否有 crawlo.cfg
  parent = os.path.dirname(path)
  if parent != path:
  parent_cfg = os.path.join(parent, "crawlo.cfg")
  if os.path.isfile(parent_cfg):
-
+ logger.debug(f"✅ 在上层目录找到项目配置文件: {parent_cfg}")
  return parent
  return path

@@ -204,19 +196,19 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:

  cfg_file = os.path.join(path, "crawlo.cfg")
  if os.path.isfile(cfg_file):
-
+ logger.debug(f"找到项目配置文件: {cfg_file}")
  return path

  settings_file = os.path.join(path, "settings.py")
  init_file = os.path.join(path, "__init__.py")
  if os.path.isfile(settings_file) and os.path.isfile(init_file):
-
+ logger.debug(f"找到项目模块: {path}")
  # 即使找到了项目模块,也继续向上查找是否有 crawlo.cfg
  parent = os.path.dirname(path)
  if parent != path:
  parent_cfg = os.path.join(parent, "crawlo.cfg")
  if os.path.isfile(parent_cfg):
-
+ logger.debug(f"在上层目录找到项目配置文件: {parent_cfg}")
  return parent
  return path

@@ -227,7 +219,7 @@ def _find_project_root(start_path: str = ".") -> Optional[str]:
  except Exception:
  pass

-
+ logger.warning("未找到 Crawlo 项目根目录。请确保在包含 'crawlo.cfg' 或 'settings.py' 的目录运行。")
  return None


@@ -241,8 +233,7 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
  Returns:
  SettingManager: 已加载配置的实例
  """
-
- _get_logger().debug("🚀 正在初始化 Crawlo 项目配置...")
+ logger.debug("🚀 正在初始化 Crawlo 项目配置...")

  # 1. 查找项目根
  project_root = _find_project_root()

@@ -259,32 +250,35 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
  # 推断:项目目录名.settings
  project_name = os.path.basename(project_root)
  settings_module_path = f"{project_name}.settings"
-
+ logger.warning(f"⚠️ 未找到 crawlo.cfg,推断 settings 模块为: {settings_module_path}")

  # 3. 注入 sys.path
  project_root_str = os.path.abspath(project_root)
  if project_root_str not in sys.path:
  sys.path.insert(0, project_root_str)
-
+ logger.debug(f"📁 项目根目录已加入 sys.path: {project_root_str}")

  # 4. 加载 SettingManager
-
+ logger.debug(f"⚙️ 正在加载配置模块: {settings_module_path}")
  settings = SettingManager()

  try:
  settings.set_settings(settings_module_path)
-
+ logger.debug("✅ settings 模块加载成功")
  except Exception as e:
  raise ImportError(f"加载 settings 模块失败 '{settings_module_path}': {e}")

  # 5. 合并运行时配置
  if custom_settings:
  settings.update_attributes(custom_settings)
-
+ logger.debug(f"🔧 已应用运行时自定义配置: {list(custom_settings.keys())}")

  # 6. 显示核心配置摘要(INFO级别)
  # _log_settings_summary(settings)

+ # 配置日志系统
+ LoggerManager.configure(settings)
+
  # 将项目初始化完成的消息改为DEBUG级别
-
+ logger.debug("🎉 Crawlo 项目配置初始化完成!")
  return settings

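The key behavioral change above is that get_settings() now calls LoggerManager.configure(settings) once the settings module is loaded, so logging is wired up as a side effect of loading the project configuration. A hedged sketch of the typical call from a run script, assuming a standard layout with crawlo.cfg and a <project>/settings.py module:

    # Hedged sketch of invoking get_settings() after this change; the project
    # layout and the PROJECT_NAME key are assumptions about a typical project.
    from crawlo.project import get_settings
    from crawlo.utils.log import LoggerManager, get_logger

    # Runtime overrides are merged on top of the project's settings module.
    settings = get_settings({"LOG_LEVEL": "DEBUG"})

    # Per the diff, LoggerManager.configure(settings) has now run, so loggers
    # created afterwards pick up LOG_LEVEL / LOG_FILE from the settings.
    assert LoggerManager.is_configured()
    logger = get_logger(__name__)
    logger.debug("settings loaded for project %s", settings.get("PROJECT_NAME"))
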
crawlo/settings/setting_manager.py
CHANGED

@@ -15,38 +15,50 @@ class SettingManager(MutableMapping):
  self.set_settings(default_settings)
  # 在初始化时合并配置
  self._merge_config(values)
-
+
  def _merge_config(self, user_config):
  """合并默认配置和用户配置"""
  if not user_config:
  return
-
+
  # 合并中间件配置
  if 'MIDDLEWARES' in user_config:
  default_middlewares = self.attributes.get('MIDDLEWARES', [])
  user_middlewares = user_config['MIDDLEWARES']
-
-
+ # 如果用户配置了空列表,则仍然使用默认配置
+ if user_middlewares:
+ self.attributes['MIDDLEWARES'] = default_middlewares + user_middlewares
+
  # 合并管道配置
  if 'PIPELINES' in user_config:
  default_pipelines = self.attributes.get('PIPELINES', [])
  user_pipelines = user_config['PIPELINES']
- (10 removed lines not captured in the diff view)
+ # 如果用户配置了空列表,则仍然使用默认配置
+ if user_pipelines:
+ # 过滤掉空值和注释
+ user_pipelines = [pipeline for pipeline in user_pipelines if pipeline and not pipeline.strip().startswith('#')]
+ if user_pipelines:
+ self.attributes['PIPELINES'] = user_pipelines
+
+ # 特殊处理PIPELINES,确保去重管道在最前面
+ dedup_pipeline = self.attributes.get('DEFAULT_DEDUP_PIPELINE')
+ if dedup_pipeline:
+ pipelines = self.attributes.get('PIPELINES', [])
+ # 移除所有去重管道实例(如果存在)
+ pipelines = [item for item in pipelines if item != dedup_pipeline]
+ # 在开头插入去重管道
+ pipelines.insert(0, dedup_pipeline)
+ self.attributes['PIPELINES'] = pipelines
+
  # 合并扩展配置
  if 'EXTENSIONS' in user_config:
  default_extensions = self.attributes.get('EXTENSIONS', [])
  user_extensions = user_config['EXTENSIONS']
-
-
+ # 如果用户配置了空列表,则仍然使用默认配置
+ if user_extensions:
+ self.attributes['EXTENSIONS'] = default_extensions + user_extensions
+ # 如果用户没有配置扩展,则使用默认配置
+
  # 更新其他用户配置
  for key, value in user_config.items():
  if key not in ['MIDDLEWARES', 'PIPELINES', 'EXTENSIONS']:

@@ -147,7 +159,7 @@ class SettingManager(MutableMapping):
  # 创建一个新的实例
  cls = self.__class__
  new_instance = cls.__new__(cls)
-
+
  # 复制attributes字典,但排除不可pickle的对象
  new_attributes = {}
  for key, value in self.attributes.items():

@@ -157,8 +169,8 @@ class SettingManager(MutableMapping):
  except Exception:
  # 如果复制失败,保留原始引用(对于logger等对象)
  new_attributes[key] = value
-
+
  # 设置新实例的attributes
  new_instance.attributes = new_attributes
-
+
  return new_instance

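The merge rules above change in two ways: an empty user list no longer wipes out the framework defaults, and the configured dedup pipeline is always moved to the head of PIPELINES (with blank entries and "#" comments filtered out first). A standalone sketch of those rules, restated outside the real SettingManager class:

    # Illustrative re-statement of the PIPELINES merge rules added above.
    def merge_pipelines(defaults, user_pipelines, dedup_pipeline=None):
        # blanks and '#'-commented entries are dropped from the user list
        user_pipelines = [p for p in (user_pipelines or []) if p and not p.strip().startswith("#")]
        # an empty (or fully filtered-out) user list keeps the defaults
        merged = user_pipelines if user_pipelines else list(defaults)
        if dedup_pipeline:
            merged = [p for p in merged if p != dedup_pipeline]
            merged.insert(0, dedup_pipeline)  # dedup always runs first
        return merged


    defaults = ["crawlo.pipelines.json_pipeline.JsonPipeline"]
    print(merge_pipelines(defaults, [], "crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline"))
    # -> dedup pipeline first, then the default JSON pipeline
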
crawlo/utils/log.py
CHANGED

@@ -8,28 +8,20 @@ from logging import (
  INFO,
  getLevelName,
  )
- # 导入日志轮转处理器
- from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler

  LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'


  class LoggerManager:
+ """日志管理器,提供统一的日志配置和获取接口"""
  logger_cache = {}
  _default_filename = None
  _default_level = DEBUG  # 设置为最低级别,由handler控制实际输出
- _default_file_level = INFO
- _default_console_level = INFO
+ _default_file_level = INFO  # 默认为INFO级别
+ _default_console_level = INFO  # 默认为INFO级别
  _default_log_format = LOG_FORMAT
  _default_encoding = 'utf-8'
- #
- _default_log_max_bytes = 10 * 1024 * 1024  # 10MB
- _default_log_backup_count = 5
- _default_log_when = 'midnight'
- _default_log_interval = 1
- _default_log_use_rotation = False
- _default_log_rotation_type = 'size'  # 'size' or 'time'
- _default_log_rotation_suffix = None  # 轮转文件后缀格式
+ _configured = False  # 标记是否已配置

  @classmethod
  def _to_level(cls, level):

@@ -67,19 +59,12 @@
  get_val = settings.get if hasattr(settings, 'get') else (lambda k, d=None: kwargs.get(k, d))

  filename = get_val('LOG_FILE')
- level = get_val('LOG_LEVEL', '
- file_level = get_val('LOG_FILE_LEVEL',
-
+ level = get_val('LOG_LEVEL', 'INFO')  # 默认为INFO级别
+ file_level = get_val('LOG_FILE_LEVEL', level)  # 默认继承LOG_LEVEL的值
+ # 根据项目规范,已完全移除LOG_CONSOLE_LEVEL支持,统一使用LOG_LEVEL控制控制台和文件的日志输出级别
+ console_level = level  # 控制台日志级别直接使用LOG_LEVEL的值
  log_format = get_val('LOG_FORMAT', LOG_FORMAT)
  encoding = get_val('LOG_ENCODING', 'utf-8')
- # 获取日志轮转配置
- use_rotation = get_val('LOG_USE_ROTATION', False)
- rotation_type = get_val('LOG_ROTATION_TYPE', 'size')
- max_bytes = get_val('LOG_MAX_BYTES', cls._default_log_max_bytes)
- backup_count = get_val('LOG_BACKUP_COUNT', cls._default_log_backup_count)
- when = get_val('LOG_WHEN', cls._default_log_when)
- interval = get_val('LOG_INTERVAL', cls._default_log_interval)
- rotation_suffix = get_val('LOG_ROTATION_SUFFIX', cls._default_log_rotation_suffix)  # 轮转文件后缀

  cls._default_filename = filename
  cls._default_level = cls._to_level(level)

@@ -87,21 +72,13 @@
  cls._default_console_level = cls._to_level(console_level)
  cls._default_log_format = log_format
  cls._default_encoding = encoding
-
- cls.
- cls._default_log_rotation_type = rotation_type
- cls._default_log_max_bytes = max_bytes
- cls._default_log_backup_count = backup_count
- cls._default_log_when = when
- cls._default_log_interval = interval
- cls._default_log_rotation_suffix = rotation_suffix
-
- # 移除对根日志记录器级别的修改,避免副作用
+
+ cls._configured = True

  @classmethod
  def get_logger(cls, name='default', level=None, filename=None):
  """
-
+ 获取logger实例
  """
  # 确定最终参数
  # 如果传入了level参数,则使用它,否则使用默认级别

@@ -110,7 +87,7 @@
  else:
  # Logger级别设置为DEBUG(最低级别),由handler控制实际输出
  final_level = DEBUG
-
+
  final_filename = filename if filename is not None else cls._default_filename

  # 安全的字符串化 key,避免任何 unhashable 类型

@@ -146,32 +123,9 @@
  if log_dir and not os.path.exists(log_dir):
  os.makedirs(log_dir, exist_ok=True)

- #
-
-
- # 使用大小轮转
- fh = RotatingFileHandler(
- final_filename,
- maxBytes=cls._default_log_max_bytes,
- backupCount=cls._default_log_backup_count,
- encoding=cls._default_encoding
- )
- else:
- # 使用时间轮转
- fh = TimedRotatingFileHandler(
- final_filename,
- when=cls._default_log_when,
- interval=cls._default_log_interval,
- backupCount=cls._default_log_backup_count,
- encoding=cls._default_encoding
- )
- # 如果提供了自定义后缀格式,则设置
- if cls._default_log_rotation_suffix:
- fh.suffix = cls._default_log_rotation_suffix
- else:
- # 使用普通文件处理器(默认行为,会追加到文件)
- fh = FileHandler(final_filename, mode='a', encoding=cls._default_encoding)
-
+ # 使用普通文件处理器(移除日志轮转功能)
+ fh = FileHandler(final_filename, mode='a', encoding=cls._default_encoding)
+
  fh.setFormatter(formatter)
  fh.setLevel(cls._default_file_level)
  _logger.addHandler(fh)

@@ -183,6 +137,11 @@
  cls.logger_cache[key] = _logger
  return _logger

+ @classmethod
+ def is_configured(cls):
+ """检查日志系统是否已配置"""
+ return cls._configured
+

  # 全局快捷函数
  get_logger = LoggerManager.get_logger

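Log rotation is removed entirely in this version: LOG_USE_ROTATION, LOG_ROTATION_TYPE, LOG_MAX_BYTES and the related keys are no longer read, LOG_CONSOLE_LEVEL is dropped, and LOG_LEVEL drives both handlers with LOG_FILE_LEVEL defaulting to it. Below is a hedged sketch of configuring the simplified logger directly; it assumes configure() accepts a settings mapping as its first argument (the hunk shows it also falls back to keyword arguments), and uses only the keys that survive in 1.3.0.

    # Hedged sketch of the simplified logging configuration; the positional
    # settings-mapping argument to configure() is an assumption.
    from crawlo.utils.log import LoggerManager, get_logger

    LoggerManager.configure({
        "LOG_FILE": "logs/crawlo.log",  # plain FileHandler now; rotation keys are ignored
        "LOG_LEVEL": "INFO",            # drives the console and, by default, the file handler
        # "LOG_FILE_LEVEL": "DEBUG",    # optional: overrides the file handler only
    })

    logger = get_logger(__name__)
    logger.info("console and file handlers share LOG_LEVEL unless LOG_FILE_LEVEL is set")
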
{crawlo-1.2.9.dist-info → crawlo-1.3.0.dist-info}/RECORD
CHANGED

@@ -1,13 +1,13 @@
  crawlo/__init__.py,sha256=qZzTmb7hw5h_qcP2EYGUZcoSScxlKZFJ76CjSeS7UfA,1381
- crawlo/__version__.py,sha256=
+ crawlo/__version__.py,sha256=zi_LaUT_OsChAtsPXbOeRpQkCohSsOyeXfavQPM0GoE,22
  crawlo/cli.py,sha256=OXprmcTUbFK02ptw_Gq8Gk4-ZCU-WEMJgzU1ztgP6Bk,2327
  crawlo/config.py,sha256=dNoNyTkXLe2msQ7bZx3YTQItk1m49nIg5-g89FQDNwE,9486
  crawlo/config_validator.py,sha256=gsiLqf5swWd9ISDvoLqCdG7iSXr-ZdBPD4iT6ug1ua4,11239
- crawlo/crawler.py,sha256=
+ crawlo/crawler.py,sha256=rxyjA5pXOd709bujgniqYG9tR3eoNaok6wJaeZOgzmo,39451
  crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
  crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
- crawlo/mode_manager.py,sha256=
- crawlo/project.py,sha256=
+ crawlo/mode_manager.py,sha256=soEgZNBt6jA0qtC1WH-MG_2WngDk2RfmQckLsK3NzmQ,7510
+ crawlo/project.py,sha256=830PPRUD6ldE8MKPdkFkPiUcecHhlWP3fUXYC96_T0Y,10506
  crawlo/stats_collector.py,sha256=NkO09CB-220qz5rxFcD_dedGfr2VPFrDo4hya0Zh8Qc,1577
  crawlo/subscriber.py,sha256=D3hzE7Pc_zJjc-zR7lct5pt32bz6LsDYeC8uHlS4Hso,4986
  crawlo/task_manager.py,sha256=19InAxS9oJ9EMj20Aw2urN-v6BeC22dkgPuW-B9-4UI,819

@@ -16,12 +16,12 @@ crawlo/commands/check.py,sha256=7pD43s97DD-fSLO9OEOuNcNr7o-2g94rJULL8fUzdaI,2260
  crawlo/commands/genspider.py,sha256=HhtvBLkIuhYtJUzom6PquItiC22vU9LNpOkjDUiqdM4,4937
  crawlo/commands/help.py,sha256=gwfHibRpdYDmZO6waUMOEn8SMJ_ubdjL-prD5fiuVY8,4973
  crawlo/commands/list.py,sha256=BqlPjBa5FLotjAlyZ3-nGmXg5cWcCNbHi8U5znb2_D8,5722
- crawlo/commands/run.py,sha256=
+ crawlo/commands/run.py,sha256=KcJ4h4D7lavB6qQDpYMrbgJMgY5vCSLHaLckos5EUNY,11793
  crawlo/commands/startproject.py,sha256=aqKRJarKqTf5XjJnGXwjRpp0uYF16LreFbwwQLGpK-0,16070
  crawlo/commands/stats.py,sha256=8wTubR1RQ1JPTlpOKArcGcQ39bM-0cuH27lDpndnwPQ,6014
  crawlo/commands/utils.py,sha256=pXiFzwVIVXdSPO2Fty_u19P1lsE8HmuE8gTMamKZZUs,5047
  crawlo/core/__init__.py,sha256=PnFyJdVNHBoPmV1sW0AHQXijeoSTQ8cMYrbNM1JK8kA,41
- crawlo/core/engine.py,sha256=
+ crawlo/core/engine.py,sha256=Hy0K_g9My6aQ3CPkxAcCiPsumdwh4O8qRhmFlNoErd4,14496
  crawlo/core/processor.py,sha256=qmCqAeqhwYu-UE86evYesaGt9qpuSIfH-ZIZKcXFCZc,1140
  crawlo/core/scheduler.py,sha256=D-YzXVvnP6DEkovmz9hThhzIe2UgRrQLNt9pJCPEPwY,12593
  crawlo/data/__init__.py,sha256=8MgDxcMhx-emFARcLAw_ODOZNz0neYBcx7kEbzothd8,116

@@ -37,7 +37,7 @@ crawlo/extension/__init__.py,sha256=FbOwJ4jh60xCbSh7P9CUGJsGAC-VH4MyOtCftRMlxbk,
  crawlo/extension/health_check.py,sha256=0GveZgUtFwjYEKlm3qbwIvCmb4FR0qrIKc8cEF1yQV8,5516
  crawlo/extension/log_interval.py,sha256=VCIeNqXcWDnxj4m6l77cjqgRzV8LfsPMb22X0Xc1Vwc,2417
  crawlo/extension/log_stats.py,sha256=vrChs3bj_Dvay3kxxkBOp4-w0K-IG-2XZ0PoSUahTPs,2908
- crawlo/extension/logging_extension.py,sha256=
+ crawlo/extension/logging_extension.py,sha256=RfL1wI4nz-1Xtg420Ktp7RXnOPnZSHwO0Zpg1w4fO4M,1726
  crawlo/extension/memory_monitor.py,sha256=4aszl3C0GMQbqFhGZjZq5iQuXQR1sOz06VHjjEHgkyE,4290
  crawlo/extension/performance_profiler.py,sha256=EPiNuXuPPDU0Jtgy8arYHpr_8ASK13cCI2BytdJnu_I,4899
  crawlo/extension/request_recorder.py,sha256=RC23yzXClnVv9j2ljQvjBkUfWznfnDHsrQejKhE9y5E,4074

@@ -51,7 +51,7 @@ crawlo/items/items.py,sha256=e-3nXI9ckD64vcDxxQiAU6ufbtJMs09gbZQcYjxgwHY,3374
  crawlo/middleware/__init__.py,sha256=ldaGFNbiJnK9Fx12Vdf9fDNfzXxoETtShp5r-vodtw0,549
  crawlo/middleware/default_header.py,sha256=wQ7BrUHd-hRosFoKsReV9hwNNr_jwK6V0ZfxL6MOGrk,5032
  crawlo/middleware/download_delay.py,sha256=zt9R5g2HWErWA_MAOnGcw_D8l6HD769Kyaw-Hv-vcTc,3438
- crawlo/middleware/middleware_manager.py,sha256=
+ crawlo/middleware/middleware_manager.py,sha256=9Sj9rrWK6R9NZq9eT38sWRGuBKLKfjSgEAxu-5NCWgU,6278
  crawlo/middleware/offsite.py,sha256=b3BMwNKGR41YGJGHt1S0H7yXujEkztVvonUQGO05hoM,4026
  crawlo/middleware/proxy.py,sha256=NquB6tqHAgHs3-2_1_5220kJYfjNG5JyHRJyo_2j4wo,15636
  crawlo/middleware/request_ignore.py,sha256=xcyZ1c7r_HhbzR3r9pfjsLGW7L7FBVeYvlNt8cpP2wY,2577

@@ -71,7 +71,7 @@ crawlo/pipelines/json_pipeline.py,sha256=vlu1nqbD2mtqtExt9cL5nibx1CwJM1RNqd4WGjZ
  crawlo/pipelines/memory_dedup_pipeline.py,sha256=oIksbIrmSw9s9jMh6JJMfVbv6hzseVMV_g9S8UHQUP4,3837
  crawlo/pipelines/mongo_pipeline.py,sha256=k7gNqAO-g2MtIfArphC6z5ZzkKVRkBKcv-2ImziPFA0,5706
  crawlo/pipelines/mysql_pipeline.py,sha256=G2DMhdh0ihBBolIul4YVTDz2JbrZGJauDtWF-gqRW0w,13473
- crawlo/pipelines/pipeline_manager.py,sha256=
+ crawlo/pipelines/pipeline_manager.py,sha256=vCgfbhgsKMLm_7jCnr3cE5GemIYkG9u4oF8u4Ta_7so,3013
  crawlo/pipelines/redis_dedup_pipeline.py,sha256=POYRiWAOp1pqDW9iTPJ8h3VcpLALeLrpw74MvJJqPiM,6342
  crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  crawlo/queue/pqueue.py,sha256=yzF900ko2RReVNZtfk_VV3PzSXEUALI6SHf97geuu6s,1058

@@ -79,7 +79,7 @@ crawlo/queue/queue_manager.py,sha256=XqS_oVbNQJWdtokOuDDPK-FzMrVdnZ3UKp1MF_DMJww
  crawlo/queue/redis_priority_queue.py,sha256=k1OChSMRovSMkbbJ9388axfhpYeMevuJTe-3N1oYhbA,13126
  crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
  crawlo/settings/default_settings.py,sha256=98URrj6QBrx_pmJ1yvK-MSAW8VrZ-pl0FfiZEHV0ZnI,9183
- crawlo/settings/setting_manager.py,sha256=
+ crawlo/settings/setting_manager.py,sha256=V3nVJEPtusadoz5eILXFeNyDXX1u_MgIiKIFIWVDY1s,6189
  crawlo/spider/__init__.py,sha256=ZnSAL9PXLZSIH-Jdv-P6RuWmQUdukr8KPLQK6SXZZaU,20435
  crawlo/templates/crawlo.cfg.tmpl,sha256=9BAmwEibS5Tvy6HIcGXPb0BGeuesmibebmTW0iAEkmo,230
  crawlo/templates/run.py.tmpl,sha256=v_g-LQMYJ6pC8TZgyWj0yB2yTTKrwy9lEJufAYCXyxY,1228

@@ -117,7 +117,7 @@ crawlo/utils/error_handler.py,sha256=q6NqHxjYrKdswfmhshMYMmfBIr0M2YWPYxts4ScHl4Y
  crawlo/utils/func_tools.py,sha256=WUZEGpWMuDDX7g-QySM7iaiC74erW2SSkZoUvDw1NjM,2369
  crawlo/utils/large_scale_config.py,sha256=j7wQ5ty7pQlBRygw2vhRJ7OI19RYBZKPfYMP3WeF2WI,8154
  crawlo/utils/large_scale_helper.py,sha256=Kxdy3WMuqjzQTyCc6z4xEYxXDi4xnYKJzsVwaBYZrrg,12108
- crawlo/utils/log.py,sha256=
+ crawlo/utils/log.py,sha256=xZe3UU78yr10lK0hxALBQB0Uv9cXShOPPksoe5n_qKI,5229
  crawlo/utils/performance_monitor.py,sha256=Q9xxuXBIfFoig_U-FQPOUuPAh1axO3MzYgpielDyku0,9547
  crawlo/utils/queue_helper.py,sha256=xpUUTOqlU1xz2Pb9NKAVGo3AfAO-7Xvx8Lm1q65Dgck,4743
  crawlo/utils/redis_connection_pool.py,sha256=amGjhaKpodMrw9X56qxZ6f3OTZhjrI89sSVGqgwAQGU,11050

@@ -212,8 +212,8 @@ tests/test_tools.py,sha256=9t9FXZ61MfdB70nck9NYzCq97yd3SLVlLiMybEAlClk,5345
  tests/test_user_agents.py,sha256=rUotyuE2iJDi2LQBrUh980U-dAMTs4ARPMJxICOoQFY,3231
  tests/tools_example.py,sha256=MtIypR-OFiWwi-skurwmq4fM0cGTt-GUX4hSekYs7BY,7739
  tests/verify_distributed.py,sha256=krnYYA5Qx9xXDMWc9YF5DxPSplGvawDg2n0l-3CAqoM,3928
- crawlo-1.2.9.dist-info/METADATA,sha256=
- crawlo-1.2.9.dist-info/WHEEL,sha256=
- crawlo-1.2.9.dist-info/entry_points.txt,sha256=
- crawlo-1.2.9.dist-info/top_level.txt,sha256=
- crawlo-1.2.9.dist-info/RECORD,,
+ crawlo-1.3.0.dist-info/METADATA,sha256=5BRT0EE3J1yUtWZ0l_pZqEWxTgGA1p3laxJjTSu7980,26298
+ crawlo-1.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ crawlo-1.3.0.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+ crawlo-1.3.0.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+ crawlo-1.3.0.dist-info/RECORD,,

File without changes
|
|
File without changes
|
|
File without changes
|