crawlo 1.0.0.tar.gz → 1.0.2.tar.gz

This diff compares publicly available package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (94)
  1. crawlo-1.0.2/MANIFEST.in +17 -0
  2. {crawlo-1.0.0/crawlo.egg-info → crawlo-1.0.2}/PKG-INFO +23 -11
  3. crawlo-1.0.2/README.md +2 -0
  4. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/__init__.py +1 -0
  5. crawlo-1.0.2/crawlo/__version__.py +2 -0
  6. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/core/engine.py +9 -7
  7. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/core/processor.py +1 -1
  8. crawlo-1.0.2/crawlo/core/scheduler.py +59 -0
  9. crawlo-1.0.2/crawlo/crawler.py +222 -0
  10. crawlo-1.0.2/crawlo/downloader/playwright_downloader.py +161 -0
  11. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/extension/log_stats.py +4 -4
  12. crawlo-1.0.2/crawlo/filters/__init__.py +37 -0
  13. crawlo-1.0.2/crawlo/filters/aioredis_filter.py +130 -0
  14. crawlo-1.0.2/crawlo/filters/memory_filter.py +203 -0
  15. crawlo-1.0.2/crawlo/filters/redis_filter.py +120 -0
  16. crawlo-1.0.2/crawlo/items/__init__.py +62 -0
  17. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/items/items.py +36 -5
  18. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/middleware/retry.py +8 -2
  19. crawlo-1.0.2/crawlo/network/request.py +234 -0
  20. crawlo-1.0.2/crawlo/network/response.py +162 -0
  21. crawlo-1.0.2/crawlo/pipelines/console_pipeline.py +40 -0
  22. crawlo-1.0.2/crawlo/pipelines/mongo_pipeline.py +117 -0
  23. crawlo-1.0.2/crawlo/pipelines/mysql_batch_pipline.py +134 -0
  24. crawlo-1.0.2/crawlo/pipelines/mysql_pipeline.py +195 -0
  25. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/pipelines/pipeline_manager.py +3 -3
  26. crawlo-1.0.2/crawlo/settings/default_settings.py +89 -0
  27. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/spider/__init__.py +2 -2
  28. crawlo-1.0.2/crawlo/subscriber.py +106 -0
  29. crawlo-1.0.2/crawlo/utils/concurrency_manager.py +125 -0
  30. crawlo-1.0.2/crawlo/utils/date_tools.py +177 -0
  31. crawlo-1.0.2/crawlo/utils/func_tools.py +82 -0
  32. crawlo-1.0.2/crawlo/utils/pqueue.py +174 -0
  33. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/utils/project.py +3 -2
  34. crawlo-1.0.2/crawlo/utils/request.py +85 -0
  35. crawlo-1.0.2/crawlo/utils/tools.py +303 -0
  36. crawlo-1.0.2/crawlo/utils/url.py +40 -0
  37. {crawlo-1.0.0 → crawlo-1.0.2/crawlo.egg-info}/PKG-INFO +23 -11
  38. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo.egg-info/SOURCES.txt +12 -1
  39. crawlo-1.0.2/crawlo.egg-info/requires.txt +34 -0
  40. crawlo-1.0.2/pyproject.toml +3 -0
  41. crawlo-1.0.2/requirements.txt +21 -0
  42. {crawlo-1.0.0 → crawlo-1.0.2}/setup.cfg +22 -7
  43. crawlo-1.0.0/MANIFEST.in +0 -11
  44. crawlo-1.0.0/README.md +0 -2
  45. crawlo-1.0.0/crawlo/__version__.py +0 -2
  46. crawlo-1.0.0/crawlo/core/scheduler.py +0 -35
  47. crawlo-1.0.0/crawlo/crawler.py +0 -107
  48. crawlo-1.0.0/crawlo/items/__init__.py +0 -24
  49. crawlo-1.0.0/crawlo/network/request.py +0 -52
  50. crawlo-1.0.0/crawlo/network/response.py +0 -93
  51. crawlo-1.0.0/crawlo/pipelines/console_pipeline.py +0 -20
  52. crawlo-1.0.0/crawlo/pipelines/mongo_pipeline.py +0 -5
  53. crawlo-1.0.0/crawlo/pipelines/mysql_pipeline.py +0 -5
  54. crawlo-1.0.0/crawlo/settings/default_settings.py +0 -39
  55. crawlo-1.0.0/crawlo/subscriber.py +0 -27
  56. crawlo-1.0.0/crawlo/utils/date_tools.py +0 -20
  57. crawlo-1.0.0/crawlo/utils/func_tools.py +0 -22
  58. crawlo-1.0.0/crawlo/utils/pqueue.py +0 -16
  59. crawlo-1.0.0/crawlo.egg-info/requires.txt +0 -20
  60. crawlo-1.0.0/pyproject.toml +0 -6
  61. {crawlo-1.0.0 → crawlo-1.0.2}/LICENSE +0 -0
  62. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/core/__init__.py +0 -0
  63. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/downloader/__init__.py +0 -0
  64. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/downloader/aiohttp_downloader.py +0 -0
  65. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/downloader/httpx_downloader.py +0 -0
  66. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/event.py +0 -0
  67. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/exceptions.py +0 -0
  68. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/extension/__init__.py +0 -0
  69. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/extension/log_interval.py +0 -0
  70. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/middleware/__init__.py +0 -0
  71. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/middleware/default_header.py +0 -0
  72. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/middleware/download_delay.py +0 -0
  73. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/middleware/middleware_manager.py +0 -0
  74. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/middleware/request_ignore.py +0 -0
  75. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/middleware/response_code.py +0 -0
  76. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/middleware/response_filter.py +0 -0
  77. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/network/__init__.py +0 -0
  78. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/pipelines/__init__.py +0 -0
  79. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/settings/__init__.py +0 -0
  80. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/settings/setting_manager.py +0 -0
  81. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/stats_collector.py +0 -0
  82. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/task_manager.py +0 -0
  83. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/templates/item_template.tmpl +0 -0
  84. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/templates/project_template/items/__init__.py +0 -0
  85. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/templates/project_template/main.py +0 -0
  86. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/templates/project_template/setting.py +0 -0
  87. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/templates/project_template/spiders/__init__.py +0 -0
  88. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/templates/spider_template.tmpl +0 -0
  89. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/utils/__init__.py +0 -0
  90. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/utils/log.py +0 -0
  91. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo/utils/system.py +0 -0
  92. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo.egg-info/dependency_links.txt +0 -0
  93. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo.egg-info/entry_points.txt +0 -0
  94. {crawlo-1.0.0 → crawlo-1.0.2}/crawlo.egg-info/top_level.txt +0 -0
@@ -0,0 +1,17 @@
+ include README.md
+ include LICENSE
+ include requirements.txt # if a global requirements.txt exists in the project root
+ include VERSION # if a global VERSION file exists in the project root
+
+ # Files bundled inside the package
+ recursive-include crawlo/utils/js *
+ recursive-include crawlo/templates *
+
+ # Test files (if tests should ship with the distribution)
+ recursive-include tests *
+
+ # Exclusions
+ global-exclude __pycache__ *.py[cod] .DS_Store *.so
+ global-exclude *.bak *.swp *.orig *.rej
+ prune samples # exclude the samples directory
+ prune docs # exclude the docs directory
@@ -1,7 +1,7 @@
  Metadata-Version: 2.4
  Name: crawlo
- Version: 1.0.0
- Summary: feapder is an async-capable Python crawler framework
+ Version: 1.0.2
+ Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
  Home-page: https://github.com/crawl-coder/Crawlo.git
  Author: crawl-coder
  Author-email: crawlo@qq.com
@@ -11,13 +11,26 @@ Classifier: License :: OSI Approved :: MIT License
  Classifier: Operating System :: OS Independent
  Requires-Python: >=3.6
  Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: aiohttp>=3.12.6
- Requires-Dist: httpx>=0.28.1
- Requires-Dist: DBUtils>=2.0
- Requires-Dist: parsel>=1.10.0
- Requires-Dist: pymysql>=1.1.1
- Requires-Dist: ujson>=5.10.0
+ Requires-Dist: aiohttp>=3.12.14
+ Requires-Dist: aiomysql>=0.2.0
+ Requires-Dist: aioredis>=2.0.1
+ Requires-Dist: asyncmy>=0.2.10
+ Requires-Dist: cssselect>=1.2.0
+ Requires-Dist: dateparser>=1.2.2
+ Requires-Dist: httpx>=0.27.0
+ Requires-Dist: lxml>=5.2.1
+ Requires-Dist: motor>=3.7.0
+ Requires-Dist: parsel>=1.9.1
+ Requires-Dist: pydantic>=2.11.7
+ Requires-Dist: pymongo>=4.11
+ Requires-Dist: PyMySQL>=1.1.1
+ Requires-Dist: python-dateutil>=2.9.0.post0
+ Requires-Dist: redis>=6.2.0
+ Requires-Dist: requests>=2.32.4
+ Requires-Dist: six>=1.17.0
+ Requires-Dist: ujson>=5.9.0
+ Requires-Dist: urllib3>=2.5.0
+ Requires-Dist: w3lib>=2.1.2
  Provides-Extra: render
  Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
  Requires-Dist: playwright; extra == "render"
@@ -30,7 +43,6 @@ Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
  Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
  Requires-Dist: playwright; extra == "all"
  Requires-Dist: selenium>=3.141.0; extra == "all"
- Dynamic: license-file
 
  # Crawlo
- An asynchronous general-purpose crawler framework
+ Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling and data pipelines.
crawlo-1.0.2/README.md ADDED
@@ -0,0 +1,2 @@
+ # Crawlo
+ Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling and data pipelines.
@@ -3,3 +3,4 @@
  from crawlo.network.request import Request
  from crawlo.network.response import Response
  from crawlo.items.items import Item
+ from .__version__ import __version__
@@ -0,0 +1,2 @@
+
+ __version__ = "1.0.2"
@@ -1,20 +1,20 @@
  #!/usr/bin/python
  # -*- coding:UTF-8 -*-
  import asyncio
+ from inspect import iscoroutine
  from typing import Optional, Generator, Callable
- from inspect import iscoroutine, isgenerator, isasyncgen
 
  from crawlo import Request, Item
  from crawlo.spider import Spider
+ from crawlo.utils.log import get_logger
+ from crawlo.exceptions import OutputError
  from crawlo.core.scheduler import Scheduler
  from crawlo.core.processor import Processor
- from crawlo.utils.log import get_logger
  from crawlo.task_manager import TaskManager
  from crawlo.utils.project import load_class
  from crawlo.downloader import DownloaderBase
  from crawlo.utils.func_tools import transform
- from crawlo.exceptions import OutputError, TransformTypeError
- from crawlo.event import spider_opened, spider_error
+ from crawlo.event import spider_opened, spider_error, request_scheduled
 
 
  class Engine(object):
@@ -49,7 +49,7 @@ class Engine(object):
      async def start_spider(self, spider):
          self.spider = spider
 
-         self.scheduler = Scheduler(self.crawler)
+         self.scheduler = Scheduler.create_instance(self.crawler)
          if hasattr(self.scheduler, 'open'):
              self.scheduler.open()
 
@@ -115,7 +115,7 @@ class Engine(object):
              if iscoroutine(_outputs):
                  await _outputs
              else:
-                 return transform(_outputs)
+                 return transform(_outputs, _response)
 
          _response = await self.downloader.fetch(request)
          if _response is None:
@@ -128,7 +128,8 @@ class Engine(object):
 
      async def _schedule_request(self, request):
          # TODO deduplication
-         await self.scheduler.enqueue_request(request)
+         if await self.scheduler.enqueue_request(request):
+             asyncio.create_task(self.crawler.subscriber.notify(request_scheduled, request, self.crawler.spider))
 
      async def _get_next_request(self):
          return await self.scheduler.next_request()
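With this change the engine fires the new request_scheduled event only for requests that pass the scheduler's dedup check. A minimal, illustrative sketch of hooking that event through the crawler's subscriber follows; the callback name and the place where the `crawler` instance comes from are assumptions, not framework API, but the subscribe() call mirrors how crawler.py wires spider_opened/spider_closed below.

    from crawlo.event import request_scheduled

    async def on_request_scheduled(request, spider):
        # Receives the (request, spider) arguments the engine passes to subscriber.notify().
        print(f"scheduled by {spider.__class__.__name__}: {request}")

    # Assuming `crawler` is a Crawler instance already in scope (e.g. inside a custom extension):
    crawler.subscriber.subscribe(on_request_scheduled, event=request_scheduled)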
@@ -152,6 +153,7 @@ class Engine(object):
 
      async def close_spider(self):
          await asyncio.gather(*self.task_manager.current_task)
+         await self.scheduler.close()
          await self.downloader.close()
          if self.normal:
              await self.crawler.close()
@@ -15,7 +15,7 @@ class Processor(object):
          self.pipelines: Optional[PipelineManager] = None
 
      def open(self):
-         self.pipelines = PipelineManager.create_instance(self.crawler)
+         self.pipelines = PipelineManager.from_crawler(self.crawler)
 
      async def process(self):
          while not self.idle():
@@ -0,0 +1,59 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from typing import Optional, Callable
+
+ from crawlo.utils.log import get_logger
+ from crawlo.utils.request import set_request
+ from crawlo.utils.pqueue import SpiderPriorityQueue
+ from crawlo.utils.project import load_class, common_call
+
+
+ class Scheduler:
+     def __init__(self, crawler, dupe_filter, stats, log_level, priority):
+         self.crawler = crawler
+         self.request_queue: Optional[SpiderPriorityQueue] = None
+
+         # self.item_count = 0
+         # self.response_count = 0
+         self.logger = get_logger(name=self.__class__.__name__, level=log_level)
+         self.stats = stats
+         self.dupe_filter = dupe_filter
+         self.priority = priority
+
+     @classmethod
+     def create_instance(cls, crawler):
+         filter_cls = load_class(crawler.settings.get('FILTER_CLASS'))
+         o = cls(
+             crawler=crawler,
+             dupe_filter=filter_cls.create_instance(crawler),
+             stats=crawler.stats,
+             log_level=crawler.settings.get('LOG_LEVEL'),
+             priority=crawler.settings.get('DEPTH_PRIORITY')
+         )
+         return o
+
+     def open(self):
+         self.request_queue = SpiderPriorityQueue()
+         self.logger.info(f'requesting filter: {self.dupe_filter}')
+
+     async def next_request(self):
+         request = await self.request_queue.get()
+         return request
+
+     async def enqueue_request(self, request):
+         if not request.dont_filter and await common_call(self.dupe_filter.requested, request):
+             self.dupe_filter.log_stats(request)
+             return False
+         set_request(request, self.priority)
+         await self.request_queue.put(request)
+         return True
+
+     def idle(self) -> bool:
+         return len(self) == 0
+
+     async def close(self):
+         if isinstance(closed := getattr(self.dupe_filter, 'closed', None), Callable):
+             await closed()
+
+     def __len__(self):
+         return self.request_queue.qsize()
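enqueue_request routes the dedup check through crawlo.utils.project.common_call, whose implementation is not part of this diff. Purely as an assumption for readers following the control flow, a sync-or-async dispatch helper of that kind typically looks like the sketch below.

    import inspect

    async def common_call(func, *args, **kwargs):
        # Call func; if it returns an awaitable (e.g. an async Redis-backed filter),
        # await it, otherwise return the synchronous result (e.g. BaseFilter.requested).
        result = func(*args, **kwargs)
        if inspect.isawaitable(result):
            result = await result
        return result

A helper along these lines would let the scheduler treat synchronous and asynchronous dedup filters uniformly.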
@@ -0,0 +1,222 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*
+ import signal
+ import asyncio
+ from typing import Type, Final, Set, Optional
+
+ from crawlo.spider import Spider
+ from crawlo.core.engine import Engine
+ from crawlo.utils.log import get_logger
+ from crawlo.subscriber import Subscriber
+ from crawlo.extension import ExtensionManager
+ from crawlo.exceptions import SpiderTypeError
+ from crawlo.stats_collector import StatsCollector
+ from crawlo.event import spider_opened, spider_closed
+ from crawlo.settings.setting_manager import SettingManager
+ from crawlo.utils.project import merge_settings, get_settings
+ from crawlo.utils.concurrency_manager import calculate_optimal_concurrency
+
+ logger = get_logger(__name__)
+
+
+ class Crawler:
+
+     def __init__(self, spider_cls, settings):
+         self.spider_cls = spider_cls
+         self.spider: Optional[Spider] = None
+         self.engine: Optional[Engine] = None
+         self.stats: Optional[StatsCollector] = None
+         self.subscriber: Optional[Subscriber] = None
+         self.extension: Optional[ExtensionManager] = None
+         self.settings: SettingManager = settings.copy()
+
+     async def crawl(self):
+         self.subscriber = self._create_subscriber()
+         self.spider = self._create_spider()
+         self.engine = self._create_engine()
+         self.stats = self._create_stats()
+         self.extension = self._create_extension()
+
+         await self.engine.start_spider(self.spider)
+
+     @staticmethod
+     def _create_subscriber():
+         return Subscriber()
+
+     def _create_spider(self) -> Spider:
+         spider = self.spider_cls.create_instance(self)
+         self._set_spider(spider)
+         return spider
+
+     def _create_engine(self) -> Engine:
+         engine = Engine(self)
+         engine.engine_start()
+         return engine
+
+     def _create_stats(self) -> StatsCollector:
+         stats = StatsCollector(self)
+         return stats
+
+     def _create_extension(self) -> ExtensionManager:
+         extension = ExtensionManager.create_instance(self)
+         return extension
+
+     def _set_spider(self, spider):
+         self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
+         self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
+         merge_settings(spider, self.settings)
+
+     async def close(self, reason='finished') -> None:
+         await asyncio.create_task(self.subscriber.notify(spider_closed))
+         self.stats.close_spider(spider_name=self.spider, reason=reason)
+
+
+ class CrawlerProcess:
+     """Crawler process class with cross-platform dynamic concurrency control and fine-grained logging."""
+
+     def __init__(self, settings=None, max_concurrency: Optional[int] = None, batch_size: int = 50):
+         self.crawlers: Final[Set] = set()
+         self._active_spiders: Final[Set] = set()
+         self.settings = settings or self._get_default_settings()
+         self.batch_size = batch_size
+
+         # Compute the maximum concurrency via a dedicated helper module
+         self.max_concurrency = calculate_optimal_concurrency(max_concurrency)
+         self.semaphore = asyncio.Semaphore(self.max_concurrency)
+
+         signal.signal(signal.SIGINT, self._shutdown)
+         logger.info(f"Initialized crawler process, max concurrency: {self.max_concurrency}")
+
+     async def crawl(self, spiders):
+         """Run one or many spiders in batches, with tidier log output."""
+         if not spiders:
+             raise ValueError("At least one spider class must be provided")
+
+         # Normalize to a list
+         if isinstance(spiders, type) and issubclass(spiders, Spider):
+             spiders = [spiders]
+         elif isinstance(spiders, (list, tuple)):
+             spiders = list(spiders)
+         else:
+             raise TypeError("spiders must be a spider class or a list/tuple of spider classes")
+
+         # Sort by spider class name (ascending, case-insensitive)
+         spiders.sort(key=lambda x: x.__name__.lower())
+
+         if len(spiders) == 1:
+             logger.info(f"Starting spider: {spiders[0].__name__}")
+         else:
+             logger.info(f"Starting {len(spiders)} spiders, sorted by name and processed in batches")
+
+         batches = [spiders[i:i + self.batch_size] for i in range(0, len(spiders), self.batch_size)]
+
+         for batch_idx, batch in enumerate(batches):
+             batch_tasks = set()
+
+             for spider_cls in batch:
+                 crawler = self._create_crawler(spider_cls)
+                 self.crawlers.add(crawler)
+
+                 await self.semaphore.acquire()
+                 task = asyncio.create_task(self._run_crawler_with_semaphore(crawler))
+                 batch_tasks.add(task)
+                 self._active_spiders.add(task)
+
+             if len(spiders) > 1:  # Only show batch info when running multiple spiders
+                 logger.info(f"Starting batch {batch_idx + 1}/{len(batches)} with {len(batch)} spiders")
+
+             await asyncio.gather(*batch_tasks)
+
+             if len(spiders) > 1:  # Only show batch-completion info when running multiple spiders
+                 logger.info(f"Batch {batch_idx + 1} finished")
+
+     async def _run_crawler_with_semaphore(self, crawler):
+         """Run a crawler under the semaphore limit."""
+         try:
+             await crawler.crawl()
+         finally:
+             self.semaphore.release()  # Always release the slot
+
+     async def start(self):
+         """Start all crawler tasks."""
+         if self._active_spiders:
+             logger.info(f"Starting {len(self._active_spiders)} crawler tasks, concurrency limit: {self.max_concurrency}")
+         await asyncio.gather(*self._active_spiders)
+
+     def _create_crawler(self, spider_cls) -> Crawler:
+         """Create a crawler instance."""
+         if isinstance(spider_cls, str):
+             raise SpiderTypeError(f"{type(self)}.crawl args: String is not supported.")
+         crawler: Crawler = Crawler(spider_cls, self.settings)
+         return crawler
+
+     def _shutdown(self, _signum, _frame):
+         """Gracefully shut down all crawlers."""
+         logger.warning(f"Received shutdown signal, gracefully closing {len(self.crawlers)} crawlers...")
+         for crawler in self.crawlers:
+             if crawler.engine:
+                 crawler.engine.running = False
+                 crawler.engine.normal = False
+                 crawler.stats.close_spider(crawler.spider, 'shutdown signal')
+
+         # Wait for all tasks to finish
+         asyncio.create_task(self._wait_for_tasks())
+
+     async def _wait_for_tasks(self):
+         """Wait for all active tasks to complete."""
+         pending = [task for task in self._active_spiders if not task.done()]
+         if pending:
+             logger.info(f"Waiting for {len(pending)} active tasks to finish...")
+             await asyncio.gather(*pending)
+         logger.info("All crawlers shut down gracefully")
+
+     @classmethod
+     def _get_default_settings(cls):
+         """Automatically load the framework's default settings."""
+         try:
+             return get_settings()
+         except ImportError:
+             return {}
+
+ # class CrawlerProcess:
+ #
+ #     def __init__(self, settings=None):
+ #         self.crawlers: Final[Set] = set()
+ #         self._active_spiders: Final[Set] = set()
+ #         self.settings = settings or self._get_default_settings()
+ #
+ #         signal.signal(signal.SIGINT, self._shutdown)
+ #
+ #     async def crawl(self, spider: Type[Spider]):
+ #         crawler: Crawler = self._create_crawler(spider)
+ #         self.crawlers.add(crawler)
+ #         task = await self._crawl(crawler)
+ #         self._active_spiders.add(task)
+ #
+ #     @classmethod
+ #     def _get_default_settings(cls):
+ #         """Automatically load default settings."""
+ #         try:
+ #             return get_settings()
+ #         except ImportError:
+ #             return {}
+ #
+ #     @staticmethod
+ #     async def _crawl(crawler):
+ #         return asyncio.create_task(crawler.crawl())
+ #
+ #     async def start(self):
+ #         await asyncio.gather(*self._active_spiders)
+ #
+ #     def _create_crawler(self, spider_cls) -> Crawler:
+ #         if isinstance(spider_cls, str):
+ #             raise SpiderTypeError(f"{type(self)}.crawl args: String is not supported.")
+ #         crawler: Crawler = Crawler(spider_cls, self.settings)
+ #         return crawler
+ #
+ #     def _shutdown(self, _signum, _frame):
+ #         for crawler in self.crawlers:
+ #             crawler.engine.running = False
+ #             crawler.engine.normal = False
+ #             crawler.stats.close_spider(crawler.spider, 'Ctrl C')
+ #         logger.warning(f'spiders received: `Ctrl C` signal, closed.')
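Based on the CrawlerProcess API above, a minimal usage sketch; the spider class and its import path are hypothetical, and the concurrency values are only illustrative.

    import asyncio
    from crawlo.crawler import CrawlerProcess
    from myproject.spiders import NewsSpider  # hypothetical user-defined Spider subclass

    async def main():
        process = CrawlerProcess(max_concurrency=4, batch_size=10)
        await process.crawl([NewsSpider])  # accepts a single spider class or a list/tuple

    asyncio.run(main())

In this sketch crawl() already awaits each batch via asyncio.gather, so a separate call to start() is not needed.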
@@ -0,0 +1,161 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from typing import Optional, Dict, Any
+ from playwright.async_api import Browser, Page, Response as PlaywrightResponse
+ from crawlo import Response, Request
+ from crawlo.downloader import DownloaderBase
+
+
+ class PlaywrightDownloader(DownloaderBase):
+     def __init__(self, crawler):
+         super().__init__(crawler)
+         # Core Playwright objects
+         self.browser: Optional[Browser] = None  # browser instance
+         self.context: Optional[Any] = None  # browser context (isolates cookies, etc.)
+
+         # Configurable parameters (defaults can be overridden via crawler.settings)
+         self._browser_type: str = "chromium"  # browser type (chromium/firefox/webkit)
+         self._headless: bool = True  # headless mode
+         self._timeout: int = 30000  # operation timeout (milliseconds)
+         self._viewport: Dict[str, int] = {"width": 1280, "height": 720}  # viewport size
+         self._extra_launch_args: Dict[str, Any] = {}  # extra browser launch arguments
+
+     async def _init_browser(self):
+         """Initialize the Playwright browser instance."""
+         from playwright.async_api import async_playwright
+
+         # Start the Playwright engine
+         playwright = await async_playwright().start()
+
+         # Pick the browser type from configuration
+         browser_launcher = {
+             "chromium": playwright.chromium,
+             "firefox": playwright.firefox,
+             "webkit": playwright.webkit
+         }.get(self._browser_type, playwright.chromium)  # default to chromium
+
+         # Launch the browser (with launch arguments)
+         self.browser = await browser_launcher.launch(
+             headless=self._headless,  # headless switch
+             timeout=self._timeout,  # launch timeout
+             **self._extra_launch_args  # pass through extra args (e.g. proxy config)
+         )
+
+         # Create a browser context (isolated environment)
+         self.context = await self.browser.new_context(
+             viewport=self._viewport,  # window size
+             user_agent=self.crawler.settings.get("USER_AGENT")  # custom UA
+         )
+
+     def open(self):
+         """Load parameters from the crawler settings."""
+         super().open()  # parent-class initialization
+
+         # Read configuration (can be overridden in settings.py)
+         self._browser_type = self.crawler.settings.get("PLAYWRIGHT_BROWSER", "chromium")
+         self._headless = self.crawler.settings.get_bool("HEADLESS", True)
+         self._timeout = self.crawler.settings.get_int("PLAYWRIGHT_TIMEOUT", 30000)
+         self._viewport = self.crawler.settings.get_dict("VIEWPORT", {"width": 1280, "height": 720})
+         self._extra_launch_args = self.crawler.settings.get_dict("PLAYWRIGHT_LAUNCH_ARGS", {})
+
+     async def download(self, request: Request) -> Response:
+         """
+         Core download method:
+         1. Open a new page (tab)
+         2. Load the target URL
+         3. Collect the rendered content
+         """
+         if not self.browser:
+             await self._init_browser()  # lazily start the browser
+
+         page = await self.context.new_page()  # one page per request (automatic isolation)
+
+         try:
+             # Set request headers (mimic a browser)
+             if request.headers:
+                 await page.set_extra_http_headers(request.headers)
+
+             # Navigate to the target URL (wait strategy is configurable)
+             response = await page.goto(
+                 request.url,
+                 timeout=self._timeout,
+                 wait_until="domcontentloaded"  # wait strategy: domcontentloaded/networkidle/load
+             )
+
+             # Special handling for POST requests (Playwright requires the fetch API here)
+             if request.method.lower() == "post":
+                 return await self._handle_post_request(request, page)
+
+             # Execute custom JavaScript (to extract dynamic data)
+             if request.meta.get("execute_js"):
+                 result = await page.evaluate(request.meta["execute_js"])
+                 request.meta["js_result"] = result  # store the JS result
+
+             # Grab the fully rendered HTML (including dynamically generated content)
+             body = await page.content()
+
+             # Take a screenshot in debug mode (to diagnose page issues)
+             if self.crawler.settings.get_bool("DEBUG"):
+                 screenshot = await page.screenshot(type="png")
+                 request.meta["screenshot"] = screenshot  # stash the screenshot in request.meta
+
+             # Build the unified response object
+             return self._structure_response(request, response, body)
+
+         except Exception as e:
+             self.logger.error(f"Page download failed: {str(e)}")
+             raise
+         finally:
+             await page.close()  # always close the page to avoid leaking resources
+
+     async def _handle_post_request(self, request: Request, page: Page) -> Response:
+         """
+         Special handling for POST requests:
+         send the POST via the in-page fetch API and listen for the matching response.
+         """
+         async with page.expect_response(request.url) as response_info:
+             # Run fetch inside the page context
+             await page.evaluate(
+                 """async ({url, headers, body}) => {
+                     await fetch(url, {
+                         method: 'POST',
+                         headers: headers,
+                         body: body
+                     });
+                 }""",
+                 {
+                     "url": request.url,
+                     "headers": request.headers or {},
+                     "body": request.body or ""
+                 }
+             )
+
+         response = await response_info.value  # the captured response
+         body = await response.text()  # read the response body
+         return self._structure_response(request, response, body)
+
+     @staticmethod
+     def _structure_response(
+             request: Request,
+             response: PlaywrightResponse,
+             body: str
+     ) -> Response:
+         """
+         Normalize the response format:
+         convert the Playwright response into crawlo's unified Response object.
+         """
+         return Response(
+             url=str(response.url),  # final URL (after redirects)
+             headers=response.headers,  # response headers
+             status_code=response.status,  # HTTP status code
+             body=body.encode('utf-8'),  # response body as bytes
+             request=request  # associated request object
+         )
+
+     async def close(self) -> None:
+         """Clean up resources: close the browser context and the browser instance."""
+         if self.context:
+             await self.context.close()
+         if self.browser:
+             await self.browser.close()
+         await super().close()  # parent-class cleanup
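PlaywrightDownloader.open() pulls its configuration from crawler.settings. A hypothetical settings.py fragment covering only the keys the code above actually reads; the values are illustrative, and how the framework is told to use this downloader class is not shown in this diff.

    # Keys read by PlaywrightDownloader.open() / _init_browser()
    PLAYWRIGHT_BROWSER = "firefox"                     # chromium / firefox / webkit
    HEADLESS = True                                    # run without a visible browser window
    PLAYWRIGHT_TIMEOUT = 45000                         # launch/navigation timeout in milliseconds
    VIEWPORT = {"width": 1920, "height": 1080}
    PLAYWRIGHT_LAUNCH_ARGS = {"proxy": {"server": "http://127.0.0.1:8080"}}  # passed through to launch()
    USER_AGENT = "Mozilla/5.0 (compatible; crawlo)"    # applied to the browser context
    DEBUG = False                                      # True stores a PNG screenshot in request.meta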
@@ -1,7 +1,7 @@
  #!/usr/bin/python
  # -*- coding:UTF-8 -*-
  from crawlo import event
- from crawlo.utils.date_tools import now, date_delta
+ from crawlo.utils.date_tools import get_current_time, time_diff_seconds
 
 
  class LogStats(object):
@@ -22,11 +22,11 @@ class LogStats(object):
          return o
 
      async def spider_opened(self):
-         self._stats['start_time'] = now()
+         self._stats['start_time'] = get_current_time(fmt='%Y-%m-%d %H:%M:%S')
 
      async def spider_closed(self):
-         self._stats['end_time'] = now()
-         self._stats['cost_time(s)'] = date_delta(start=self._stats['start_time'], end=self._stats['end_time'])
+         self._stats['end_time'] = get_current_time(fmt='%Y-%m-%d %H:%M:%S')
+         self._stats['cost_time(s)'] = time_diff_seconds(start_time=self._stats['start_time'], end_time=self._stats['end_time'])
 
      async def item_successful(self, _item, _spider):
          self._stats.inc_value('item_successful_count')
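LogStats now relies on two helpers from the new crawlo/utils/date_tools.py, which is added in this release but not shown here. Judging only from the call sites above, their usage looks roughly like the sketch below; this is an assumption, not the module's documented API.

    from crawlo.utils.date_tools import get_current_time, time_diff_seconds

    start = get_current_time(fmt='%Y-%m-%d %H:%M:%S')   # formatted timestamp string
    # ... crawl runs ...
    end = get_current_time(fmt='%Y-%m-%d %H:%M:%S')
    elapsed = time_diff_seconds(start_time=start, end_time=end)  # seconds, stored as 'cost_time(s)'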
@@ -0,0 +1,37 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from abc import ABC, abstractmethod
+
+ from crawlo import Request
+ from crawlo.utils.request import request_fingerprint
+
+
+ class BaseFilter(ABC):
+
+     def __init__(self, logger, stats, debug: bool):
+         self.logger = logger
+         self.stats = stats
+         self.debug = debug
+
+     @classmethod
+     def create_instance(cls, *args, **kwargs) -> 'BaseFilter':
+         return cls(*args, **kwargs)
+
+     def requested(self, request: Request):
+         fp = request_fingerprint(request)
+         if fp in self:
+             return True
+         self.add_fingerprint(fp)
+         return False
+
+     @abstractmethod
+     def add_fingerprint(self, fp) -> None:
+         pass
+
+     def log_stats(self, request: Request) -> None:
+         if self.debug:
+             self.logger.debug(f'Filtered duplicate request: {request}')
+         self.stats.inc_value(f'{self}/filtered_count')
+
+     def __str__(self) -> str:
+         return f'{self.__class__.__name__}'
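BaseFilter leaves fingerprint storage to its subclasses: requested() tests membership with `fp in self` and records new fingerprints via add_fingerprint(). The shipped implementations (memory_filter.py, redis_filter.py, aioredis_filter.py) are not reproduced in this diff, so the following is a deliberately minimal in-memory subclass sketch illustrating that contract; it is not the code that ships in crawlo.filters.memory_filter, and the shipped classes presumably derive logger/stats/debug from the crawler instead of taking them directly.

    from crawlo.filters import BaseFilter

    class SetBackedFilter(BaseFilter):
        """Illustrative only; not the implementation shipped with crawlo."""

        def __init__(self, logger, stats, debug: bool):
            super().__init__(logger, stats, debug)
            self._fingerprints = set()

        def add_fingerprint(self, fp) -> None:
            # Remember the fingerprint so future identical requests are filtered.
            self._fingerprints.add(fp)

        def __contains__(self, fp) -> bool:
            # Enables the `fp in self` check used by BaseFilter.requested().
            return fp in self._fingerprints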