crawlo 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (112)
  1. crawlo/__init__.py +25 -9
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +41 -0
  4. crawlo/commands/__init__.py +10 -0
  5. crawlo/commands/genspider.py +111 -0
  6. crawlo/commands/run.py +149 -0
  7. crawlo/commands/startproject.py +101 -0
  8. crawlo/core/__init__.py +2 -2
  9. crawlo/core/engine.py +158 -158
  10. crawlo/core/processor.py +40 -40
  11. crawlo/core/scheduler.py +57 -57
  12. crawlo/crawler.py +219 -242
  13. crawlo/downloader/__init__.py +78 -78
  14. crawlo/downloader/aiohttp_downloader.py +200 -259
  15. crawlo/downloader/cffi_downloader.py +277 -0
  16. crawlo/downloader/httpx_downloader.py +246 -187
  17. crawlo/event.py +11 -11
  18. crawlo/exceptions.py +78 -64
  19. crawlo/extension/__init__.py +31 -31
  20. crawlo/extension/log_interval.py +49 -49
  21. crawlo/extension/log_stats.py +44 -44
  22. crawlo/extension/logging_extension.py +35 -0
  23. crawlo/filters/__init__.py +37 -37
  24. crawlo/filters/aioredis_filter.py +150 -150
  25. crawlo/filters/memory_filter.py +202 -202
  26. crawlo/items/__init__.py +22 -62
  27. crawlo/items/base.py +31 -0
  28. crawlo/items/fields.py +54 -0
  29. crawlo/items/items.py +105 -119
  30. crawlo/middleware/__init__.py +21 -21
  31. crawlo/middleware/default_header.py +32 -32
  32. crawlo/middleware/download_delay.py +28 -28
  33. crawlo/middleware/middleware_manager.py +135 -140
  34. crawlo/middleware/proxy.py +246 -0
  35. crawlo/middleware/request_ignore.py +30 -30
  36. crawlo/middleware/response_code.py +18 -18
  37. crawlo/middleware/response_filter.py +26 -26
  38. crawlo/middleware/retry.py +90 -90
  39. crawlo/network/__init__.py +7 -7
  40. crawlo/network/request.py +203 -204
  41. crawlo/network/response.py +166 -166
  42. crawlo/pipelines/__init__.py +13 -13
  43. crawlo/pipelines/console_pipeline.py +39 -39
  44. crawlo/pipelines/mongo_pipeline.py +116 -116
  45. crawlo/pipelines/mysql_batch_pipline.py +273 -134
  46. crawlo/pipelines/mysql_pipeline.py +195 -195
  47. crawlo/pipelines/pipeline_manager.py +56 -56
  48. crawlo/settings/__init__.py +7 -7
  49. crawlo/settings/default_settings.py +169 -94
  50. crawlo/settings/setting_manager.py +99 -99
  51. crawlo/spider/__init__.py +41 -36
  52. crawlo/stats_collector.py +59 -59
  53. crawlo/subscriber.py +106 -106
  54. crawlo/task_manager.py +27 -27
  55. crawlo/templates/crawlo.cfg.tmpl +11 -0
  56. crawlo/templates/project/__init__.py.tmpl +4 -0
  57. crawlo/templates/project/items.py.tmpl +18 -0
  58. crawlo/templates/project/middlewares.py.tmpl +76 -0
  59. crawlo/templates/project/pipelines.py.tmpl +64 -0
  60. crawlo/templates/project/settings.py.tmpl +54 -0
  61. crawlo/templates/project/spiders/__init__.py.tmpl +6 -0
  62. crawlo/templates/spider/spider.py.tmpl +32 -0
  63. crawlo/utils/__init__.py +7 -7
  64. crawlo/utils/concurrency_manager.py +124 -124
  65. crawlo/utils/date_tools.py +233 -177
  66. crawlo/utils/db_helper.py +344 -0
  67. crawlo/utils/func_tools.py +82 -82
  68. crawlo/utils/log.py +129 -39
  69. crawlo/utils/pqueue.py +173 -173
  70. crawlo/utils/project.py +199 -59
  71. crawlo/utils/request.py +267 -122
  72. crawlo/utils/spider_loader.py +63 -0
  73. crawlo/utils/system.py +11 -11
  74. crawlo/utils/tools.py +5 -303
  75. crawlo/utils/url.py +39 -39
  76. {crawlo-1.0.4.dist-info → crawlo-1.0.6.dist-info}/METADATA +49 -48
  77. crawlo-1.0.6.dist-info/RECORD +94 -0
  78. crawlo-1.0.6.dist-info/entry_points.txt +2 -0
  79. {crawlo-1.0.4.dist-info → crawlo-1.0.6.dist-info}/top_level.txt +1 -0
  80. examples/gxb/items.py +36 -0
  81. examples/gxb/run.py +16 -0
  82. examples/gxb/settings.py +72 -0
  83. examples/gxb/spider/__init__.py +0 -0
  84. examples/gxb/spider/miit_spider.py +180 -0
  85. examples/gxb/spider/telecom_device.py +129 -0
  86. tests/__init__.py +7 -7
  87. tests/test_proxy_health_check.py +33 -0
  88. tests/test_proxy_middleware_integration.py +137 -0
  89. tests/test_proxy_providers.py +57 -0
  90. tests/test_proxy_stats.py +20 -0
  91. tests/test_proxy_strategies.py +60 -0
  92. crawlo/downloader/playwright_downloader.py +0 -161
  93. crawlo/templates/item_template.tmpl +0 -22
  94. crawlo/templates/project_template/main.py +0 -33
  95. crawlo/templates/project_template/setting.py +0 -190
  96. crawlo/templates/spider_template.tmpl +0 -31
  97. crawlo-1.0.4.dist-info/RECORD +0 -79
  98. crawlo-1.0.4.dist-info/entry_points.txt +0 -2
  99. tests/baidu_spider/__init__.py +0 -7
  100. tests/baidu_spider/demo.py +0 -94
  101. tests/baidu_spider/items.py +0 -25
  102. tests/baidu_spider/middleware.py +0 -49
  103. tests/baidu_spider/pipeline.py +0 -55
  104. tests/baidu_spider/request_fingerprints.txt +0 -9
  105. tests/baidu_spider/run.py +0 -27
  106. tests/baidu_spider/settings.py +0 -80
  107. tests/baidu_spider/spiders/__init__.py +0 -7
  108. tests/baidu_spider/spiders/bai_du.py +0 -61
  109. tests/baidu_spider/spiders/sina.py +0 -79
  110. {crawlo-1.0.4.dist-info → crawlo-1.0.6.dist-info}/WHEEL +0 -0
  111. {crawlo/templates/project_template/items → examples}/__init__.py +0 -0
  112. {crawlo/templates/project_template/spiders → examples/gxb}/__init__.py +0 -0
@@ -0,0 +1,60 @@
+ # tests/test_proxy_strategies.py
+ import pytest
+ from crawlo import Request
+ from crawlo.proxy.strategies import STRATEGIES
+
+
+ @pytest.fixture
+ def mock_proxies():
+     """Provide a proxy list for testing."""
+     return [
+         {'url': 'http://p1:8080'},
+         {'url': 'http://p2:8080'},
+         {'url': 'http://p3:8080'},
+     ]
+
+
+ @pytest.fixture
+ def mock_stats():
+     """Provide usage stats for testing."""
+     return {
+         'http://p1:8080': {'total': 10},
+         'http://p2:8080': {'total': 5},
+         'http://p3:8080': {'total': 1},
+     }
+
+
+ @pytest.fixture
+ def mock_request():
+     """Provide a request object for testing."""
+     return Request("https://example.com")
+
+
+ def test_random_strategy(mock_proxies, mock_request, mock_stats):
+     """Test the random strategy."""
+     strategy = STRATEGIES['random']
+     chosen = strategy(mock_proxies, mock_request, mock_stats)
+     assert chosen in [p['url'] for p in mock_proxies]
+
+
+ def test_least_used_strategy(mock_proxies, mock_request, mock_stats):
+     """Test the least-used strategy."""
+     strategy = STRATEGIES['least_used']
+     chosen = strategy(mock_proxies, mock_request, mock_stats)
+     assert chosen == 'http://p3:8080'  # total=1
+
+
+ def test_domain_rule_strategy(mock_proxies, mock_request, mock_stats):
+     """Test the domain-rule strategy."""
+     from crawlo.proxy.strategies.domain_rule import domain_rule_strategy
+     request = Request("https://taobao.com/item/123")
+     rules = {'taobao.com': 'http://special:8080'}
+
+     # Monkey-patch so a fallback strategy is always available
+     old_strategy = STRATEGIES['least_used']
+     try:
+         STRATEGIES['least_used'] = lambda p, r, s: 'http://fallback:8080'
+         chosen = domain_rule_strategy(mock_proxies, request, mock_stats, rules)
+         assert chosen == 'http://special:8080'
+     finally:
+         STRATEGIES['least_used'] = old_strategy
@@ -1,161 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- from typing import Optional, Dict, Any
- from playwright.async_api import Browser, Page, Response as PlaywrightResponse
- from crawlo import Response, Request
- from crawlo.downloader import DownloaderBase
-
-
- class PlaywrightDownloader(DownloaderBase):
-     def __init__(self, crawler):
-         super().__init__(crawler)
-         # Core Playwright objects
-         self.browser: Optional[Browser] = None  # browser instance
-         self.context: Optional[Any] = None  # browser context (isolates cookies, etc.)
-
-         # Configurable parameters (defaults can be overridden via crawler.settings)
-         self._browser_type: str = "chromium"  # browser type (chromium/firefox/webkit)
-         self._headless: bool = True  # headless mode
-         self._timeout: int = 30000  # operation timeout (ms)
-         self._viewport: Dict[str, int] = {"width": 1280, "height": 720}  # viewport size
-         self._extra_launch_args: Dict[str, Any] = {}  # extra browser launch arguments
-
-     async def _init_browser(self):
-         """Initialize the Playwright browser instance."""
-         from playwright.async_api import async_playwright
-
-         # Start the Playwright engine
-         playwright = await async_playwright().start()
-
-         # Pick the browser type from configuration
-         browser_launcher = {
-             "chromium": playwright.chromium,
-             "firefox": playwright.firefox,
-             "webkit": playwright.webkit
-         }.get(self._browser_type, playwright.chromium)  # default: chromium
-
-         # Launch the browser (with launch arguments)
-         self.browser = await browser_launcher.launch(
-             headless=self._headless,  # headless mode switch
-             timeout=self._timeout,  # launch timeout
-             **self._extra_launch_args  # pass extra arguments through (e.g. proxy config)
-         )
-
-         # Create a browser context (isolated environment)
-         self.context = await self.browser.new_context(
-             viewport=self._viewport,  # window size
-             user_agent=self.crawler.settings.get("USER_AGENT")  # custom UA
-         )
-
-     def open(self):
-         """Load parameters from the crawler configuration."""
-         super().open()  # parent-class initialization
-
-         # Read settings (can be overridden in settings.py)
-         self._browser_type = self.crawler.settings.get("PLAYWRIGHT_BROWSER", "chromium")
-         self._headless = self.crawler.settings.get_bool("HEADLESS", True)
-         self._timeout = self.crawler.settings.get_int("PLAYWRIGHT_TIMEOUT", 30000)
-         self._viewport = self.crawler.settings.get_dict("VIEWPORT", {"width": 1280, "height": 720})
-         self._extra_launch_args = self.crawler.settings.get_dict("PLAYWRIGHT_LAUNCH_ARGS", {})
-
-     async def download(self, request: Request) -> Response:
-         """
-         Core download method:
-         1. Open a new page (tab)
-         2. Load the target URL
-         3. Collect the rendered content
-         """
-         if not self.browser:
-             await self._init_browser()  # lazily start the browser
-
-         page = await self.context.new_page()  # one page per request (automatic isolation)
-
-         try:
-             # Set request headers (browser emulation)
-             if request.headers:
-                 await page.set_extra_http_headers(request.headers)
-
-             # Navigate to the target URL (wait strategy is configurable)
-             response = await page.goto(
-                 request.url,
-                 timeout=self._timeout,
-                 wait_until="domcontentloaded"  # wait strategy: domcontentloaded/networkidle/load
-             )
-
-             # Special handling for POST requests (a Playwright limitation; the fetch API is required)
-             if request.method.lower() == "post":
-                 return await self._handle_post_request(request, page)
-
-             # Execute custom JavaScript (for extracting dynamic data)
-             if request.meta.get("execute_js"):
-                 result = await page.evaluate(request.meta["execute_js"])
-                 request.meta["js_result"] = result  # store the JS result
-
-             # Get the fully rendered HTML (including dynamically generated content)
-             body = await page.content()
-
-             # Take a screenshot in debug mode (for troubleshooting page issues)
-             if self.crawler.settings.get_bool("DEBUG"):
-                 screenshot = await page.screenshot(type="png")
-                 request.meta["screenshot"] = screenshot  # store the screenshot in request.meta
-
-             # Build the unified response object
-             return self._structure_response(request, response, body)
-
-         except Exception as e:
-             self.logger.error(f"Page download failed: {str(e)}")
-             raise
-         finally:
-             await page.close()  # always close the page to avoid resource leaks
-
-     async def _handle_post_request(self, request: Request, page: Page) -> Response:
-         """
-         Special handling for POST requests:
-         send the POST via the in-page fetch API and listen for the response.
-         """
-         async with page.expect_response(request.url) as response_info:
-             # Run fetch inside the page context
-             await page.evaluate(
-                 """async ({url, headers, body}) => {
-                     await fetch(url, {
-                         method: 'POST',
-                         headers: headers,
-                         body: body
-                     });
-                 }""",
-                 {
-                     "url": request.url,
-                     "headers": request.headers or {},
-                     "body": request.body or ""
-                 }
-             )
-
-         response = await response_info.value  # the captured API response
-         body = await response.text()  # read the response body
-         return self._structure_response(request, response, body)
-
-     @staticmethod
-     def _structure_response(
-             request: Request,
-             response: PlaywrightResponse,
-             body: str
-     ) -> Response:
-         """
-         Normalize the response:
-         convert Playwright's response into crawlo's unified Response object.
-         """
-         return Response(
-             url=str(response.url),  # final URL (after redirects)
-             headers=response.headers,  # response headers
-             status_code=response.status,  # HTTP status code
-             body=body.encode('utf-8'),  # response body (as bytes)
-             request=request  # the associated request object
-         )
-
-     async def close(self) -> None:
-         """Clean up resources: close the browser context and browser instance."""
-         if self.context:
-             await self.context.close()
-         if self.browser:
-             await self.browser.close()
-         await super().close()  # parent-class cleanup
@@ -1,22 +0,0 @@
- # -*- coding: utf-8 -*-
- """
- Created on {DATE}
- ---------
- @summary:
- ---------
- @author: {USER}
- """
-
- from crawlo import Item
-
-
- class ${item_name}Item(Item):
-     """
-     This class was generated by feapder
-     command: feapder create -i ${command}
-     """
-
-     __table_name__ = "${table_name}"
-
-     def __init__(self, *args, **kwargs):
-         ${propertys}
@@ -1,33 +0,0 @@
- # -*- coding: utf-8 -*-
- """
- Created on {DATE}
- ---------
- @summary: spider entry point
- ---------
- @author: {USER}
- """
-
- from crawlo import ArgumentParser
-
- from spiders import *
-
-
-
- def crawl_xxx():
-     """
-     Spider crawler
-     """
-     spider = xxx.XXXSpider(redis_key="xxx:xxx")
-     spider.start()
-
-
-
- if __name__ == "__main__":
-     parser = ArgumentParser(description="xxx spider")
-
-     parser.add_argument(
-         "--crawl_xxx", action="store_true", help="xxx spider", function=crawl_xxx
-     )
-     parser.start()
-
- # main.py is the unified entry point for starting spiders; it provides a command-line way to launch multiple spiders
@@ -1,190 +0,0 @@
- # -*- coding: utf-8 -*-
- """Spider configuration file."""
- # import os
- # import sys
- #
- # # MYSQL
- # MYSQL_IP = "localhost"
- # MYSQL_PORT = 3306
- # MYSQL_DB = ""
- # MYSQL_USER_NAME = ""
- # MYSQL_USER_PASS = ""
- #
- # # MONGODB
- # MONGO_IP = "localhost"
- # MONGO_PORT = 27017
- # MONGO_DB = ""
- # MONGO_USER_NAME = ""
- # MONGO_USER_PASS = ""
- # MONGO_URL = "
- #
- # # REDIS
- # # ip:port; multiple entries may be given as a list or comma-separated, e.g. ip1:port1,ip2:port2 or ["ip1:port1", "ip2:port2"]
- # REDISDB_IP_PORTS = "localhost:6379"
- # REDISDB_USER_PASS = ""
- # REDISDB_DB = 0
- # # Extra parameters passed when connecting to redis, e.g. ssl=True
- # REDISDB_KWARGS = dict()
- # # For redis sentinel mode
- # REDISDB_SERVICE_NAME = ""
- #
- # # Pipelines for persisting data; customizable, MysqlPipeline by default
- # ITEM_PIPELINES = [
- #     "feapder.pipelines.mysql_pipeline.MysqlPipeline",
- #     # "feapder.pipelines.mongo_pipeline.MongoPipeline",
- #     # "feapder.pipelines.console_pipeline.ConsolePipeline",
- # ]
- # EXPORT_DATA_MAX_FAILED_TIMES = 10  # maximum number of export failures (saves and updates); an alert is sent once exceeded
- # EXPORT_DATA_MAX_RETRY_TIMES = 10  # maximum number of export retries (saves and updates); retrying stops once exceeded
- #
- # # Spider settings
- # # COLLECTOR
- # COLLECTOR_TASK_COUNT = 32  # number of tasks fetched at a time; 32 recommended for speed
- #
- # # SPIDER
- # SPIDER_THREAD_COUNT = 1  # spider concurrency; 32 recommended for speed
- # # Download interval in seconds. Randomization is supported: SPIDER_SLEEP_TIME = [2, 5] sleeps a random 2-5 seconds, inclusive
- # SPIDER_SLEEP_TIME = 0
- # SPIDER_MAX_RETRY_TIMES = 10  # maximum retries per request
- # KEEP_ALIVE = False  # whether the spider stays resident
-
- # Download
- # DOWNLOADER = "feapder.network.downloader.RequestsDownloader"  # request downloader
- # SESSION_DOWNLOADER = "feapder.network.downloader.RequestsSessionDownloader"
- # RENDER_DOWNLOADER = "feapder.network.downloader.SeleniumDownloader"  # rendering downloader
- # # RENDER_DOWNLOADER="feapder.network.downloader.PlaywrightDownloader"
- # MAKE_ABSOLUTE_LINKS = True  # automatically convert links to absolute URLs
-
- # # Browser rendering
- # WEBDRIVER = dict(
- #     pool_size=1,  # number of browsers
- #     load_images=True,  # whether to load images
- #     user_agent=None,  # a string, or a zero-argument function returning the user agent
- #     proxy=None,  # xxx.xxx.xxx.xxx:xxxx, or a zero-argument function returning the proxy address
- #     headless=False,  # whether to run headless
- #     driver_type="CHROME",  # CHROME, EDGE, PHANTOMJS, FIREFOX
- #     timeout=30,  # request timeout
- #     window_size=(1024, 800),  # window size
- #     executable_path=None,  # browser path; defaults to the standard location
- #     render_time=0,  # render time, i.e. how long to wait after opening the page before grabbing the source
- #     custom_argument=[
- #         "--ignore-certificate-errors",
- #         "--disable-blink-features=AutomationControlled",
- #     ],  # custom browser rendering arguments
- #     xhr_url_regexes=None,  # XHR endpoints to intercept; regexes supported; list type
- #     auto_install_driver=True,  # automatically download the browser driver; chrome and firefox supported
- #     download_path=None,  # directory for downloaded files
- #     use_stealth_js=False,  # use stealth.min.js to hide browser fingerprints
- # )
- #
- # PLAYWRIGHT = dict(
- #     user_agent=None,  # a string, or a zero-argument function returning the user agent
- #     proxy=None,  # xxx.xxx.xxx.xxx:xxxx, or a zero-argument function returning the proxy address
- #     headless=False,  # whether to run headless
- #     driver_type="chromium",  # chromium, firefox, webkit
- #     timeout=30,  # request timeout
- #     window_size=(1024, 800),  # window size
- #     executable_path=None,  # browser path; defaults to the standard location
- #     download_path=None,  # directory for downloaded files
- #     render_time=0,  # render time, i.e. how long to wait after opening the page before grabbing the source
- #     wait_until="networkidle",  # event that marks the page as loaded; one of "commit", "domcontentloaded", "load", "networkidle"
- #     use_stealth_js=False,  # use stealth.min.js to hide browser fingerprints
- #     page_on_event_callback=None,  # callbacks for page.on() events, e.g. page_on_event_callback={"dialog": lambda dialog: dialog.accept()}
- #     storage_state_path=None,  # path for saving browser state
- #     url_regexes=None,  # endpoints to intercept; regexes supported; list type
- #     save_all=False,  # whether to save all intercepted responses; used with url_regexes; if False only the last one is kept
- # )
- #
- # # On spider start, re-crawl failed requests
- # RETRY_FAILED_REQUESTS = False
- # # On spider start, retry items that failed to be stored
- # RETRY_FAILED_ITEMS = False
- # # Save failed requests
- # SAVE_FAILED_REQUEST = True
- # # Request loss protection (if a request is not finished within REQUEST_LOST_TIMEOUT, it is re-dispatched and redone)
- # REQUEST_LOST_TIMEOUT = 600  # 10 minutes
- # # Network request timeout
- # REQUEST_TIMEOUT = 22  # how long to wait for the server to respond; a float, or a (connect timeout, read timeout) tuple
- # # Maximum number of items cached in the in-memory queue
- # ITEM_MAX_CACHED_COUNT = 5000
- # # Maximum number of items stored per batch
- # ITEM_UPLOAD_BATCH_MAX_SIZE = 1000
- # # Interval between item storage batches
- # ITEM_UPLOAD_INTERVAL = 1
- # # Maximum number of tasks cached in the in-memory task queue; unlimited by default; only effective for AirSpider
- # TASK_MAX_CACHED_SIZE = 0
- #
- # # Download cache backed by redis; because of memory limits it is recommended only for development and debugging, so each debug run does not need network requests
- # RESPONSE_CACHED_ENABLE = False  # whether to enable the download cache; recommended (True) for expensive data or data whose requirements change often
- # RESPONSE_CACHED_EXPIRE_TIME = 3600  # cache lifetime in seconds
- # RESPONSE_CACHED_USED = False  # whether to use the cache; can be set to True when backfilling data
- #
- # # Proxy settings
- # PROXY_EXTRACT_API = None  # proxy extraction API; returned proxies are separated by \r\n
- # PROXY_ENABLE = True
- # PROXY_MAX_FAILED_TIMES = 5  # maximum proxy failures; beyond this the proxy is no longer used and is removed automatically
- # PROXY_POOL = "feapder.network.proxy_pool.ProxyPool"  # proxy pool
- #
- # # Random headers
- # RANDOM_HEADERS = True
- # # UserAgent type; supports 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari', 'mobile'; a random type is used if unspecified
- # USER_AGENT_TYPE = "chrome"
- # # Default user agent
- # DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
- # # Use a requests session
- # USE_SESSION = False
- #
- # # Deduplication
- # ITEM_FILTER_ENABLE = False  # item deduplication
- # REQUEST_FILTER_ENABLE = False  # request deduplication
- # ITEM_FILTER_SETTING = dict(
- #     filter_type=1  # permanent dedup (BloomFilter) = 1, in-memory dedup (MemoryFilter) = 2, expiring dedup (ExpireFilter) = 3, lightweight dedup (LiteFilter) = 4
- # )
- # REQUEST_FILTER_SETTING = dict(
- #     filter_type=3,  # permanent dedup (BloomFilter) = 1, in-memory dedup (MemoryFilter) = 2, expiring dedup (ExpireFilter) = 3, lightweight dedup (LiteFilter) = 4
- #     expire_time=2592000,  # expires after one month
- # )
- #
- # # Alerting; DingTalk, Feishu, WeCom, and email are supported
- # # DingTalk alerts
- # DINGDING_WARNING_URL = ""  # DingTalk bot API
- # DINGDING_WARNING_PHONE = ""  # phone numbers of group members to @; a list of several numbers is supported
- # DINGDING_WARNING_USER_ID = ""  # userIds of group members to @; a list of several ids is supported
- # DINGDING_WARNING_ALL = False  # whether to notify everyone; defaults to False
- # DINGDING_WARNING_SECRET = None  # signing secret
- # # Feishu alerts
- # # https://open.feishu.cn/document/ukTMukTMukTM/ucTM5YjL3ETO24yNxkjN#e1cdee9f
- # FEISHU_WARNING_URL = ""  # Feishu bot API
- # FEISHU_WARNING_USER = None  # alert recipient: {"open_id":"ou_xxxxx", "name":"xxxx"} or [{"open_id":"ou_xxxxx", "name":"xxxx"}]
- # FEISHU_WARNING_ALL = False  # whether to notify everyone; defaults to False
- # # Email alerts
- # EMAIL_SENDER = ""  # sender
- # EMAIL_PASSWORD = ""  # authorization code
- # EMAIL_RECEIVER = ""  # recipients; a list of several addresses is supported
- # EMAIL_SMTPSERVER = "smtp.163.com"  # mail server; defaults to 163 mail
- # # WeCom (Enterprise WeChat) alerts
- # WECHAT_WARNING_URL = ""  # WeCom bot API
- # WECHAT_WARNING_PHONE = ""  # people to alert; they will be @-mentioned in the group; a list of several people is supported
- # WECHAT_WARNING_ALL = False  # whether to notify everyone; defaults to False
- # # Intervals
- # WARNING_INTERVAL = 3600  # minimum interval between identical alerts, to avoid flooding; 0 disables deduplication
- # WARNING_LEVEL = "DEBUG"  # alert level: DEBUG / INFO / ERROR
- # WARNING_FAILED_COUNT = 1000  # alert once the number of failed tasks exceeds WARNING_FAILED_COUNT
- #
- # LOG_NAME = os.path.basename(os.getcwd())
- # LOG_PATH = "log/%s.log" % LOG_NAME  # log file path
- # LOG_LEVEL = "DEBUG"
- # LOG_COLOR = True  # whether to colorize log output
- # LOG_IS_WRITE_TO_CONSOLE = True  # whether to print to the console
- # LOG_IS_WRITE_TO_FILE = False  # whether to write to a file
- # LOG_MODE = "w"  # file write mode
- # LOG_MAX_BYTES = 10 * 1024 * 1024  # maximum size of each log file in bytes
- # LOG_BACKUP_COUNT = 20  # number of log files to keep
- # LOG_ENCODING = "utf8"  # log file encoding
- # OTHERS_LOG_LEVAL = "ERROR"  # log level for third-party libraries
- #
- # # Switch the working directory to the project directory
- # project_path = os.path.abspath(os.path.dirname(__file__))
- # os.chdir(project_path)  # change the working directory
- # sys.path.insert(0, project_path)
- # print("Current working directory: " + os.getcwd())
@@ -1,31 +0,0 @@
- # -*- coding: utf-8 -*-
- """
- Created on {DATE}
- ---------
- @summary:
- ---------
- @author: {USER}
- """
-
- import crawlo
-
-
- class ${spider_name}(crawlo.Spider):
-     # Custom database settings; if the project has a setting.py file, this override can be removed
-     __custom_setting__ = dict(
-         REDISDB_IP_PORTS="localhost:6379", REDISDB_USER_PASS="", REDISDB_DB=0
-     )
-
-     def start_requests(self):
-         yield feapder.Request("https://spidertools.cn")
-
-     def parse(self, request, response):
-         # Extract the page title
-         print(response.xpath("//title/text()").extract_first())
-         # Extract the page description
-         print(response.xpath("//meta[@name='description']/@content").extract_first())
-         print("Site URL: ", response.url)
-
-
- if __name__ == "__main__":
-     ${spider_name}(redis_key="xxx:xxx").start()
@@ -1,79 +0,0 @@
- crawlo/__init__.py,sha256=XOWXajnhT2HVql5cycwGkQ0MS85bpQnFdM7tl0Fusik,327
- crawlo/__version__.py,sha256=acuR_XSJzp4OrQ5T8-Ac5gYe48mUwObuwjRmisFmZ7k,22
- crawlo/crawler.py,sha256=rqKjMLDU6qlm2D2gIhkezF5jFOCz0TgYyq-nS7MEFMU,9237
- crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
- crawlo/exceptions.py,sha256=7dtEJBxb9yvmMJe6MQyDB0LuV9que1J_jQN4QYeyO4g,916
- crawlo/stats_collector.py,sha256=NkO09CB-220qz5rxFcD_dedGfr2VPFrDo4hya0Zh8Qc,1577
- crawlo/subscriber.py,sha256=3d4eYtkSgPj-18-mTZM6RQLSil-ux13FUcmfFxr3sMk,3730
- crawlo/task_manager.py,sha256=AS7Xu_8Q_eb3jg9QSkK_wv6W1rRXaI6WjDp8p6h9ltU,721
- crawlo/core/__init__.py,sha256=PnFyJdVNHBoPmV1sW0AHQXijeoSTQ8cMYrbNM1JK8kA,41
- crawlo/core/engine.py,sha256=OcGsY2ikDNXK4j9VqB0bUHs2v1TyWvSJu2mD1W9CbGc,5872
- crawlo/core/processor.py,sha256=qmCqAeqhwYu-UE86evYesaGt9qpuSIfH-ZIZKcXFCZc,1140
- crawlo/core/scheduler.py,sha256=3pnm5L241aEHnJJArnbCnirooo4wWFgAmnP1tMB049k,1891
- crawlo/downloader/__init__.py,sha256=ukrDBULCaoDWoMLCO3XcQhDoasF0oUzj0PHnJ_ACJaE,2306
- crawlo/downloader/aiohttp_downloader.py,sha256=EVCfbaCUJlTa1ZG32NhxKyi8FVFh-NoK0u57ct1MWos,9844
- crawlo/downloader/httpx_downloader.py,sha256=UQ7u3U_Iy8u1i2q0fDBakfu3C6EBN-T5Q0US6u-Um84,7002
- crawlo/downloader/playwright_downloader.py,sha256=q-Yy-hAaS7attXeAr7HAWAuFO2u1EOp_NdHnUPoDRRA,6566
- crawlo/extension/__init__.py,sha256=LPy9XyCu089k6L6oVENIi_imr75AEuY8QTtSJjRioiw,1139
- crawlo/extension/log_interval.py,sha256=S-hSoiz9GdmgHrac4vDQ52fleoBcH-kzdPUD8YRAons,1922
- crawlo/extension/log_stats.py,sha256=WeSnOoSKB8pI_xmcGdh906XnF1xwo6fgJnf_prElwwI,1742
- crawlo/filters/__init__.py,sha256=BCZl86BHiTfDGRe_b1TlNSr6pfNbMKTu0Uq0j4gX_1Q,977
- crawlo/filters/aioredis_filter.py,sha256=MJT74BeVZTjdExKEzdrWKc7WPXFss1k-txc7E54H77E,5522
- crawlo/filters/memory_filter.py,sha256=bs2WUe7CdHiXgr344vzDqMfBv1b3RwXJMnwxpDb64Pw,6639
- crawlo/items/__init__.py,sha256=JUw4wZX50DidJuCMLkP41ik_wTKum2b8iDxm7EbRRds,2063
- crawlo/items/items.py,sha256=00TdAYChF5Rbbgm6a6d-GCxkx4gXP-rA-_Q7u33BuFI,3990
- crawlo/middleware/__init__.py,sha256=ldaGFNbiJnK9Fx12Vdf9fDNfzXxoETtShp5r-vodtw0,549
- crawlo/middleware/default_header.py,sha256=i_Uj07JObyeZFxL7ZAZmvZsHvA1HGtkNab1sA0d-nWI,1067
- crawlo/middleware/download_delay.py,sha256=2M-TchDA7MwyTfYy0Hzh_bW9wlHlpiP-oQlys7crTj0,966
- crawlo/middleware/middleware_manager.py,sha256=kMqnSf4kltDkRf8PV0Xs9Ple9z-oKKQrMS0Q0_-4vNQ,6489
- crawlo/middleware/request_ignore.py,sha256=QI2z4fUnJ-4xvPTZAmsL-GqR4RFHS1xq9iDr5KFrMco,997
- crawlo/middleware/response_code.py,sha256=tmef2QVl3JCiTMii6VQkASlOY2OyqmOPoOfNxIK1eF8,659
- crawlo/middleware/response_filter.py,sha256=ep8ZxDlfIefi9YqK8dPASEp5TTDRo9QEY_jMceC411s,837
- crawlo/middleware/retry.py,sha256=BV-rYm3WVp8Hcrxc0JUGYfYAwvOWfXTWtatp3S5K9oU,3375
- crawlo/network/__init__.py,sha256=VaD0GmsgDYJ8UMgrtjeOc1Wc7lDGee1uAF3neRpyug0,123
- crawlo/network/request.py,sha256=eyju3BddPU8eNoueY48eqju8I96r9RasmNXXSaUU8dg,7086
- crawlo/network/response.py,sha256=6TO8hvkcgtVVPjQhXI3ywx5E_lV0eGrTBceEi60t55c,6034
- crawlo/pipelines/__init__.py,sha256=Hk-M6X0VCGLp6OEdgnhXGhGhKS5TjKf6dkg8bU9pvUE,260
- crawlo/pipelines/console_pipeline.py,sha256=KABkR3J-rqO0Awox7lizxKR2XuHfVhWPiVRgIybwwu4,1248
- crawlo/pipelines/mongo_pipeline.py,sha256=lv-Zn_mWdE_jVy7Nh30Lzqm3YhtLRV5rMy-m4rBWYe0,4442
- crawlo/pipelines/mysql_batch_pipline.py,sha256=g111iuPTRyKr0q4PHTJYIfsYAFf8CCuyYY6DDLSpMO0,4889
- crawlo/pipelines/mysql_pipeline.py,sha256=ZlRWwZLewG9SBLBZ1wWNZ8yAj5xWWitb7BKRSrqEWtI,7857
- crawlo/pipelines/pipeline_manager.py,sha256=JIoX5D-oDfUT7VJrb5m355wi43SChb4nNb09z_0F4_g,2118
- crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
- crawlo/settings/default_settings.py,sha256=zNMVMo_9s1DGr1TiPzwZjSmxuD4qj_JT_oCCmkoMfjs,2579
- crawlo/settings/setting_manager.py,sha256=SxKB1aCWh4OySM_bH9cYng9I3PAmrSP-Q8XOZEWEwbI,2899
- crawlo/spider/__init__.py,sha256=pP_TChnozpHeuS87Bs-Sj31hb0R7glYN3K6BsRw4FOA,905
- crawlo/templates/item_template.tmpl,sha256=bo0cjaFOT1jMrtLjXs6z7Mhwev-s3037suD4BL2_ji4,351
- crawlo/templates/spider_template.tmpl,sha256=dDMOa_17uWKB3FopYrDYLMYhHGaYevm1hI9AVeY6QAg,805
- crawlo/templates/project_template/main.py,sha256=RbGWsdtpOTPMD-jL80sDqu8C-TgI9qrrwJZ8NeK0PZM,594
- crawlo/templates/project_template/setting.py,sha256=NjP9KuhL3pBtRQfC4djBFq4CvBR4H1_OSVDfWMZITh0,9206
- crawlo/templates/project_template/items/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- crawlo/templates/project_template/spiders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- crawlo/utils/__init__.py,sha256=BDORpyjMN7VGPKImnCDKSkprS-petgD7ezc9rMlBvb0,123
- crawlo/utils/concurrency_manager.py,sha256=nBrHlrKqGENINDA6zrbpK0jCbcjWqobI10vYy9Sg3wU,5106
- crawlo/utils/date_tools.py,sha256=9OVJB66_0BvRq-lwUE4JYcd6J5RGADW3lcKEWS6lCi0,5319
- crawlo/utils/func_tools.py,sha256=WUZEGpWMuDDX7g-QySM7iaiC74erW2SSkZoUvDw1NjM,2369
- crawlo/utils/log.py,sha256=vNHCIw9owCZ4voNM_hni7fOEyt9kKoOhIrjvl76lKQg,989
- crawlo/utils/pqueue.py,sha256=4Ymkm38fRFqEcSJeD_ULkuBaCk6QdYvJdnYvtJjh-Tk,5386
- crawlo/utils/project.py,sha256=qRErB6Ru81-PpSnT9g2ZPyfbWCwZ8hygpMAWhwIhC_M,1485
- crawlo/utils/request.py,sha256=ftnyr6f--StdcYju5FaU_khRARaxMMktJS8wROA1Fe0,4119
- crawlo/utils/system.py,sha256=HvWV1acxou0Rn0L7pNq4CnV_GWFeU0Tmjy3_nLD8M64,237
- crawlo/utils/tools.py,sha256=isRvzTMU3n1FWMhzTAt-7TVrHmH8JPUwgWyVel71Wj0,9462
- crawlo/utils/url.py,sha256=rlgX2VlJv6JvLmCDTsbxzMSXE6R5ZL_0dLEqprsA-JU,1482
- tests/__init__.py,sha256=6i631BPnS_TR_BWUjtjB5CBO-zv9kRkwQTQvSya2wHE,123
- tests/baidu_spider/__init__.py,sha256=xlj0-TBQBhcKglllla_bQbufNiv10UFE0KsWMLvzFz4,123
- tests/baidu_spider/demo.py,sha256=MTEHkm7U4Kyx5QULCgR6to391xn4XPay6fmuV1c1uRc,24278
- tests/baidu_spider/items.py,sha256=vkbdnCw4tjYLmCL4oDIUxDNCgpYNqZHEG-6lVN3qfvI,443
- tests/baidu_spider/middleware.py,sha256=I71ZMmWTiDBFq4t2zfTE7IIXCqwaaeQ1DvKGW70q2Yg,1397
- tests/baidu_spider/pipeline.py,sha256=TUK_LnrU818UYmCn2_gKeNaTZjaj9qjrlndRLsR4wf0,1437
- tests/baidu_spider/request_fingerprints.txt,sha256=TJAuFJZZ_uvYExfruA9bEsIiArz86vxe95QoF2lbnfE,585
- tests/baidu_spider/run.py,sha256=YVe9qwn-2XBRRoZdUnwPRrWlBO5YAmKnyLRI3RpfogE,646
- tests/baidu_spider/settings.py,sha256=EenFOFgupwnn7HIySKSHBgP9--qxxkiWgIi2NDltXRw,2811
- tests/baidu_spider/spiders/__init__.py,sha256=eJ_ih4GiGfwQzPILeouy1Hnc4BrPz0KNPYlLHYvrvoc,123
- tests/baidu_spider/spiders/bai_du.py,sha256=pw4WccbmBR07CuSqCgm_7x9SH63FDJS_sXSaN5Ew5Tw,1589
- tests/baidu_spider/spiders/sina.py,sha256=BKQGJiCS8aiZ2f27C99WcK90QQJwgUY-vS4fUaQSdIQ,2456
- crawlo-1.0.4.dist-info/METADATA,sha256=dzEuRJVuBVSeKTQeEvOXRhfRcyjhcZqJFlPWivAZ9UE,1743
- crawlo-1.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- crawlo-1.0.4.dist-info/entry_points.txt,sha256=GD9PBhKQN83EaxPYtz7NhcGeZeh3bdr2jWbTixOs-lw,59
- crawlo-1.0.4.dist-info/top_level.txt,sha256=bKtfejkszFTNHm7Z6aqtt0AUG8DdeNeL4AoZsg4XdZY,13
- crawlo-1.0.4.dist-info/RECORD,,
@@ -1,2 +0,0 @@
- [console_scripts]
- crawlo = crawlo.commands.cmdline:execute
@@ -1,7 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time : 2025-02-05 13:05
- # @Author : oscar
- # @Desc : None
- """