crawlo-1.4.3-py3-none-any.whl → crawlo-1.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.
Files changed (107)
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/genspider.py +52 -17
  4. crawlo/commands/startproject.py +24 -0
  5. crawlo/core/engine.py +2 -2
  6. crawlo/core/scheduler.py +4 -4
  7. crawlo/crawler.py +13 -6
  8. crawlo/downloader/__init__.py +5 -2
  9. crawlo/extension/__init__.py +2 -2
  10. crawlo/filters/aioredis_filter.py +8 -1
  11. crawlo/filters/memory_filter.py +8 -1
  12. crawlo/initialization/built_in.py +13 -4
  13. crawlo/initialization/core.py +5 -4
  14. crawlo/interfaces.py +24 -0
  15. crawlo/middleware/__init__.py +7 -4
  16. crawlo/middleware/middleware_manager.py +15 -8
  17. crawlo/mode_manager.py +45 -11
  18. crawlo/network/response.py +374 -69
  19. crawlo/pipelines/mysql_pipeline.py +6 -6
  20. crawlo/pipelines/pipeline_manager.py +2 -2
  21. crawlo/project.py +2 -4
  22. crawlo/queue/pqueue.py +2 -6
  23. crawlo/queue/queue_manager.py +1 -2
  24. crawlo/settings/default_settings.py +15 -30
  25. crawlo/task_manager.py +2 -2
  26. crawlo/templates/project/items.py.tmpl +2 -2
  27. crawlo/templates/project/middlewares.py.tmpl +9 -89
  28. crawlo/templates/project/pipelines.py.tmpl +8 -68
  29. crawlo/templates/project/settings.py.tmpl +51 -65
  30. crawlo/templates/project/settings_distributed.py.tmpl +59 -67
  31. crawlo/templates/project/settings_gentle.py.tmpl +45 -40
  32. crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
  33. crawlo/templates/project/settings_minimal.py.tmpl +37 -26
  34. crawlo/templates/project/settings_simple.py.tmpl +45 -40
  35. crawlo/templates/run.py.tmpl +3 -7
  36. crawlo/tools/__init__.py +0 -11
  37. crawlo/utils/__init__.py +17 -1
  38. crawlo/utils/db_helper.py +220 -319
  39. crawlo/utils/error_handler.py +313 -67
  40. crawlo/utils/fingerprint.py +3 -4
  41. crawlo/utils/misc.py +82 -0
  42. crawlo/utils/request.py +55 -66
  43. crawlo/utils/selector_helper.py +138 -0
  44. crawlo/utils/spider_loader.py +185 -45
  45. crawlo/utils/text_helper.py +95 -0
  46. crawlo-1.4.5.dist-info/METADATA +329 -0
  47. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/RECORD +89 -68
  48. tests/bug_check_test.py +251 -0
  49. tests/direct_selector_helper_test.py +97 -0
  50. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  51. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  52. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  53. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  54. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  55. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  56. tests/ofweek_scrapy/scrapy.cfg +11 -0
  57. tests/performance_comparison.py +4 -5
  58. tests/simple_crawlo_test.py +1 -2
  59. tests/simple_follow_test.py +39 -0
  60. tests/simple_response_selector_test.py +95 -0
  61. tests/simple_selector_helper_test.py +155 -0
  62. tests/simple_selector_test.py +208 -0
  63. tests/simple_url_test.py +74 -0
  64. tests/test_crawler_process_import.py +39 -0
  65. tests/test_crawler_process_spider_modules.py +48 -0
  66. tests/test_edge_cases.py +7 -5
  67. tests/test_encoding_core.py +57 -0
  68. tests/test_encoding_detection.py +127 -0
  69. tests/test_factory_compatibility.py +197 -0
  70. tests/test_multi_directory.py +68 -0
  71. tests/test_multiple_spider_modules.py +81 -0
  72. tests/test_optimized_selector_naming.py +101 -0
  73. tests/test_priority_behavior.py +18 -18
  74. tests/test_response_follow.py +105 -0
  75. tests/test_response_selector_methods.py +93 -0
  76. tests/test_response_url_methods.py +71 -0
  77. tests/test_response_urljoin.py +87 -0
  78. tests/test_scrapy_style_encoding.py +113 -0
  79. tests/test_selector_helper.py +101 -0
  80. tests/test_selector_optimizations.py +147 -0
  81. tests/test_spider_loader.py +50 -0
  82. tests/test_spider_loader_comprehensive.py +70 -0
  83. tests/test_spider_modules.py +85 -0
  84. tests/test_spiders/__init__.py +1 -0
  85. tests/test_spiders/test_spider.py +10 -0
  86. crawlo/tools/anti_crawler.py +0 -269
  87. crawlo/utils/class_loader.py +0 -26
  88. crawlo/utils/enhanced_error_handler.py +0 -357
  89. crawlo-1.4.3.dist-info/METADATA +0 -190
  90. examples/test_project/__init__.py +0 -7
  91. examples/test_project/run.py +0 -35
  92. examples/test_project/test_project/__init__.py +0 -4
  93. examples/test_project/test_project/items.py +0 -18
  94. examples/test_project/test_project/middlewares.py +0 -119
  95. examples/test_project/test_project/pipelines.py +0 -97
  96. examples/test_project/test_project/settings.py +0 -170
  97. examples/test_project/test_project/spiders/__init__.py +0 -10
  98. examples/test_project/test_project/spiders/of_week_dis.py +0 -144
  99. tests/simple_log_test.py +0 -58
  100. tests/simple_test.py +0 -48
  101. tests/test_framework_logger.py +0 -67
  102. tests/test_framework_startup.py +0 -65
  103. tests/test_mode_change.py +0 -73
  104. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
  105. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
  106. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
  107. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
@@ -6,75 +6,79 @@
  Suitable for quick starts and simple projects.
  """

- # ============================== Project basic info ==============================
+ # =================================== Basic configuration ===================================
+
+ # Project basic info
  PROJECT_NAME = '{{project_name}}'

- # ============================== Run mode ==============================
+ # Run mode
  RUN_MODE = 'standalone'

- # ============================== Concurrency settings ==============================
+ # Concurrency settings
  CONCURRENCY = 4
  MAX_RUNNING_SPIDERS = 1
  DOWNLOAD_DELAY = 1.0

- # ============================== Downloader configuration ==============================
- # Available downloaders:
- # DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
- # DOWNLOADER = 'crawlo.downloader.httpx_downloader.HttpXDownloader'
- # DOWNLOADER = 'crawlo.downloader.cffi_downloader.CurlCffiDownloader'
+ # =================================== Core component configuration ===================================
+
+ # Downloader configuration
  DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'

- # ============================== Queue configuration ==============================
+ # Queue configuration
  QUEUE_TYPE = 'memory'

- # ============================== Dedup filter ==============================
+ # Dedup filter
  FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'

- # ============================== Default dedup pipeline ==============================
+ # Default dedup pipeline
  DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'

- # ============================== Spider module configuration ==============================
- SPIDER_MODULES = ['{{project_name}}.spiders']
+ # =================================== Spider configuration ===================================

- # ============================== Middlewares ==============================
- # MIDDLEWARES = [
- #     'crawlo.middleware.simple_proxy.SimpleProxyMiddleware',
- # ]
+ # Spider module configuration
+ SPIDER_MODULES = ['{{project_name}}.spiders']

- # ============================== Default request headers ==============================
+ # Default request headers
  # Default request headers for DefaultHeaderMiddleware
- DEFAULT_REQUEST_HEADERS = {
-     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-     'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
-     'Accept-Encoding': 'gzip, deflate, br',
- }
+ # DEFAULT_REQUEST_HEADERS = {}

- # ============================== Allowed domains ==============================
+ # Allowed domains
  # Allowed domains for OffsiteMiddleware
- # ALLOWED_DOMAINS = ['example.com']
+ # ALLOWED_DOMAINS = []

- # ============================== Data pipelines ==============================
+ # Data pipelines
+ # To add custom pipelines, uncomment and list them here
  # PIPELINES = [
- #     'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage (asyncmy async driver)
+ #     'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage (asyncmy async driver)
+ #     # '{{project_name}}.pipelines.CustomPipeline',  # example user-defined pipeline
  # ]

- # ============================== Extensions ==============================
+ # =================================== System configuration ===================================
+
+ # Extensions
+ # To add custom extensions, uncomment and list them here
  # EXTENSIONS = [
- #     'crawlo.extension.log_interval.LogIntervalExtension',
- #     'crawlo.extension.log_stats.LogStats',
- #     'crawlo.extension.logging_extension.CustomLoggerExtension',
+ #     # '{{project_name}}.extensions.CustomExtension',  # example user-defined extension
+ # ]
+
+ # Middlewares
+ # To add custom middlewares, uncomment and list them here
+ # MIDDLEWARES = [
+ #     # '{{project_name}}.middlewares.CustomMiddleware',  # example user-defined middleware
  # ]

- # ============================== Logging configuration ==============================
+ # Logging configuration
  LOG_LEVEL = 'INFO'
  LOG_FILE = 'logs/{{project_name}}.log'
  LOG_ENCODING = 'utf-8'  # explicitly set the log file encoding
  STATS_DUMP = True

- # ============================== Output configuration ==============================
+ # Output configuration
  OUTPUT_DIR = 'output'

- # ============================== Redis configuration ==============================
+ # =================================== Database configuration ===================================
+
+ # Redis configuration
  REDIS_HOST = '127.0.0.1'
  REDIS_PORT = 6379
  REDIS_PASSWORD = ''
@@ -86,7 +90,7 @@ if REDIS_PASSWORD:
  else:
      REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'

- # ============================== MySQL configuration ==============================
+ # MySQL configuration
  MYSQL_HOST = '127.0.0.1'
  MYSQL_PORT = 3306
  MYSQL_USER = 'root'
@@ -96,7 +100,7 @@ MYSQL_TABLE = '{{project_name}}_data'
  MYSQL_BATCH_SIZE = 100
  MYSQL_USE_BATCH = False  # whether to enable batch inserts

- # ============================== MongoDB configuration ==============================
+ # MongoDB configuration
  MONGO_URI = 'mongodb://localhost:27017'
  MONGO_DATABASE = '{{project_name}}_db'
  MONGO_COLLECTION = '{{project_name}}_items'
@@ -105,7 +109,9 @@ MONGO_MIN_POOL_SIZE = 20
  MONGO_BATCH_SIZE = 100  # batch insert size
  MONGO_USE_BATCH = False  # whether to enable batch inserts

- # ============================== Proxy configuration ==============================
+ # =================================== Network configuration ===================================
+
+ # Proxy configuration
  # Proxying is disabled by default; enable and configure it in the project settings file if needed
  PROXY_ENABLED = False  # whether to enable proxying

@@ -124,7 +130,6 @@ PROXY_EXTRACTOR = "proxy"
  PROXY_REFRESH_INTERVAL = 60  # proxy refresh interval (seconds)
  PROXY_API_TIMEOUT = 10  # timeout for proxy API requests

- # ============================== Curl-Cffi specific configuration ==============================
  # Browser fingerprint emulation (CurlCffi downloader only)
  CURL_BROWSER_TYPE = "chrome"  # options: chrome, edge, safari, firefox, or a version such as chrome136

@@ -136,7 +141,7 @@ CURL_BROWSER_VERSION_MAP = {
      "firefox": "firefox135",
  }

- # ============================== Downloader optimization settings ==============================
+ # Downloader optimization settings
  # Downloader health check
  DOWNLOADER_HEALTH_CHECK = True  # whether to enable downloader health checks
  HEALTH_CHECK_INTERVAL = 60  # health check interval (seconds)
@@ -157,7 +162,7 @@ AIOHTTP_FORCE_CLOSE = False  # whether to force-close connections
  CONNECTION_TTL_DNS_CACHE = 300  # DNS cache TTL (seconds)
  CONNECTION_KEEPALIVE_TIMEOUT = 15  # Keep-Alive timeout (seconds)

- # ============================== Memory monitoring configuration ==============================
+ # Memory monitoring configuration
  # The memory monitor extension is disabled by default; enable it in the project settings file if needed
  MEMORY_MONITOR_ENABLED = False  # whether to enable memory monitoring
  MEMORY_MONITOR_INTERVAL = 60  # memory monitor check interval (seconds)
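
The reorganized template leaves the optional settings commented out rather than pre-filled. A project generated from it would opt back in through its own settings.py; a minimal sketch, reusing only values the 1.4.3 template shipped (the commented pipeline entry is the template's own placeholder):

    # settings.py in a generated project: re-enable pieces the template now leaves commented out
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
    }
    ALLOWED_DOMAINS = ['example.com']  # consumed by OffsiteMiddleware
    PIPELINES = [
        'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage via asyncmy
        # '{{project_name}}.pipelines.CustomPipeline',           # template's user-pipeline placeholder
    ]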
crawlo/templates/run.py.tmpl CHANGED
@@ -6,7 +6,7 @@
  A simplified spider launcher based on the Crawlo framework.

  The framework imports and registers spider modules automatically; no manual imports are needed.
- Just pass the spider_modules parameter and the framework will scan and import all spiders.
+ The framework reads the SPIDER_MODULES setting from settings.py automatically.
  """
  import sys
  import asyncio
@@ -17,12 +17,8 @@ from crawlo.crawler import CrawlerProcess
  def main():
      """Main function: run the spider"""
      try:
-         # Specify the spider module paths; the framework imports and registers all spiders automatically
-         spider_modules = ['{{project_name}}.spiders']
-         process = CrawlerProcess(spider_modules=spider_modules)
-
-         # TODO: run the specified spider
-         asyncio.run(process.crawl('spider_name'))
+         # TODO: replace 'spider_name' with the name of the spider to run
+         asyncio.run(CrawlerProcess().crawl('spider_name'))

      except Exception as e:
          print(f"❌ Run failed: {e}")
crawlo/tools/__init__.py CHANGED
@@ -79,17 +79,6 @@ from .retry_mechanism import (
      exponential_backoff
  )

- # Anti-crawler countermeasure tools
- from .anti_crawler import (
-     ProxyPoolManager,
-     CaptchaHandler,
-     AntiCrawler,
-     get_random_user_agent,
-     rotate_proxy,
-     handle_captcha,
-     detect_rate_limiting
- )
-
  # Authenticated proxy tools
  from .authenticated_proxy import (
      AuthenticatedProxy,
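
Since crawlo/tools/anti_crawler.py is deleted in 1.4.5 and its re-exports are removed from crawlo.tools, code that imported these helpers will break on upgrade. A small defensive sketch for callers; the None fallback is only illustrative:

    # get_random_user_agent / rotate_proxy were re-exported by crawlo.tools in 1.4.3 only
    try:
        from crawlo.tools import get_random_user_agent, rotate_proxy
    except ImportError:  # crawlo >= 1.4.5: the anti_crawler module and its exports are gone
        get_random_user_agent = None
        rotate_proxy = None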
crawlo/utils/__init__.py CHANGED
@@ -4,6 +4,9 @@
  # @Time    : 2025-02-05 13:57
  # @Author  : oscar
  # @Desc    : Collection of utility modules
+
+ Provides helper functions for working with parsel selectors, e.g. extracting text and attributes.
+ All helpers use concise, intuitive names that are easy to remember and use.
  """
  from ..tools.date_tools import (
@@ -20,6 +23,14 @@ from ..tools.date_tools import (
      from_timestamp_with_tz
  )

+ from .selector_helper import (
+     extract_text,
+     extract_texts,
+     extract_attr,
+     extract_attrs,
+     is_xpath
+ )
+
  __all__ = [
      "TimeUtils",
      "parse_time",
@@ -31,5 +42,10 @@ __all__ = [
      "to_timezone",
      "to_utc",
      "to_local",
-     "from_timestamp_with_tz"
+     "from_timestamp_with_tz",
+     "extract_text",
+     "extract_texts",
+     "extract_attr",
+     "extract_attrs",
+     "is_xpath"
  ]
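
The new selector helpers are re-exported from crawlo.utils, but this diff exposes only their names, not their signatures; the call shapes below are assumptions based on the names and on the parsel-oriented docstring added above:

    from parsel import Selector
    from crawlo.utils import extract_text, extract_texts, extract_attr, extract_attrs, is_xpath

    sel = Selector(text='<ul><li><a href="/a">First</a></li><li><a href="/b">Second</a></li></ul>')

    # Assumed usage: helpers take a parsel Selector/SelectorList
    print(is_xpath('//li/a'))                   # presumably True for XPath-style queries
    print(extract_text(sel.css('a')))           # presumably the first matched text, e.g. 'First'
    print(extract_texts(sel.css('a')))          # presumably all matched texts
    print(extract_attr(sel.css('a'), 'href'))   # presumably the first 'href' value
    print(extract_attrs(sel.css('a'), 'href'))  # presumably all 'href' values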