crawlo 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.0.3"
+ __version__ = "1.0.4"
crawlo/filters/aioredis_filter.py CHANGED
@@ -9,7 +9,7 @@ from crawlo.utils.request import request_fingerprint
 
 
  class AioRedisFilter(BaseFilter):
-     """Async request dedup filter implemented with a Redis set (for distributed crawlers)"""
+     """Async request dedup filter based on a Redis set (supports distributed crawlers), with TTL and cleanup control"""
 
      def __init__(
          self,
@@ -18,7 +18,8 @@ class AioRedisFilter(BaseFilter):
          stats: dict,
          debug: bool,
          log_level: str,
-         cleanup_fp: bool = False
+         cleanup_fp: bool = False,
+         ttl: Optional[int] = None  # None means persist forever, > 0 means expiry time in seconds
      ):
          """Initialize the filter"""
          self.logger = get_logger(self.__class__.__name__, log_level)
@@ -27,12 +28,19 @@
          self.redis_key = redis_key
          self.redis = client
          self.cleanup_fp = cleanup_fp
+         self.ttl = ttl
 
      @classmethod
      def create_instance(cls, crawler) -> 'BaseFilter':
          """Create a filter instance from the crawler settings"""
          redis_url = crawler.settings.get('REDIS_URL', 'redis://localhost:6379')
-         decode_responses = crawler.settings.get_bool('DECODE_RESPONSES', False)  # key point: changed to False
+         decode_responses = crawler.settings.get_bool('DECODE_RESPONSES', False)
+         ttl_setting = crawler.settings.get_int('REDIS_TTL')
+ 
+         # Handle the TTL setting
+         ttl = None
+         if ttl_setting is not None:
+             ttl = max(0, int(ttl_setting)) if ttl_setting > 0 else None
 
          try:
              redis_client = aioredis.from_url(
@@ -42,110 +50,93 @@
                  encoding='utf-8'
              )
          except Exception as e:
-             raise RuntimeError(f"Redis connection failed {redis_url}: {str(e)}")
+             raise RuntimeError(f"Redis connection failed: {redis_url} - {str(e)}")
 
          return cls(
              redis_key=f"{crawler.settings.get('PROJECT_NAME', 'default')}:{crawler.settings.get('REDIS_KEY', 'request_fingerprints')}",
              client=redis_client,
              stats=crawler.stats,
              cleanup_fp=crawler.settings.get_bool('CLEANUP_FP', False),
+             ttl=ttl,
              debug=crawler.settings.get_bool('FILTER_DEBUG', False),
              log_level=crawler.settings.get('LOG_LEVEL', 'INFO')
          )
 
      async def requested(self, request: Request) -> bool:
-         """
-         Check whether the request is a duplicate
-         """
+         """Check whether the request has already been seen"""
          try:
-             fp = request_fingerprint(request)
-             self.logger.debug(f"Checking fingerprint: {fp}")
- 
-             # Make sure fp is a string
-             if not isinstance(fp, str):
-                 fp = str(fp)
- 
-             # Check the Redis connection state
-             if not self.redis:
-                 raise RuntimeError("Redis client is not initialized")
+             fp = str(request_fingerprint(request))
 
-             # Check whether the fingerprint already exists
-             is_member = await self.redis.sismember(self.redis_key, fp)
-             self.logger.debug(f"Fingerprint {fp} exists: {is_member}")
+             # 1. Check whether the fingerprint exists
+             pipe = self.redis.pipeline()
+             pipe.sismember(self.redis_key, fp)  # not awaited individually
+             exists = (await pipe.execute())[0]  # execute and fetch the result
 
-             if is_member:
-                 if self.debug:
-                     self.logger.debug(f"Filtered duplicate request: {fp}")
+             if exists:  # already present, return True
                  return True
 
-             # Add the new fingerprint
-             result = await self.redis.sadd(self.redis_key, fp)
+             # 2. If not present, add the fingerprint and set the TTL
+             pipe = self.redis.pipeline()
+             pipe.sadd(self.redis_key, fp)  # not awaited individually
+             if self.ttl and self.ttl > 0:
+                 pipe.expire(self.redis_key, self.ttl)  # not awaited individually
+             await pipe.execute()  # run all commands in one round trip
 
-             if self.debug:
-                 if result == 1:
-                     self.logger.debug(f"Added new fingerprint: {fp}")
-                 else:
-                     self.logger.warning(f"Failed to add fingerprint: {fp}")
- 
-             return False
+             return False  # this is a new request
 
          except Exception as e:
-             self.logger.error(f"Filter check failed for {getattr(request, 'url', 'unknown')}: {str(e)}")
-             # could either raise or return False (do not filter)
+             self.logger.error(f"Request check failed: {getattr(request, 'url', 'unknown URL')}")
              raise
 
      async def add_fingerprint(self, fp: str) -> bool:
-         """Redis set: add a new fingerprint"""
+         """Add a new fingerprint to the Redis set"""
          try:
-             if not isinstance(fp, str):
-                 fp = str(fp)
+             fp = str(fp)
+             added = await self.redis.sadd(self.redis_key, fp)
+ 
+             if self.ttl and self.ttl > 0:
+                 await self.redis.expire(self.redis_key, self.ttl)
 
-             result = await self.redis.sadd(self.redis_key, fp)
-             if self.debug:
-                 self.logger.debug(f"Added fingerprint {fp}, result: {result}")
-             return result == 1
+             return added == 1
          except Exception as e:
-             self.logger.error(f"Failed to add fingerprint {fp}: {str(e)}")
+             self.logger.error("Failed to add fingerprint")
              raise
 
      async def get_stats(self) -> dict:
-         """Get the current filter statistics"""
+         """Get filter statistics"""
          try:
              count = await self.redis.scard(self.redis_key)
-             return {
-                 'total_fingerprints': count,
-                 'redis_key': self.redis_key,
-                 **self.stats
+             stats = {
+                 'total_fingerprints': count,
+                 'redis_key': self.redis_key,
+                 'ttl': f"{self.ttl}s" if self.ttl else "persistent"
              }
+             stats.update(self.stats)
+             return stats
          except Exception as e:
-             self.logger.error(f"Failed to get stats: {str(e)}")
+             self.logger.error("Failed to get statistics")
              return self.stats
 
      async def clear_all(self) -> int:
          """Clear all fingerprint data"""
          try:
              deleted = await self.redis.delete(self.redis_key)
-             self.logger.info(f"Cleared {deleted} keys")
+             self.logger.info(f"Fingerprints cleared: {deleted}")
              return deleted
          except Exception as e:
-             self.logger.error(f"Failed to clear fingerprints: {str(e)}")
+             self.logger.error("Failed to clear fingerprints")
              raise
 
      async def closed(self, reason: Optional[str] = None) -> None:
-         """Handling when the spider closes"""
+         """Cleanup when the spider closes"""
          try:
              if self.cleanup_fp:
                  deleted = await self.redis.delete(self.redis_key)
-                 self.logger.info(
-                     f"Cleaned {deleted} fingerprints from {self.redis_key} "
-                     f"(reason: {reason or 'manual'})"
-                 )
+                 self.logger.info(f"Spider close cleanup: deleted {deleted} fingerprints")
              else:
-                 # Show statistics
                  count = await self.redis.scard(self.redis_key)
-                 self.logger.info(f"Total fingerprints preserved: {count}")
-         except Exception as e:
-             self.logger.warning(f"Close operation failed: {e}")
+                 ttl_info = f"{self.ttl}s" if self.ttl else "persistent"
+                 self.logger.info(f"Fingerprints preserved: {count} (TTL: {ttl_info})")
          finally:
              await self._close_redis()
 
@@ -154,5 +145,6 @@ class AioRedisFilter(BaseFilter):
          try:
              if hasattr(self.redis, 'close'):
                  await self.redis.close()
+                 self.logger.debug("Redis connection closed")
          except Exception as e:
-             self.logger.warning(f"Redis close error: {e}")
+             self.logger.warning(f"Error while closing Redis: {e}")
crawlo/settings/default_settings.py CHANGED
@@ -84,7 +84,8 @@ FILTER_DEBUG = True
  FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
 
  # redis filter
- CLEANUP_FP = False
+ REDIS_TTL = 0
+ CLEANUP_FP = 0
  DECODE_RESPONSES = True
  REDIS_KEY = 'request_fingerprint'
  REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
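
The new REDIS_TTL default of 0 is normalized by create_instance() in the filter above: non-positive values (or a missing setting) mean the fingerprint set persists forever, while a positive value is used as the expiry in seconds. A standalone restatement of that rule (the helper name is illustrative):

    from typing import Optional

    def normalize_ttl(ttl_setting: Optional[int]) -> Optional[int]:
        # Mirrors the diff: None or <= 0 -> persist forever; > 0 -> seconds until the set expires
        if ttl_setting is None:
            return None
        return max(0, int(ttl_setting)) if ttl_setting > 0 else None

    assert normalize_ttl(None) is None      # setting absent
    assert normalize_ttl(0) is None         # default REDIS_TTL = 0 keeps fingerprints forever
    assert normalize_ttl(86400) == 86400    # expire after one day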
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlo
- Version: 1.0.3
+ Version: 1.0.4
  Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
  Home-page: https://github.com/crawl-coder/Crawlo.git
  Author: crawl-coder
@@ -1,5 +1,5 @@
  crawlo/__init__.py,sha256=XOWXajnhT2HVql5cycwGkQ0MS85bpQnFdM7tl0Fusik,327
- crawlo/__version__.py,sha256=2plzdEEb24FLjE2I2XyBBcJEPYWHccNL4SgtLC_6erg,22
+ crawlo/__version__.py,sha256=acuR_XSJzp4OrQ5T8-Ac5gYe48mUwObuwjRmisFmZ7k,22
  crawlo/crawler.py,sha256=rqKjMLDU6qlm2D2gIhkezF5jFOCz0TgYyq-nS7MEFMU,9237
  crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
  crawlo/exceptions.py,sha256=7dtEJBxb9yvmMJe6MQyDB0LuV9que1J_jQN4QYeyO4g,916
@@ -18,9 +18,8 @@ crawlo/extension/__init__.py,sha256=LPy9XyCu089k6L6oVENIi_imr75AEuY8QTtSJjRioiw,
  crawlo/extension/log_interval.py,sha256=S-hSoiz9GdmgHrac4vDQ52fleoBcH-kzdPUD8YRAons,1922
  crawlo/extension/log_stats.py,sha256=WeSnOoSKB8pI_xmcGdh906XnF1xwo6fgJnf_prElwwI,1742
  crawlo/filters/__init__.py,sha256=BCZl86BHiTfDGRe_b1TlNSr6pfNbMKTu0Uq0j4gX_1Q,977
- crawlo/filters/aioredis_filter.py,sha256=2KAfFohKjWU3lRKKaxNDCN4d2lvWbpmHHL5oM7jsmKs,5640
+ crawlo/filters/aioredis_filter.py,sha256=MJT74BeVZTjdExKEzdrWKc7WPXFss1k-txc7E54H77E,5522
  crawlo/filters/memory_filter.py,sha256=bs2WUe7CdHiXgr344vzDqMfBv1b3RwXJMnwxpDb64Pw,6639
- crawlo/filters/redis_filter.py,sha256=W3Wam4Qdd1mZPyue3N9pYkaF72HUcXd38iHOsHHDfEg,4092
  crawlo/items/__init__.py,sha256=JUw4wZX50DidJuCMLkP41ik_wTKum2b8iDxm7EbRRds,2063
  crawlo/items/items.py,sha256=00TdAYChF5Rbbgm6a6d-GCxkx4gXP-rA-_Q7u33BuFI,3990
  crawlo/middleware/__init__.py,sha256=ldaGFNbiJnK9Fx12Vdf9fDNfzXxoETtShp5r-vodtw0,549
@@ -41,7 +40,7 @@ crawlo/pipelines/mysql_batch_pipline.py,sha256=g111iuPTRyKr0q4PHTJYIfsYAFf8CCuyY
  crawlo/pipelines/mysql_pipeline.py,sha256=ZlRWwZLewG9SBLBZ1wWNZ8yAj5xWWitb7BKRSrqEWtI,7857
  crawlo/pipelines/pipeline_manager.py,sha256=JIoX5D-oDfUT7VJrb5m355wi43SChb4nNb09z_0F4_g,2118
  crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
- crawlo/settings/default_settings.py,sha256=5_3TK65iiorS9KbpmQGy79qi0DUTwQTN9FLUSjk4eEo,2569
+ crawlo/settings/default_settings.py,sha256=zNMVMo_9s1DGr1TiPzwZjSmxuD4qj_JT_oCCmkoMfjs,2579
  crawlo/settings/setting_manager.py,sha256=SxKB1aCWh4OySM_bH9cYng9I3PAmrSP-Q8XOZEWEwbI,2899
  crawlo/spider/__init__.py,sha256=pP_TChnozpHeuS87Bs-Sj31hb0R7glYN3K6BsRw4FOA,905
  crawlo/templates/item_template.tmpl,sha256=bo0cjaFOT1jMrtLjXs6z7Mhwev-s3037suD4BL2_ji4,351
@@ -69,12 +68,12 @@ tests/baidu_spider/middleware.py,sha256=I71ZMmWTiDBFq4t2zfTE7IIXCqwaaeQ1DvKGW70q
  tests/baidu_spider/pipeline.py,sha256=TUK_LnrU818UYmCn2_gKeNaTZjaj9qjrlndRLsR4wf0,1437
  tests/baidu_spider/request_fingerprints.txt,sha256=TJAuFJZZ_uvYExfruA9bEsIiArz86vxe95QoF2lbnfE,585
  tests/baidu_spider/run.py,sha256=YVe9qwn-2XBRRoZdUnwPRrWlBO5YAmKnyLRI3RpfogE,646
- tests/baidu_spider/settings.py,sha256=z8rtEwVZ0b2f_EZLZdaugAEMJBIp2nBV8xjSFtxe3vY,2737
+ tests/baidu_spider/settings.py,sha256=EenFOFgupwnn7HIySKSHBgP9--qxxkiWgIi2NDltXRw,2811
  tests/baidu_spider/spiders/__init__.py,sha256=eJ_ih4GiGfwQzPILeouy1Hnc4BrPz0KNPYlLHYvrvoc,123
  tests/baidu_spider/spiders/bai_du.py,sha256=pw4WccbmBR07CuSqCgm_7x9SH63FDJS_sXSaN5Ew5Tw,1589
  tests/baidu_spider/spiders/sina.py,sha256=BKQGJiCS8aiZ2f27C99WcK90QQJwgUY-vS4fUaQSdIQ,2456
- crawlo-1.0.3.dist-info/METADATA,sha256=HjBB_eeDrEFN6TTm3lKNl_3xuMB1DLhMjI222q3cUV4,1743
- crawlo-1.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- crawlo-1.0.3.dist-info/entry_points.txt,sha256=GD9PBhKQN83EaxPYtz7NhcGeZeh3bdr2jWbTixOs-lw,59
- crawlo-1.0.3.dist-info/top_level.txt,sha256=bKtfejkszFTNHm7Z6aqtt0AUG8DdeNeL4AoZsg4XdZY,13
- crawlo-1.0.3.dist-info/RECORD,,
+ crawlo-1.0.4.dist-info/METADATA,sha256=dzEuRJVuBVSeKTQeEvOXRhfRcyjhcZqJFlPWivAZ9UE,1743
+ crawlo-1.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ crawlo-1.0.4.dist-info/entry_points.txt,sha256=GD9PBhKQN83EaxPYtz7NhcGeZeh3bdr2jWbTixOs-lw,59
+ crawlo-1.0.4.dist-info/top_level.txt,sha256=bKtfejkszFTNHm7Z6aqtt0AUG8DdeNeL4AoZsg4XdZY,13
+ crawlo-1.0.4.dist-info/RECORD,,
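
Each RECORD line has the form path,sha256=<digest>,<size>, where the digest is the urlsafe base64 of the file's SHA-256 with the trailing '=' padding stripped. A small sketch for reproducing such a line locally (the helper name and example path are illustrative):

    import base64
    import hashlib
    from pathlib import Path

    def record_entry(path: str) -> str:
        # Rebuild a RECORD-style line: path,sha256=<urlsafe b64 digest without '=' padding>,size
        data = Path(path).read_bytes()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode("ascii")
        return f"{path},sha256={digest},{len(data)}"

    # record_entry("crawlo/__version__.py") should match the 1.0.4 entry above (size 22)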
tests/baidu_spider/settings.py CHANGED
@@ -72,7 +72,9 @@ DEFAULT_HEADERS = {
  Mongo_Params = ''
  MONGODB_DB = 'news'
 
- CLEANUP_FP = True
+ REDIS_TTL = 0
+ CLEANUP_FP = False
 
  FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
+ # FILTER_CLASS = 'crawlo.filters.redis_filter.RedisFilter'
  # FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFileFilter'
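
In the example project the new keys keep the previous behaviour: REDIS_TTL = 0 means the fingerprint set never expires and CLEANUP_FP = False keeps it across runs. A hedged sketch of a project settings.py that opts into expiry instead; every value below is illustrative rather than a shipped default:

    # settings.py - illustrative values, not crawlo defaults
    FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'

    REDIS_URL = 'redis://127.0.0.1:6379'
    REDIS_KEY = 'request_fingerprint'     # stored as "<PROJECT_NAME>:<REDIS_KEY>"
    DECODE_RESPONSES = False              # passed straight through to aioredis.from_url

    REDIS_TTL = 7 * 24 * 3600             # expire the whole fingerprint set a week after the last insert
    CLEANUP_FP = False                    # keep the set when the spider closes
    FILTER_DEBUG = True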
crawlo/filters/redis_filter.py DELETED
@@ -1,120 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- import redis
- 
- from crawlo import Request
- from crawlo.filters import BaseFilter
- from crawlo.utils.log import get_logger
- from crawlo.utils.request import request_fingerprint
- 
- 
- class RedisFilter(BaseFilter):
-     """Synchronous request dedup filter implemented with a Redis set"""
- 
-     def __init__(
-         self,
-         redis_key: str,
-         client: redis.Redis,
-         stats: dict,
-         debug: bool,
-         log_level: str,
-         save_fp: bool
-     ):
-         """
-         Initialize the filter
- 
-         :param redis_key: Redis storage key name
-         :param client: Redis client instance
-         :param stats: statistics dict
-         :param debug: whether debug mode is enabled
-         :param log_level: log level
-         :param save_fp: whether to keep the fingerprint data
-         """
-         self.logger = get_logger(self.__class__.__name__, log_level)
-         super().__init__(self.logger, stats, debug)
- 
-         self.redis_key = redis_key
-         self.redis = client
-         self.save_fp = save_fp
- 
-     @classmethod
-     def create_instance(cls, crawler) -> 'BaseFilter':
-         """Factory method for creating an instance"""
-         redis_url = crawler.settings.get('REDIS_URL', 'redis://localhost:6379')
-         decode_responses = crawler.settings.get_bool('DECODE_RESPONSES', True)
- 
-         try:
-             # Connection pool configuration
-             redis_client = redis.from_url(
-                 redis_url,
-                 decode_responses=decode_responses,
-                 socket_timeout=5,  # timeout settings
-                 socket_connect_timeout=5,
-                 max_connections=20  # connection pool size
-             )
-             # Verify the connection works
-             redis_client.ping()
-         except redis.RedisError as e:
-             raise RuntimeError(f"Redis connection failed: {str(e)}")
- 
-         return cls(
-             redis_key=f"{crawler.settings.get('PROJECT_NAME')}:{crawler.settings.get('REDIS_KEY', 'request_fingerprints')}",
-             client=redis_client,
-             stats=crawler.stats,
-             save_fp=crawler.settings.get_bool('SAVE_FP', False),
-             debug=crawler.settings.get_bool('FILTER_DEBUG', False),
-             log_level=crawler.settings.get('LOG_LEVEL', 'INFO')
-         )
- 
-     def requested(self, request: Request) -> bool:
-         """
-         Check whether the request already exists
- 
-         :param request: request object
-         :return: whether it is a duplicate
-         """
-         fp = request_fingerprint(request)
-         try:
-             if self.redis.sismember(self.redis_key, fp):
-                 self.logger.debug(f"Duplicate request: {fp}")
-                 return True
- 
-             self.add_fingerprint(fp)
-             return False
-         except redis.RedisError as e:
-             self.logger.error(f"Redis operation failed: {str(e)}")
-             raise
- 
-     def add_fingerprint(self, fp: str) -> None:
-         """Add a fingerprint to the Redis set"""
-         try:
-             self.redis.sadd(self.redis_key, fp)
-             self.logger.debug(f"New fingerprint added: {fp}")
-         except redis.RedisError as e:
-             self.logger.error(f"Failed to add fingerprint: {str(e)}")
-             raise
- 
-     def __contains__(self, item) -> bool:
-         """Support the `in` operator (must return a bool)"""
-         try:
-             # Explicitly convert the 0/1 returned by Redis to bool
-             return bool(self.redis.sismember(self.redis_key, item))
-         except redis.RedisError as e:
-             self.logger.error(f"Redis query failed: {str(e)}")
-             raise
- 
-     def close(self) -> None:
-         """Synchronous cleanup method (note: not the async closed)"""
-         if not self.save_fp:
-             try:
-                 count = self.redis.delete(self.redis_key)
-                 self.logger.info(f"Cleared Redis key {self.redis_key}, deleted count: {count}")
-             except redis.RedisError as e:
-                 self.logger.error(f"Cleanup failed: {str(e)}")
-             finally:
-                 # The synchronous client needs its connection pool closed manually
-                 self.redis.close()
- 
-     async def closed(self):
-         """Synchronous implementation compatible with the async interface"""
-         self.close()
File without changes