zenx 0.6.8__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
zenx/clients/database.py CHANGED
@@ -45,22 +45,7 @@ class DBClient(ABC):
45
45
 
46
46
 
47
47
  @abstractmethod
48
- async def insert(self, id: str, spider_name: str) -> None:
49
- ...
50
-
51
-
52
- @abstractmethod
53
- async def exists(self, id: str, spider_name: str) -> bool:
54
- ...
55
-
56
-
57
- @abstractmethod
58
- async def delete(self, id: str, spider_name: str) -> None:
59
- ...
60
-
61
-
62
- @abstractmethod
63
- async def cleanup(self, days: int) -> None:
48
+ async def insert(self, id: str, spider_name: str) -> bool:
64
49
  ...
65
50
 
66
51
 
@@ -87,25 +72,14 @@ class MemoryDB(DBClient):
87
72
  pass
88
73
 
89
74
 
90
- async def insert(self, id: str, spider_name: str) -> None:
75
+ async def insert(self, id: str, spider_name: str) -> bool:
91
76
  unique_id = f"{spider_name}_{id}"
77
+ if unique_id in self.dq:
78
+ self.logger.debug("exists", id=unique_id, db=self.name)
79
+ return False
92
80
  self.dq.append(unique_id)
93
81
  self.logger.debug("inserted", id=unique_id, db=self.name)
94
-
95
-
96
- async def exists(self, id: str, spider_name: str) -> bool:
97
- unique_id = f"{spider_name}_{id}"
98
- return unique_id in self.dq
99
-
100
-
101
- async def delete(self, id: str, spider_name: str) -> None:
102
- unique_id = f"{spider_name}_{id}"
103
- self.dq.remove(unique_id)
104
- self.logger.debug("deleted", id=unique_id, db=self.name)
105
-
106
-
107
- async def cleanup(self, days: int) -> None:
108
- pass
82
+ return True
109
83
 
110
84
 
111
85
  async def close(self) -> None:
@@ -123,7 +97,7 @@ try:
123
97
  def __init__(self, logger: BoundLogger, settings: Settings) -> None:
124
98
  super().__init__(logger, settings)
125
99
  self.r: Optional[redis.Redis] = None
126
- self._processed_ids_zset: str = "processed:ids:zset"
100
+ self._record_expiry_sec = self.settings.REDIS_RECORD_EXPIRY_SECONDS
127
101
 
128
102
 
129
103
  async def start(self) -> None:
@@ -151,28 +125,15 @@ try:
151
125
  self.logger.info("connected", db=self.name)
152
126
 
153
127
 
154
- async def insert(self, id: str, spider_name: str) -> None:
155
- unique_id = f"{spider_name}_{id}"
156
- timestamp = int(time.time())
157
- await self.r.zadd(self._processed_ids_zset, {unique_id: timestamp})
158
- self.logger.debug("inserted", id=unique_id, db=self.name)
159
-
160
-
161
- async def exists(self, id: str, spider_name: str) -> bool:
128
+ async def insert(self, id: str, spider_name: str) -> bool:
162
129
  unique_id = f"{spider_name}_{id}"
163
- score = await self.r.zscore(self._processed_ids_zset, unique_id)
164
- return score is not None
165
-
166
-
167
- async def delete(self, id: str, spider_name: str) -> None:
168
- unique_id = f"{spider_name}_{id}"
169
- await self.r.delete(unique_id)
170
- self.logger.debug("deleted", id=unique_id, db=self.name)
171
-
172
-
173
- async def cleanup(self, days: int) -> None:
174
- threshold_timestamp = int(time.time()) - (days * 24 * 60 * 60)
175
- await self.r.zremrangebyscore(self._processed_ids_zset, 0, threshold_timestamp)
130
+ result = await self.r.set(unique_id, 1, ex=self._record_expiry_sec, nx=True)
131
+ if result:
132
+ self.logger.debug("inserted", id=unique_id, db=self.name)
133
+ return True
134
+ else:
135
+ self.logger.debug("exists", id=unique_id, db=self.name)
136
+ return False
176
137
 
177
138
 
178
139
  async def close(self) -> None:
@@ -196,8 +157,5 @@ except ModuleNotFoundError:
196
157
 
197
158
  async def start(self) -> None: pass
198
159
  async def _connect(self) -> None: pass
199
- async def insert(self, id: str, spider_name: str) -> None: pass
200
- async def exists(self, id: str, spider_name: str) -> bool: return False
201
- async def delete(self, id: str, spider_name: str) -> None: pass
202
- async def cleanup(self, days: int) -> None: pass
160
+ async def insert(self, id: str, spider_name: str) -> bool: return False
203
161
  async def close(self) -> None: pass
zenx/clients/http.py CHANGED
@@ -1,7 +1,4 @@
1
1
  from __future__ import annotations
2
- import time
3
- import functools
4
- from collections import deque
5
2
  import parsel
6
3
  import random
7
4
  from curl_cffi.requests.impersonate import BrowserTypeLiteral
@@ -35,16 +32,6 @@ class Response:
35
32
  return sel
36
33
 
37
34
 
38
- def record_request(func):
39
- @functools.wraps(func)
40
- async def wrapper(self: HttpClient, *args, **kwargs) -> Response:
41
- result = await func(self, *args, **kwargs)
42
- self._requests_timestamps.append(time.time())
43
- self._total_requests +=1
44
- return result
45
- return wrapper
46
-
47
-
48
35
  class HttpClient(ABC):
49
36
  # central registry
50
37
  name: ClassVar[str]
@@ -69,23 +56,7 @@ class HttpClient(ABC):
69
56
  self.logger = logger
70
57
  self.settings = settings
71
58
  self._session_pool: asyncio.Queue
72
- # stats
73
- self._requests_timestamps = deque()
74
- self._total_requests = 0
75
59
 
76
-
77
- def get_stats(self) -> Dict:
78
- # calculate RPM based on rolling window of 1 min
79
- now = time.time()
80
- # remove requests older than 60 sec
81
- while self._requests_timestamps and self._requests_timestamps[0] < (now - 60):
82
- self._requests_timestamps.popleft()
83
- rpm = len(self._requests_timestamps)
84
- return {
85
- "rpm": rpm,
86
- "total_requests": self._total_requests,
87
- }
88
-
89
60
 
90
61
  @abstractmethod
91
62
  async def request(
@@ -125,7 +96,6 @@ class CurlCffi(HttpClient):
125
96
  return chosen_fingerprint
126
97
 
127
98
 
128
- @record_request
129
99
  async def request(
130
100
  self,
131
101
  url: str,
zenx/engine.py CHANGED
@@ -26,16 +26,6 @@ class Engine:
26
26
  self.shutdown_event.set()
27
27
 
28
28
 
29
- async def _log_stats(self, spider: Spider) -> None:
30
- while True:
31
- await asyncio.sleep(60)
32
- try:
33
- rpm = spider.client.get_stats().get("rpm")
34
- spider.logger.info("stats", rpm=rpm)
35
- except asyncio.CancelledError:
36
- break
37
-
38
-
39
29
  async def _execute(self, spider_name: str) -> None:
40
30
  loop = asyncio.get_running_loop()
41
31
  loop.add_signal_handler(signal.SIGINT, self._shutdown_handler)
@@ -57,8 +47,6 @@ class Engine:
57
47
  await pm.start_pipelines()
58
48
 
59
49
  spider = spider_cls(client=client, pm=pm, logger=logger, settings=settings)
60
-
61
- stats_task = asyncio.create_task(self._log_stats(spider))
62
50
  try:
63
51
  if self.forever:
64
52
  while not self.shutdown_event.is_set():
@@ -72,15 +60,6 @@ class Engine:
72
60
  finally:
73
61
  if self.shutdown_event.is_set():
74
62
  logger.info("shutdown", spider=spider_name)
75
-
76
- stats_task.cancel()
77
- try:
78
- await stats_task
79
- except asyncio.CancelledError:
80
- pass
81
- stats = spider.client.get_stats()
82
- logger.info("summary", **stats)
83
-
84
63
  await client.close()
85
64
  await db.close()
86
65
  await pm.close_pipelines()
zenx/pipelines/manager.py CHANGED
@@ -1,3 +1,4 @@
1
+ import asyncio
1
2
  from typing import Dict, List
2
3
  from structlog import BoundLogger
3
4
 
@@ -12,29 +13,32 @@ class PipelineManager:
12
13
 
13
14
  def __init__(self, pipeline_names: List[str], logger: BoundLogger, db: DBClient, settings: Settings) -> None:
14
15
  self.logger = logger
15
- self.pipelines = [Pipeline.get_pipeline(name)(logger, db, settings) for name in pipeline_names]
16
+ self.pipelines = {name:Pipeline.get_pipeline(name)(logger, db, settings) for name in pipeline_names}
16
17
  self.settings = settings
18
+ self._fire_and_forget_pipelines = [p for p in self.pipelines.values() if p.name != "preprocess"]
17
19
 
18
20
 
19
21
  async def start_pipelines(self) -> None:
20
22
  """ connect and monitor """
21
- for pipeline in self.pipelines:
23
+ for pipeline in self.pipelines.values():
22
24
  await pipeline.start()
23
25
 
24
26
 
25
- async def process_item(self, item: Dict, spider: str) -> Dict:
26
- for pipeline in self.pipelines:
27
+ async def process_item(self, item: Dict, spider: str) -> None:
28
+ preprocess_pipeline = self.pipelines.get("preprocess")
29
+ if preprocess_pipeline:
27
30
  try:
28
- item = await pipeline.process_item(item, spider)
31
+ item = await preprocess_pipeline.process_item(item, spider)
29
32
  except DropItem:
30
- self.logger.debug("dropped", id=item.get("_id"))
31
- break
33
+ self.logger.debug("dropped", id=item.get("_id"), pipeline=preprocess_pipeline.name)
34
+ return
32
35
  except Exception:
33
- self.logger.exception("process_item", pipeline=pipeline.name, item=item)
36
+ self.logger.exception("process_item", item=item, pipeline=preprocess_pipeline.name)
34
37
  raise
35
- return item
38
+ for pipeline in self._fire_and_forget_pipelines:
39
+ asyncio.create_task(pipeline.process_item(item, spider))
36
40
 
37
41
 
38
42
  async def close_pipelines(self) -> None:
39
- for pipeline in self.pipelines:
43
+ for pipeline in self.pipelines.values():
40
44
  await pipeline.close()
zenx/settings.py CHANGED
@@ -9,6 +9,7 @@ class Settings(BaseSettings):
9
9
  SESSION_POOL_SIZE: int = 1
10
10
  MAX_SCRAPE_DELAY: int = 10 # 10 seconds
11
11
  DQ_MAX_SIZE: int = 100 # max size of the deque for memory database
12
+ REDIS_RECORD_EXPIRY_SECONDS: int = 604800 # 7 days (7*24*60*60)
12
13
 
13
14
  DB_TYPE: Literal["memory", "redis"] = "memory"
14
15
  DB_NAME: str | None = None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zenx
3
- Version: 0.6.8
3
+ Version: 0.7.0
4
4
  Summary: mini-framework
5
5
  Requires-Python: >=3.12
6
6
  Requires-Dist: curl-cffi>=0.12.0
@@ -1,18 +1,18 @@
1
1
  zenx/cli.py,sha256=pHKhOTdqI6NQQoYK91waRIMpxCXLYtXEryzVaTbmvqc,2810
2
2
  zenx/debug_runner.py,sha256=B2Jd9A4_EHDa-ohLcwnFIxOV73FagTWXX2fl3qgwlpY,809
3
3
  zenx/discovery.py,sha256=YANVGzy2IG1fYruUud-11Y-ynyO6iEp3EjlHnhIQJQI,1014
4
- zenx/engine.py,sha256=csnBhGf6lfGY3b_7I5AlH_fPGPN9mu1iEiXbJ9ZNxiw,2962
4
+ zenx/engine.py,sha256=Nsz59CnHjUmlYa3dcqdRCBGZh8ZBHDht-rrZTKd1ysU,2341
5
5
  zenx/exceptions.py,sha256=BJXxzwwX2CU6inhppfblx8c8Z6Mhvsk7MAhQ1LAnhBg,37
6
6
  zenx/logger.py,sha256=UmEk0vV1mSCozV7z_DDgCCXdAManDr5wgkrhKiRQtyU,1651
7
- zenx/settings.py,sha256=Q0z3oGyVkhucRyqsraZQR1lAPSq1sjiGKmAkDCmQsA0,940
7
+ zenx/settings.py,sha256=0RLeKcqHJdD1vuYIQtGeXkp2YIvv3tNyz9O8jw620VQ,1008
8
8
  zenx/utils.py,sha256=a0JOM4CRQHFCD-W1E8P7voIWmdEbSYXN3yV4db8ni3U,588
9
9
  zenx/clients/__init__.py,sha256=CaAAuNa8DPyMdejR0KNSDDg_UzC3WxaTol5_QvwwwG8,132
10
- zenx/clients/database.py,sha256=GaIafQ2OkYJQ_sz2f6u7sZKn77VOReWun09hogk6dO0,6019
11
- zenx/clients/http.py,sha256=4n2y1Vsij7ELmD54DoMs1i53HIEEbVtSo2l1Dnz7P_M,6621
10
+ zenx/clients/database.py,sha256=9QW2H-4vscoBKZy_iG_q8WvSExoD0U7Rih4FPsYngbI,4756
11
+ zenx/clients/http.py,sha256=fb6COYot6vidNFRBWgoU6CYEfnYWJP0JuVkydvxsHb4,5700
12
12
  zenx/pipelines/__init__.py,sha256=IxkZ0UpEJdYjLdd-PMcC9PzzzArTBNNcpgKc7NiOe5Y,131
13
13
  zenx/pipelines/base.py,sha256=N_388z5DFMeaU6wMwcClZAbQFWKh4kpAF7eUJhpQevs,1863
14
14
  zenx/pipelines/google_rpc.py,sha256=F7p5ml9W1UliZbrDrF9MFNVKlCP5pG1WpO6rdmBgKp8,4707
15
- zenx/pipelines/manager.py,sha256=LRejhnSiXospa4cMSX9Srp0jfscGnHDggXyl1aUNhtg,1300
15
+ zenx/pipelines/manager.py,sha256=bP2WIMblbWYHOJPU44l_IF3xLD0eHBYV4CCTFZc2FHI,1662
16
16
  zenx/pipelines/preprocess.py,sha256=k1Ake8H3nm_S_0ZnruyXAl7cFlBhLn-wpQZHuovIH4o,774
17
17
  zenx/pipelines/websocket.py,sha256=74XIV7_YbBngCgqKP3ZiBpDfQte-VB2G4kjNsZyycIc,4456
18
18
  zenx/resources/proto/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -20,8 +20,8 @@ zenx/resources/proto/feed_pb2.py,sha256=ZyICOLnyuXekkvV4bAHZ1nE1-wwzcYYRRrmRJCMr
20
20
  zenx/resources/proto/feed_pb2_grpc.py,sha256=Mim6FfBgIMj0PmTqHk036nVUMJH3A6I3ts6r1j3bQF8,7441
21
21
  zenx/spiders/__init__.py,sha256=rs5LuqdM2MQlUYiTGJrzkYhzN8_SSLTrR7wGjSRrrSo,25
22
22
  zenx/spiders/base.py,sha256=YB-KqsAzfIUTzDMy5_ElgW1mul-I4Ltft6JAJxpy4hg,1672
23
- zenx-0.6.8.dist-info/METADATA,sha256=6dvQwWjNmQWLfwTsyMiXAL4qHD6yvCoJc4gkVMfku5o,1273
24
- zenx-0.6.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
25
- zenx-0.6.8.dist-info/entry_points.txt,sha256=8JXob2f1VtvzGFris-e9Usqywg7oca-cChDlH9moOZU,38
26
- zenx-0.6.8.dist-info/top_level.txt,sha256=JeXwvK86d7sB-2x-avugFnZIZa33zaHWKI8RHWJR6KY,5
27
- zenx-0.6.8.dist-info/RECORD,,
23
+ zenx-0.7.0.dist-info/METADATA,sha256=aMtFlE8YY8qgo-DPOg0xz4igPppYrBBro8yXYxcnI5I,1273
24
+ zenx-0.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
25
+ zenx-0.7.0.dist-info/entry_points.txt,sha256=8JXob2f1VtvzGFris-e9Usqywg7oca-cChDlH9moOZU,38
26
+ zenx-0.7.0.dist-info/top_level.txt,sha256=JeXwvK86d7sB-2x-avugFnZIZa33zaHWKI8RHWJR6KY,5
27
+ zenx-0.7.0.dist-info/RECORD,,
File without changes