zenx 0.6.8__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {zenx-0.6.8 → zenx-0.7.0}/PKG-INFO +1 -1
  2. {zenx-0.6.8 → zenx-0.7.0}/pyproject.toml +1 -1
  3. {zenx-0.6.8 → zenx-0.7.0}/zenx/clients/database.py +16 -58
  4. {zenx-0.6.8 → zenx-0.7.0}/zenx/clients/http.py +0 -30
  5. {zenx-0.6.8 → zenx-0.7.0}/zenx/engine.py +0 -21
  6. zenx-0.7.0/zenx/pipelines/manager.py +44 -0
  7. {zenx-0.6.8 → zenx-0.7.0}/zenx/settings.py +1 -0
  8. {zenx-0.6.8 → zenx-0.7.0}/zenx.egg-info/PKG-INFO +1 -1
  9. zenx-0.6.8/zenx/pipelines/manager.py +0 -40
  10. {zenx-0.6.8 → zenx-0.7.0}/setup.cfg +0 -0
  11. {zenx-0.6.8 → zenx-0.7.0}/zenx/cli.py +0 -0
  12. {zenx-0.6.8 → zenx-0.7.0}/zenx/clients/__init__.py +0 -0
  13. {zenx-0.6.8 → zenx-0.7.0}/zenx/debug_runner.py +0 -0
  14. {zenx-0.6.8 → zenx-0.7.0}/zenx/discovery.py +0 -0
  15. {zenx-0.6.8 → zenx-0.7.0}/zenx/exceptions.py +0 -0
  16. {zenx-0.6.8 → zenx-0.7.0}/zenx/logger.py +0 -0
  17. {zenx-0.6.8 → zenx-0.7.0}/zenx/pipelines/__init__.py +0 -0
  18. {zenx-0.6.8 → zenx-0.7.0}/zenx/pipelines/base.py +0 -0
  19. {zenx-0.6.8 → zenx-0.7.0}/zenx/pipelines/google_rpc.py +0 -0
  20. {zenx-0.6.8 → zenx-0.7.0}/zenx/pipelines/preprocess.py +0 -0
  21. {zenx-0.6.8 → zenx-0.7.0}/zenx/pipelines/websocket.py +0 -0
  22. {zenx-0.6.8 → zenx-0.7.0}/zenx/resources/proto/__init__.py +0 -0
  23. {zenx-0.6.8 → zenx-0.7.0}/zenx/resources/proto/feed_pb2.py +0 -0
  24. {zenx-0.6.8 → zenx-0.7.0}/zenx/resources/proto/feed_pb2_grpc.py +0 -0
  25. {zenx-0.6.8 → zenx-0.7.0}/zenx/spiders/__init__.py +0 -0
  26. {zenx-0.6.8 → zenx-0.7.0}/zenx/spiders/base.py +0 -0
  27. {zenx-0.6.8 → zenx-0.7.0}/zenx/utils.py +0 -0
  28. {zenx-0.6.8 → zenx-0.7.0}/zenx.egg-info/SOURCES.txt +0 -0
  29. {zenx-0.6.8 → zenx-0.7.0}/zenx.egg-info/dependency_links.txt +0 -0
  30. {zenx-0.6.8 → zenx-0.7.0}/zenx.egg-info/entry_points.txt +0 -0
  31. {zenx-0.6.8 → zenx-0.7.0}/zenx.egg-info/requires.txt +0 -0
  32. {zenx-0.6.8 → zenx-0.7.0}/zenx.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zenx
3
- Version: 0.6.8
3
+ Version: 0.7.0
4
4
  Summary: mini-framework
5
5
  Requires-Python: >=3.12
6
6
  Requires-Dist: curl-cffi>=0.12.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "zenx"
3
- version = "0.6.8"
3
+ version = "0.7.0"
4
4
  description = "mini-framework"
5
5
  requires-python = ">=3.12"
6
6
  dependencies = [
@@ -45,22 +45,7 @@ class DBClient(ABC):
45
45
 
46
46
 
47
47
  @abstractmethod
48
- async def insert(self, id: str, spider_name: str) -> None:
49
- ...
50
-
51
-
52
- @abstractmethod
53
- async def exists(self, id: str, spider_name: str) -> bool:
54
- ...
55
-
56
-
57
- @abstractmethod
58
- async def delete(self, id: str, spider_name: str) -> None:
59
- ...
60
-
61
-
62
- @abstractmethod
63
- async def cleanup(self, days: int) -> None:
48
+ async def insert(self, id: str, spider_name: str) -> bool:
64
49
  ...
65
50
 
66
51
 
@@ -87,25 +72,14 @@ class MemoryDB(DBClient):
87
72
  pass
88
73
 
89
74
 
90
- async def insert(self, id: str, spider_name: str) -> None:
75
+ async def insert(self, id: str, spider_name: str) -> bool:
91
76
  unique_id = f"{spider_name}_{id}"
77
+ if unique_id in self.dq:
78
+ self.logger.debug("exists", id=unique_id, db=self.name)
79
+ return False
92
80
  self.dq.append(unique_id)
93
81
  self.logger.debug("inserted", id=unique_id, db=self.name)
94
-
95
-
96
- async def exists(self, id: str, spider_name: str) -> bool:
97
- unique_id = f"{spider_name}_{id}"
98
- return unique_id in self.dq
99
-
100
-
101
- async def delete(self, id: str, spider_name: str) -> None:
102
- unique_id = f"{spider_name}_{id}"
103
- self.dq.remove(unique_id)
104
- self.logger.debug("deleted", id=unique_id, db=self.name)
105
-
106
-
107
- async def cleanup(self, days: int) -> None:
108
- pass
82
+ return True
109
83
 
110
84
 
111
85
  async def close(self) -> None:
@@ -123,7 +97,7 @@ try:
123
97
  def __init__(self, logger: BoundLogger, settings: Settings) -> None:
124
98
  super().__init__(logger, settings)
125
99
  self.r: Optional[redis.Redis] = None
126
- self._processed_ids_zset: str = "processed:ids:zset"
100
+ self._record_expiry_sec = self.settings.REDIS_RECORD_EXPIRY_SECONDS
127
101
 
128
102
 
129
103
  async def start(self) -> None:
@@ -151,28 +125,15 @@ try:
151
125
  self.logger.info("connected", db=self.name)
152
126
 
153
127
 
154
- async def insert(self, id: str, spider_name: str) -> None:
155
- unique_id = f"{spider_name}_{id}"
156
- timestamp = int(time.time())
157
- await self.r.zadd(self._processed_ids_zset, {unique_id: timestamp})
158
- self.logger.debug("inserted", id=unique_id, db=self.name)
159
-
160
-
161
- async def exists(self, id: str, spider_name: str) -> bool:
128
+ async def insert(self, id: str, spider_name: str) -> bool:
162
129
  unique_id = f"{spider_name}_{id}"
163
- score = await self.r.zscore(self._processed_ids_zset, unique_id)
164
- return score is not None
165
-
166
-
167
- async def delete(self, id: str, spider_name: str) -> None:
168
- unique_id = f"{spider_name}_{id}"
169
- await self.r.delete(unique_id)
170
- self.logger.debug("deleted", id=unique_id, db=self.name)
171
-
172
-
173
- async def cleanup(self, days: int) -> None:
174
- threshold_timestamp = int(time.time()) - (days * 24 * 60 * 60)
175
- await self.r.zremrangebyscore(self._processed_ids_zset, 0, threshold_timestamp)
130
+ result = await self.r.set(unique_id, 1, ex=self._record_expiry_sec, nx=True)
131
+ if result:
132
+ self.logger.debug("inserted", id=unique_id, db=self.name)
133
+ return True
134
+ else:
135
+ self.logger.debug("exists", id=unique_id, db=self.name)
136
+ return False
176
137
 
177
138
 
178
139
  async def close(self) -> None:
@@ -196,8 +157,5 @@ except ModuleNotFoundError:
196
157
 
197
158
  async def start(self) -> None: pass
198
159
  async def _connect(self) -> None: pass
199
- async def insert(self, id: str, spider_name: str) -> None: pass
200
- async def exists(self, id: str, spider_name: str) -> bool: return False
201
- async def delete(self, id: str, spider_name: str) -> None: pass
202
- async def cleanup(self, days: int) -> None: pass
160
+ async def insert(self, id: str, spider_name: str) -> bool: return False
203
161
  async def close(self) -> None: pass
@@ -1,7 +1,4 @@
1
1
  from __future__ import annotations
2
- import time
3
- import functools
4
- from collections import deque
5
2
  import parsel
6
3
  import random
7
4
  from curl_cffi.requests.impersonate import BrowserTypeLiteral
@@ -35,16 +32,6 @@ class Response:
35
32
  return sel
36
33
 
37
34
 
38
- def record_request(func):
39
- @functools.wraps(func)
40
- async def wrapper(self: HttpClient, *args, **kwargs) -> Response:
41
- result = await func(self, *args, **kwargs)
42
- self._requests_timestamps.append(time.time())
43
- self._total_requests +=1
44
- return result
45
- return wrapper
46
-
47
-
48
35
  class HttpClient(ABC):
49
36
  # central registry
50
37
  name: ClassVar[str]
@@ -69,23 +56,7 @@ class HttpClient(ABC):
69
56
  self.logger = logger
70
57
  self.settings = settings
71
58
  self._session_pool: asyncio.Queue
72
- # stats
73
- self._requests_timestamps = deque()
74
- self._total_requests = 0
75
59
 
76
-
77
- def get_stats(self) -> Dict:
78
- # calculate RPM based on rolling window of 1 min
79
- now = time.time()
80
- # remove requests older than 60 sec
81
- while self._requests_timestamps and self._requests_timestamps[0] < (now - 60):
82
- self._requests_timestamps.popleft()
83
- rpm = len(self._requests_timestamps)
84
- return {
85
- "rpm": rpm,
86
- "total_requests": self._total_requests,
87
- }
88
-
89
60
 
90
61
  @abstractmethod
91
62
  async def request(
@@ -125,7 +96,6 @@ class CurlCffi(HttpClient):
125
96
  return chosen_fingerprint
126
97
 
127
98
 
128
- @record_request
129
99
  async def request(
130
100
  self,
131
101
  url: str,
@@ -26,16 +26,6 @@ class Engine:
26
26
  self.shutdown_event.set()
27
27
 
28
28
 
29
- async def _log_stats(self, spider: Spider) -> None:
30
- while True:
31
- await asyncio.sleep(60)
32
- try:
33
- rpm = spider.client.get_stats().get("rpm")
34
- spider.logger.info("stats", rpm=rpm)
35
- except asyncio.CancelledError:
36
- break
37
-
38
-
39
29
  async def _execute(self, spider_name: str) -> None:
40
30
  loop = asyncio.get_running_loop()
41
31
  loop.add_signal_handler(signal.SIGINT, self._shutdown_handler)
@@ -57,8 +47,6 @@ class Engine:
57
47
  await pm.start_pipelines()
58
48
 
59
49
  spider = spider_cls(client=client, pm=pm, logger=logger, settings=settings)
60
-
61
- stats_task = asyncio.create_task(self._log_stats(spider))
62
50
  try:
63
51
  if self.forever:
64
52
  while not self.shutdown_event.is_set():
@@ -72,15 +60,6 @@ class Engine:
72
60
  finally:
73
61
  if self.shutdown_event.is_set():
74
62
  logger.info("shutdown", spider=spider_name)
75
-
76
- stats_task.cancel()
77
- try:
78
- await stats_task
79
- except asyncio.CancelledError:
80
- pass
81
- stats = spider.client.get_stats()
82
- logger.info("summary", **stats)
83
-
84
63
  await client.close()
85
64
  await db.close()
86
65
  await pm.close_pipelines()
@@ -0,0 +1,44 @@
1
+ import asyncio
2
+ from typing import Dict, List
3
+ from structlog import BoundLogger
4
+
5
+ from zenx.exceptions import DropItem
6
+ from zenx.pipelines.base import Pipeline
7
+ from zenx.clients.database import DBClient
8
+ from zenx.settings import Settings
9
+
10
+
11
+ class PipelineManager:
12
+
13
+
14
+ def __init__(self, pipeline_names: List[str], logger: BoundLogger, db: DBClient, settings: Settings) -> None:
15
+ self.logger = logger
16
+ self.pipelines = {name:Pipeline.get_pipeline(name)(logger, db, settings) for name in pipeline_names}
17
+ self.settings = settings
18
+ self._fire_and_forget_pipelines = [p for p in self.pipelines.values() if p.name != "preprocess"]
19
+
20
+
21
+ async def start_pipelines(self) -> None:
22
+ """ connect and monitor """
23
+ for pipeline in self.pipelines.values():
24
+ await pipeline.start()
25
+
26
+
27
+ async def process_item(self, item: Dict, spider: str) -> None:
28
+ preprocess_pipeline = self.pipelines.get("preprocess")
29
+ if preprocess_pipeline:
30
+ try:
31
+ item = await preprocess_pipeline.process_item(item, spider)
32
+ except DropItem:
33
+ self.logger.debug("dropped", id=item.get("_id"), pipeline=preprocess_pipeline.name)
34
+ return
35
+ except Exception:
36
+ self.logger.exception("process_item", item=item, pipeline=preprocess_pipeline.name)
37
+ raise
38
+ for pipeline in self._fire_and_forget_pipelines:
39
+ asyncio.create_task(pipeline.process_item(item, spider))
40
+
41
+
42
+ async def close_pipelines(self) -> None:
43
+ for pipeline in self.pipelines.values():
44
+ await pipeline.close()
@@ -9,6 +9,7 @@ class Settings(BaseSettings):
9
9
  SESSION_POOL_SIZE: int = 1
10
10
  MAX_SCRAPE_DELAY: int = 10 # 10 seconds
11
11
  DQ_MAX_SIZE: int = 100 # max size of the deque for memory database
12
+ REDIS_RECORD_EXPIRY_SECONDS: int = 604800 # 7 days (7*24*60*60)
12
13
 
13
14
  DB_TYPE: Literal["memory", "redis"] = "memory"
14
15
  DB_NAME: str | None = None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zenx
3
- Version: 0.6.8
3
+ Version: 0.7.0
4
4
  Summary: mini-framework
5
5
  Requires-Python: >=3.12
6
6
  Requires-Dist: curl-cffi>=0.12.0
@@ -1,40 +0,0 @@
1
- from typing import Dict, List
2
- from structlog import BoundLogger
3
-
4
- from zenx.exceptions import DropItem
5
- from zenx.pipelines.base import Pipeline
6
- from zenx.clients.database import DBClient
7
- from zenx.settings import Settings
8
-
9
-
10
- class PipelineManager:
11
-
12
-
13
- def __init__(self, pipeline_names: List[str], logger: BoundLogger, db: DBClient, settings: Settings) -> None:
14
- self.logger = logger
15
- self.pipelines = [Pipeline.get_pipeline(name)(logger, db, settings) for name in pipeline_names]
16
- self.settings = settings
17
-
18
-
19
- async def start_pipelines(self) -> None:
20
- """ connect and monitor """
21
- for pipeline in self.pipelines:
22
- await pipeline.start()
23
-
24
-
25
- async def process_item(self, item: Dict, spider: str) -> Dict:
26
- for pipeline in self.pipelines:
27
- try:
28
- item = await pipeline.process_item(item, spider)
29
- except DropItem:
30
- self.logger.debug("dropped", id=item.get("_id"))
31
- break
32
- except Exception:
33
- self.logger.exception("process_item", pipeline=pipeline.name, item=item)
34
- raise
35
- return item
36
-
37
-
38
- async def close_pipelines(self) -> None:
39
- for pipeline in self.pipelines:
40
- await pipeline.close()
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes