zenx 0.6.8__py3-none-any.whl → 0.6.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zenx/clients/database.py +16 -58
- zenx/clients/http.py +0 -30
- zenx/engine.py +0 -21
- zenx/settings.py +1 -0
- {zenx-0.6.8.dist-info → zenx-0.6.9.dist-info}/METADATA +1 -1
- {zenx-0.6.8.dist-info → zenx-0.6.9.dist-info}/RECORD +9 -9
- {zenx-0.6.8.dist-info → zenx-0.6.9.dist-info}/WHEEL +0 -0
- {zenx-0.6.8.dist-info → zenx-0.6.9.dist-info}/entry_points.txt +0 -0
- {zenx-0.6.8.dist-info → zenx-0.6.9.dist-info}/top_level.txt +0 -0
zenx/clients/database.py
CHANGED
@@ -45,22 +45,7 @@ class DBClient(ABC):
|
|
45
45
|
|
46
46
|
|
47
47
|
@abstractmethod
|
48
|
-
async def insert(self, id: str, spider_name: str) -> None:
|
49
|
-
...
|
50
|
-
|
51
|
-
|
52
|
-
@abstractmethod
|
53
|
-
async def exists(self, id: str, spider_name: str) -> bool:
|
54
|
-
...
|
55
|
-
|
56
|
-
|
57
|
-
@abstractmethod
|
58
|
-
async def delete(self, id: str, spider_name: str) -> None:
|
59
|
-
...
|
60
|
-
|
61
|
-
|
62
|
-
@abstractmethod
|
63
|
-
async def cleanup(self, days: int) -> None:
|
48
|
+
async def insert(self, id: str, spider_name: str) -> bool:
|
64
49
|
...
|
65
50
|
|
66
51
|
|
@@ -87,25 +72,14 @@ class MemoryDB(DBClient):
|
|
87
72
|
pass
|
88
73
|
|
89
74
|
|
90
|
-
async def insert(self, id: str, spider_name: str) -> None:
|
75
|
+
async def insert(self, id: str, spider_name: str) -> bool:
|
91
76
|
unique_id = f"{spider_name}_{id}"
|
77
|
+
if unique_id in self.dq:
|
78
|
+
self.logger.debug("exists", id=unique_id, db=self.name)
|
79
|
+
return False
|
92
80
|
self.dq.append(unique_id)
|
93
81
|
self.logger.debug("inserted", id=unique_id, db=self.name)
|
94
|
-
|
95
|
-
|
96
|
-
async def exists(self, id: str, spider_name: str) -> bool:
|
97
|
-
unique_id = f"{spider_name}_{id}"
|
98
|
-
return unique_id in self.dq
|
99
|
-
|
100
|
-
|
101
|
-
async def delete(self, id: str, spider_name: str) -> None:
|
102
|
-
unique_id = f"{spider_name}_{id}"
|
103
|
-
self.dq.remove(unique_id)
|
104
|
-
self.logger.debug("deleted", id=unique_id, db=self.name)
|
105
|
-
|
106
|
-
|
107
|
-
async def cleanup(self, days: int) -> None:
|
108
|
-
pass
|
82
|
+
return True
|
109
83
|
|
110
84
|
|
111
85
|
async def close(self) -> None:
|
@@ -123,7 +97,7 @@ try:
|
|
123
97
|
def __init__(self, logger: BoundLogger, settings: Settings) -> None:
|
124
98
|
super().__init__(logger, settings)
|
125
99
|
self.r: Optional[redis.Redis] = None
|
126
|
-
self.
|
100
|
+
self._record_expiry_sec = self.settings.REDIS_RECORD_EXPIRY_SECONDS
|
127
101
|
|
128
102
|
|
129
103
|
async def start(self) -> None:
|
@@ -151,28 +125,15 @@ try:
|
|
151
125
|
self.logger.info("connected", db=self.name)
|
152
126
|
|
153
127
|
|
154
|
-
async def insert(self, id: str, spider_name: str) -> None:
|
155
|
-
unique_id = f"{spider_name}_{id}"
|
156
|
-
timestamp = int(time.time())
|
157
|
-
await self.r.zadd(self._processed_ids_zset, {unique_id: timestamp})
|
158
|
-
self.logger.debug("inserted", id=unique_id, db=self.name)
|
159
|
-
|
160
|
-
|
161
|
-
async def exists(self, id: str, spider_name: str) -> bool:
|
128
|
+
async def insert(self, id: str, spider_name: str) -> bool:
|
162
129
|
unique_id = f"{spider_name}_{id}"
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
self.logger.debug("deleted", id=unique_id, db=self.name)
|
171
|
-
|
172
|
-
|
173
|
-
async def cleanup(self, days: int) -> None:
|
174
|
-
threshold_timestamp = int(time.time()) - (days * 24 * 60 * 60)
|
175
|
-
await self.r.zremrangebyscore(self._processed_ids_zset, 0, threshold_timestamp)
|
130
|
+
result = await self.r.set(unique_id, 1, ex=self._record_expiry_sec, nx=True)
|
131
|
+
if result:
|
132
|
+
self.logger.debug("inserted", id=unique_id, db=self.name)
|
133
|
+
return True
|
134
|
+
else:
|
135
|
+
self.logger.debug("exists", id=unique_id, db=self.name)
|
136
|
+
return False
|
176
137
|
|
177
138
|
|
178
139
|
async def close(self) -> None:
|
@@ -196,8 +157,5 @@ except ModuleNotFoundError:
|
|
196
157
|
|
197
158
|
async def start(self) -> None: pass
|
198
159
|
async def _connect(self) -> None: pass
|
199
|
-
async def insert(self, id: str, spider_name: str) -> None: pass
|
200
|
-
async def exists(self, id: str, spider_name: str) -> bool: return False
|
201
|
-
async def delete(self, id: str, spider_name: str) -> None: pass
|
202
|
-
async def cleanup(self, days: int) -> None: pass
|
160
|
+
async def insert(self, id: str, spider_name: str) -> bool: return False
|
203
161
|
async def close(self) -> None: pass
|
zenx/clients/http.py
CHANGED
@@ -1,7 +1,4 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
-
import time
|
3
|
-
import functools
|
4
|
-
from collections import deque
|
5
2
|
import parsel
|
6
3
|
import random
|
7
4
|
from curl_cffi.requests.impersonate import BrowserTypeLiteral
|
@@ -35,16 +32,6 @@ class Response:
|
|
35
32
|
return sel
|
36
33
|
|
37
34
|
|
38
|
-
def record_request(func):
|
39
|
-
@functools.wraps(func)
|
40
|
-
async def wrapper(self: HttpClient, *args, **kwargs) -> Response:
|
41
|
-
result = await func(self, *args, **kwargs)
|
42
|
-
self._requests_timestamps.append(time.time())
|
43
|
-
self._total_requests +=1
|
44
|
-
return result
|
45
|
-
return wrapper
|
46
|
-
|
47
|
-
|
48
35
|
class HttpClient(ABC):
|
49
36
|
# central registry
|
50
37
|
name: ClassVar[str]
|
@@ -69,23 +56,7 @@ class HttpClient(ABC):
|
|
69
56
|
self.logger = logger
|
70
57
|
self.settings = settings
|
71
58
|
self._session_pool: asyncio.Queue
|
72
|
-
# stats
|
73
|
-
self._requests_timestamps = deque()
|
74
|
-
self._total_requests = 0
|
75
59
|
|
76
|
-
|
77
|
-
def get_stats(self) -> Dict:
|
78
|
-
# calculate RPM based on rolling window of 1 min
|
79
|
-
now = time.time()
|
80
|
-
# remove requests older than 60 sec
|
81
|
-
while self._requests_timestamps and self._requests_timestamps[0] < (now - 60):
|
82
|
-
self._requests_timestamps.popleft()
|
83
|
-
rpm = len(self._requests_timestamps)
|
84
|
-
return {
|
85
|
-
"rpm": rpm,
|
86
|
-
"total_requests": self._total_requests,
|
87
|
-
}
|
88
|
-
|
89
60
|
|
90
61
|
@abstractmethod
|
91
62
|
async def request(
|
@@ -125,7 +96,6 @@ class CurlCffi(HttpClient):
|
|
125
96
|
return chosen_fingerprint
|
126
97
|
|
127
98
|
|
128
|
-
@record_request
|
129
99
|
async def request(
|
130
100
|
self,
|
131
101
|
url: str,
|
zenx/engine.py
CHANGED
@@ -26,16 +26,6 @@ class Engine:
|
|
26
26
|
self.shutdown_event.set()
|
27
27
|
|
28
28
|
|
29
|
-
async def _log_stats(self, spider: Spider) -> None:
|
30
|
-
while True:
|
31
|
-
await asyncio.sleep(60)
|
32
|
-
try:
|
33
|
-
rpm = spider.client.get_stats().get("rpm")
|
34
|
-
spider.logger.info("stats", rpm=rpm)
|
35
|
-
except asyncio.CancelledError:
|
36
|
-
break
|
37
|
-
|
38
|
-
|
39
29
|
async def _execute(self, spider_name: str) -> None:
|
40
30
|
loop = asyncio.get_running_loop()
|
41
31
|
loop.add_signal_handler(signal.SIGINT, self._shutdown_handler)
|
@@ -57,8 +47,6 @@ class Engine:
|
|
57
47
|
await pm.start_pipelines()
|
58
48
|
|
59
49
|
spider = spider_cls(client=client, pm=pm, logger=logger, settings=settings)
|
60
|
-
|
61
|
-
stats_task = asyncio.create_task(self._log_stats(spider))
|
62
50
|
try:
|
63
51
|
if self.forever:
|
64
52
|
while not self.shutdown_event.is_set():
|
@@ -72,15 +60,6 @@ class Engine:
|
|
72
60
|
finally:
|
73
61
|
if self.shutdown_event.is_set():
|
74
62
|
logger.info("shutdown", spider=spider_name)
|
75
|
-
|
76
|
-
stats_task.cancel()
|
77
|
-
try:
|
78
|
-
await stats_task
|
79
|
-
except asyncio.CancelledError:
|
80
|
-
pass
|
81
|
-
stats = spider.client.get_stats()
|
82
|
-
logger.info("summary", **stats)
|
83
|
-
|
84
63
|
await client.close()
|
85
64
|
await db.close()
|
86
65
|
await pm.close_pipelines()
|
zenx/settings.py
CHANGED
@@ -9,6 +9,7 @@ class Settings(BaseSettings):
|
|
9
9
|
SESSION_POOL_SIZE: int = 1
|
10
10
|
MAX_SCRAPE_DELAY: int = 10 # 10 seconds
|
11
11
|
DQ_MAX_SIZE: int = 100 # max size of the deque for memory database
|
12
|
+
REDIS_RECORD_EXPIRY_SECONDS: int = 604800 # 7 days (7*24*60*60)
|
12
13
|
|
13
14
|
DB_TYPE: Literal["memory", "redis"] = "memory"
|
14
15
|
DB_NAME: str | None = None
|
@@ -1,14 +1,14 @@
|
|
1
1
|
zenx/cli.py,sha256=pHKhOTdqI6NQQoYK91waRIMpxCXLYtXEryzVaTbmvqc,2810
|
2
2
|
zenx/debug_runner.py,sha256=B2Jd9A4_EHDa-ohLcwnFIxOV73FagTWXX2fl3qgwlpY,809
|
3
3
|
zenx/discovery.py,sha256=YANVGzy2IG1fYruUud-11Y-ynyO6iEp3EjlHnhIQJQI,1014
|
4
|
-
zenx/engine.py,sha256=
|
4
|
+
zenx/engine.py,sha256=Nsz59CnHjUmlYa3dcqdRCBGZh8ZBHDht-rrZTKd1ysU,2341
|
5
5
|
zenx/exceptions.py,sha256=BJXxzwwX2CU6inhppfblx8c8Z6Mhvsk7MAhQ1LAnhBg,37
|
6
6
|
zenx/logger.py,sha256=UmEk0vV1mSCozV7z_DDgCCXdAManDr5wgkrhKiRQtyU,1651
|
7
|
-
zenx/settings.py,sha256=
|
7
|
+
zenx/settings.py,sha256=0RLeKcqHJdD1vuYIQtGeXkp2YIvv3tNyz9O8jw620VQ,1008
|
8
8
|
zenx/utils.py,sha256=a0JOM4CRQHFCD-W1E8P7voIWmdEbSYXN3yV4db8ni3U,588
|
9
9
|
zenx/clients/__init__.py,sha256=CaAAuNa8DPyMdejR0KNSDDg_UzC3WxaTol5_QvwwwG8,132
|
10
|
-
zenx/clients/database.py,sha256=
|
11
|
-
zenx/clients/http.py,sha256=
|
10
|
+
zenx/clients/database.py,sha256=9QW2H-4vscoBKZy_iG_q8WvSExoD0U7Rih4FPsYngbI,4756
|
11
|
+
zenx/clients/http.py,sha256=fb6COYot6vidNFRBWgoU6CYEfnYWJP0JuVkydvxsHb4,5700
|
12
12
|
zenx/pipelines/__init__.py,sha256=IxkZ0UpEJdYjLdd-PMcC9PzzzArTBNNcpgKc7NiOe5Y,131
|
13
13
|
zenx/pipelines/base.py,sha256=N_388z5DFMeaU6wMwcClZAbQFWKh4kpAF7eUJhpQevs,1863
|
14
14
|
zenx/pipelines/google_rpc.py,sha256=F7p5ml9W1UliZbrDrF9MFNVKlCP5pG1WpO6rdmBgKp8,4707
|
@@ -20,8 +20,8 @@ zenx/resources/proto/feed_pb2.py,sha256=ZyICOLnyuXekkvV4bAHZ1nE1-wwzcYYRRrmRJCMr
|
|
20
20
|
zenx/resources/proto/feed_pb2_grpc.py,sha256=Mim6FfBgIMj0PmTqHk036nVUMJH3A6I3ts6r1j3bQF8,7441
|
21
21
|
zenx/spiders/__init__.py,sha256=rs5LuqdM2MQlUYiTGJrzkYhzN8_SSLTrR7wGjSRrrSo,25
|
22
22
|
zenx/spiders/base.py,sha256=YB-KqsAzfIUTzDMy5_ElgW1mul-I4Ltft6JAJxpy4hg,1672
|
23
|
-
zenx-0.6.
|
24
|
-
zenx-0.6.
|
25
|
-
zenx-0.6.
|
26
|
-
zenx-0.6.
|
27
|
-
zenx-0.6.
|
23
|
+
zenx-0.6.9.dist-info/METADATA,sha256=WqdAikf-ESTs5RbByLL7hQQuvuDeFwdobnAog5CiNFM,1273
|
24
|
+
zenx-0.6.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
25
|
+
zenx-0.6.9.dist-info/entry_points.txt,sha256=8JXob2f1VtvzGFris-e9Usqywg7oca-cChDlH9moOZU,38
|
26
|
+
zenx-0.6.9.dist-info/top_level.txt,sha256=JeXwvK86d7sB-2x-avugFnZIZa33zaHWKI8RHWJR6KY,5
|
27
|
+
zenx-0.6.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|