zenx 0.6.3__tar.gz → 0.6.5__tar.gz
This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {zenx-0.6.3 → zenx-0.6.5}/PKG-INFO +1 -1
- {zenx-0.6.3 → zenx-0.6.5}/pyproject.toml +1 -1
- {zenx-0.6.3 → zenx-0.6.5}/zenx/clients/http.py +21 -2
- {zenx-0.6.3 → zenx-0.6.5}/zenx/spiders/base.py +1 -19
- {zenx-0.6.3 → zenx-0.6.5}/zenx/utils.py +12 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx.egg-info/PKG-INFO +1 -1
- {zenx-0.6.3 → zenx-0.6.5}/setup.cfg +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/cli.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/clients/__init__.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/clients/database.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/debug_runner.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/discovery.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/engine.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/exceptions.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/logger.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/pipelines/__init__.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/pipelines/base.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/pipelines/google_rpc.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/pipelines/manager.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/pipelines/preprocess.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/pipelines/websocket.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/resources/proto/__init__.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/resources/proto/feed_pb2.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/resources/proto/feed_pb2_grpc.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/settings.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx/spiders/__init__.py +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx.egg-info/SOURCES.txt +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx.egg-info/dependency_links.txt +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx.egg-info/entry_points.txt +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx.egg-info/requires.txt +0 -0
- {zenx-0.6.3 → zenx-0.6.5}/zenx.egg-info/top_level.txt +0 -0
@@ -1,4 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
+
import time
|
3
|
+
from collections import deque
|
2
4
|
import parsel
|
3
5
|
import random
|
4
6
|
from curl_cffi.requests.impersonate import BrowserTypeLiteral
|
@@ -11,7 +13,7 @@ import json
|
|
11
13
|
from structlog import BoundLogger
|
12
14
|
|
13
15
|
from zenx.settings import Settings
|
14
|
-
from zenx.utils import get_time
|
16
|
+
from zenx.utils import get_time, record_request
|
15
17
|
|
16
18
|
|
17
19
|
@dataclass
|
@@ -56,8 +58,24 @@ class HttpClient(ABC):
|
|
56
58
|
self.logger = logger
|
57
59
|
self.settings = settings
|
58
60
|
self._session_pool: asyncio.Queue
|
61
|
+
# stats
|
62
|
+
self._requests_timestamps = deque()
|
63
|
+
self._total_requests = 0
|
64
|
+
|
59
65
|
|
66
|
+
def get_stats(self) -> Dict:
|
67
|
+
# calculate RPM based on rolling window of 1 min
|
68
|
+
now = time.time()
|
69
|
+
# remove requests older than 60 sec
|
70
|
+
while self._requests_timestamps and self._requests_timestamps[0] < (now - 60):
|
71
|
+
self._requests_timestamps.popleft()
|
72
|
+
rpm = len(self._requests_timestamps)
|
73
|
+
return {
|
74
|
+
"rpm": rpm,
|
75
|
+
"total_requests": self._total_requests,
|
76
|
+
}
|
60
77
|
|
78
|
+
|
61
79
|
@abstractmethod
|
62
80
|
async def request(
|
63
81
|
self,
|
@@ -96,6 +114,7 @@ class CurlCffi(HttpClient):
|
|
96
114
|
return chosen_fingerprint
|
97
115
|
|
98
116
|
|
117
|
+
@record_request
|
99
118
|
async def request(
|
100
119
|
self,
|
101
120
|
url: str,
|
@@ -124,7 +143,7 @@ class CurlCffi(HttpClient):
|
|
124
143
|
)
|
125
144
|
recv_at = get_time()
|
126
145
|
latency = recv_at - req_at
|
127
|
-
self.logger.debug("response", status=response.status_code, url=url, impersonate=impersonate, client=self.name, requested_at=req_at, responded_at=recv_at, latency_ms=latency)
|
146
|
+
self.logger.debug("response", status=response.status_code, url=url, impersonate=session.impersonate, client=self.name, requested_at=req_at, responded_at=recv_at, latency_ms=latency)
|
128
147
|
except Exception:
|
129
148
|
self.logger.exception("request", url=url, client=self.name)
|
130
149
|
raise
|
@@ -1,7 +1,5 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
from abc import ABC, abstractmethod
|
3
|
-
from collections import deque
|
4
|
-
import time
|
5
3
|
from typing import ClassVar, Dict, List, Literal, Type
|
6
4
|
from structlog import BoundLogger
|
7
5
|
|
@@ -15,7 +13,7 @@ class Spider(ABC):
|
|
15
13
|
# central registry
|
16
14
|
name: ClassVar[str]
|
17
15
|
_registry: ClassVar[Dict[str, Type[Spider]]] = {}
|
18
|
-
pipelines: ClassVar[List[Literal["synoptic_websocket","synoptic_grpc"]]]
|
16
|
+
pipelines: ClassVar[List[Literal["preprocess","synoptic_websocket","synoptic_grpc"]]]
|
19
17
|
client_name: ClassVar[Literal["curl_cffi"]] = "curl_cffi"
|
20
18
|
|
21
19
|
|
@@ -46,22 +44,6 @@ class Spider(ABC):
|
|
46
44
|
self.pm = pm
|
47
45
|
self.logger = logger
|
48
46
|
self.settings = settings
|
49
|
-
# stats
|
50
|
-
self.requests_timestamps = deque()
|
51
|
-
self.total_requests = 0
|
52
|
-
|
53
|
-
|
54
|
-
def get_stats(self) -> Dict:
|
55
|
-
# calculate RPM based on rolling window of 1 min
|
56
|
-
now = time.time()
|
57
|
-
# remove requests older than 60 sec
|
58
|
-
while self.requests_timestamps and self.requests_timestamps[0] < (now - 60):
|
59
|
-
self.requests_timestamps.popleft()
|
60
|
-
rpm = len(self.requests_timestamps)
|
61
|
-
return {
|
62
|
-
"rpm": rpm,
|
63
|
-
"total_requests": self.total_requests,
|
64
|
-
}
|
65
47
|
|
66
48
|
|
67
49
|
@abstractmethod
|
@@ -2,6 +2,8 @@ import time
|
|
2
2
|
from typing import Dict
|
3
3
|
import functools
|
4
4
|
|
5
|
+
from zenx.clients.http import HttpClient, Response
|
6
|
+
|
5
7
|
|
6
8
|
def get_time() -> int:
|
7
9
|
""" current unix time in milliseconds """
|
@@ -17,3 +19,13 @@ def log_processing_time(func):
|
|
17
19
|
self.logger.info("processed", id=item['_id'], time_ms=processed_time, pipeline=self.name)
|
18
20
|
return result
|
19
21
|
return wrapper
|
22
|
+
|
23
|
+
|
24
|
+
def record_request(func):
|
25
|
+
@functools.wraps(func)
|
26
|
+
async def wrapper(self: HttpClient, *args, **kwargs) -> Response:
|
27
|
+
result = await func(self, *args, **kwargs)
|
28
|
+
self._requests_timestamps.append(time.time())
|
29
|
+
self._total_requests +=1
|
30
|
+
return result
|
31
|
+
return wrapper
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|