yad2-scraper 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- yad2_scraper/constants.py +1 -1
- yad2_scraper/exceptions.py +13 -12
- yad2_scraper/scraper.py +59 -68
- yad2_scraper/utils.py +0 -7
- {yad2_scraper-0.3.0.dist-info → yad2_scraper-0.4.0.dist-info}/METADATA +1 -1
- yad2_scraper-0.4.0.dist-info/RECORD +12 -0
- yad2_scraper-0.3.0.dist-info/RECORD +0 -12
- {yad2_scraper-0.3.0.dist-info → yad2_scraper-0.4.0.dist-info}/LICENSE +0 -0
- {yad2_scraper-0.3.0.dist-info → yad2_scraper-0.4.0.dist-info}/WHEEL +0 -0
yad2_scraper/constants.py
CHANGED
@@ -19,7 +19,7 @@ ALLOW_REQUEST_REDIRECTS = True
 VERIFY_REQUEST_SSL = True
 
 ANTIBOT_CONTENT_IDENTIFIER = b"Are you for real" # robot-captcha
-…
+PAGE_CONTENT_IDENTIFIER = b"https://www.yad2.co.il/"
 
 FIRST_PAGE_NUMBER = 1
 NOT_MENTIONED_PRICE_RANGE = 0, 0
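Both identifiers are byte strings because `scraper.py` matches them against the raw response body. A minimal sketch of that kind of check, assuming an `httpx.Response` in hand (`looks_like_real_page` is an illustrative helper, not part of the package):

import httpx

ANTIBOT_CONTENT_IDENTIFIER = b"Are you for real"  # robot-captcha page marker
PAGE_CONTENT_IDENTIFIER = b"https://www.yad2.co.il/"  # marker of genuine Yad2 content


def looks_like_real_page(response: httpx.Response) -> bool:
    # Byte-string markers are matched against response.content (raw bytes),
    # so no decoding step is needed.
    if ANTIBOT_CONTENT_IDENTIFIER in response.content:
        return False  # the anti-bot page was served instead of real content
    return PAGE_CONTENT_IDENTIFIER in response.content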
yad2_scraper/exceptions.py
CHANGED
@@ -1,10 +1,12 @@
 import httpx
-from typing import List
+from typing import List, Union
 
 
-class ResponseError(…
-…
-…
+class ResponseError(Exception):
+    def __init__(self, msg: str, request: httpx.Request, response: httpx.Response):
+        super().__init__(msg)
+        self.request = request
+        self.response = response
 
 
 class AntiBotDetectedError(ResponseError):
@@ -15,17 +17,16 @@ class UnexpectedContentError(ResponseError):
     pass
 
 
-class …
-    def __init__(self, msg: str, errors: List[…
+class MaxAttemptsExceededError(Exception):
+    def __init__(self, msg: str, max_attempts: int, errors: List[BaseException] = None):
         super().__init__(msg)
+        self.max_attempts = max_attempts
         self.errors = errors
 
 
-class …
-    def __init__(self, method: str, url: str, …
+class MaxRequestAttemptsExceededError(MaxAttemptsExceededError):
+    def __init__(self, method: str, url: str, max_attempts: int, errors: List[Union[httpx.HTTPError, ResponseError]]):
+        msg = f"All {max_attempts} attempts for {method} request to '{url}' have failed"
+        super().__init__(msg, max_attempts, errors)
         self.method = method
         self.url = url
-        self.max_retries = max_retries
-
-        msg = f"All {self.max_retries} retry attempts for {self.method} request to '{self.url}' have failed"
-        super().__init__(msg, errors)
yad2_scraper/scraper.py
CHANGED
@@ -1,25 +1,25 @@
 import logging
 import httpx
 import time
-import …
-from typing import Optional, Dict, Any, …
+from fake_useragent import FakeUserAgent
+from typing import Optional, Dict, Any, Callable, Union, Type, TypeVar
 
 from yad2_scraper.category import Yad2Category
 from yad2_scraper.query import QueryFilters
-from yad2_scraper.…
-from yad2_scraper.exceptions import AntiBotDetectedError, UnexpectedContentError, MaxRequestRetriesExceededError
+from yad2_scraper.exceptions import AntiBotDetectedError, UnexpectedContentError, MaxRequestAttemptsExceededError
 from yad2_scraper.constants import (
     DEFAULT_REQUEST_HEADERS,
     ALLOW_REQUEST_REDIRECTS,
     VERIFY_REQUEST_SSL,
     ANTIBOT_CONTENT_IDENTIFIER,
-…
+    PAGE_CONTENT_IDENTIFIER
 )
 
 Category = TypeVar("Category", bound=Yad2Category)
-…
+WaitStrategy = Callable[[int], Optional[float]]
 QueryParamTypes = Union[QueryFilters, Dict[str, Any]]
 
+fua = FakeUserAgent()
 logger = logging.getLogger(__name__)
 
 
@@ -28,9 +28,9 @@ class Yad2Scraper:
         self,
         client: Optional[httpx.Client] = None,
         request_defaults: Optional[Dict[str, Any]] = None,
-        randomize_user_agent: bool = …
-        …
-        …
+        randomize_user_agent: bool = True,
+        wait_strategy: Optional[WaitStrategy] = None,
+        max_request_attempts: int = 1
     ):
         self.client = client or httpx.Client(
             headers=DEFAULT_REQUEST_HEADERS,
@@ -39,16 +39,25 @@ class Yad2Scraper:
         )
         self.request_defaults = request_defaults or {}
         self.randomize_user_agent = randomize_user_agent
-        self.…
-        self.…
+        self.wait_strategy = wait_strategy
+        self.max_request_attempts = max_request_attempts
 
         logger.debug(f"Scraper initialized with client: {self.client}")
 
+    def set_user_agent(self, user_agent: str) -> None:
+        self.client.headers["User-Agent"] = user_agent
+        logger.debug(f"User-Agent client header set to: '{user_agent}'")
+
+    def set_no_script(self, no_script: bool) -> None:
+        value = "1" if no_script else "0"
+        self.client.cookies.set("noscript", value)
+        logger.debug(f"NoScript (noscript) client cookie set to: '{value}'")
+
     def fetch_category(
         self,
         url: str,
-        …
-        …
+        category_type: Type[Category] = Yad2Category,
+        params: Optional[QueryParamTypes] = None
     ) -> Category:
         logger.debug(f"Fetching category from URL: '{url}'")
         response = self.get(url, params)
@@ -59,72 +68,48 @@ class Yad2Scraper:
         return self.request("GET", url, params=params)
 
     def request(self, method: str, url: str, params: Optional[QueryParamTypes] = None) -> httpx.Response:
+        if not isinstance(self.max_request_attempts, int):
+            raise TypeError(f"max_request_attempts must be of type 'int', but got {type(self.max_request_attempts)}")
+
+        if self.max_request_attempts <= 0:
+            raise ValueError(f"max_request_attempts must be a positive integer, but got {self.max_request_attempts}")
+
         request_options = self._prepare_request_options(params=params)
+        error_list = []
 
-…
-…
-…
-…
+        for attempt in range(1, self.max_request_attempts + 1):
+            try:
+                return self._send_request(method, url, request_options, attempt)
+            except Exception as error:
+                logger.error(f"{method} request to '{url}' failed {self._format_attempt_info(attempt)}: {error}")
+                error_list.append(error)
 
-…
-…
-        logger.debug(f"User-Agent client header set to: '{user_agent}'")
+        if self.max_request_attempts == 1:
+            raise error_list[0]  # only one error exists, raise it
 
-…
-…
-…
-        logger.debug(f"noscript client cookie set to: '{value}'")
+        max_attempts_error = MaxRequestAttemptsExceededError(method, url, self.max_request_attempts, error_list)
+        logger.error(str(max_attempts_error))
+        raise max_attempts_error from error_list[-1]  # multiple errors exist, raise from the last one
 
     def close(self) -> None:
         logger.debug("Closing scraper client")
         self.client.close()
         logger.info("Scraper client closed")
 
-    def _send_request(self, method: str, url: str, request_options: Dict[str, Any]) -> httpx.Response:
+    def _send_request(self, method: str, url: str, request_options: Dict[str, Any], attempt: int) -> httpx.Response:
         if self.randomize_user_agent:
             self._set_random_user_agent(request_options)
 
-        if self.…
-            self.…
+        if self.wait_strategy:
+            self._apply_wait_strategy(attempt)
 
-        logger.info(f"…
+        logger.info(f"Sending {method} request to URL: '{url}' {self._format_attempt_info(attempt)}")
         response = self.client.request(method, url, **request_options)
-        logger.debug(f"Received response …
+        logger.debug(f"Received response {response.status_code} from '{url}' {self._format_attempt_info(attempt)}")
         self._validate_response(response)
 
         return response
 
-    def _handle_request_error(
-            self,
-            method: str,
-            url: str,
-            request_options: Dict[str, Any],
-            error: Exception
-    ) -> httpx.Response:
-        logger.error(f"{method} request to '{url}' failed: {error}")
-
-        if self.max_retries == 0:
-            raise error
-
-        return self._retry_request(method, url, request_options)
-
-    def _retry_request(self, method: str, url: str, request_options: Dict[str, Any]) -> httpx.Response:
-        logger.info(f"Retrying {method} request to '{url}' (max retries: {self.max_retries})")
-
-        errors = []
-
-        for retry_attempt in range(1, self.max_retries + 1):
-            try:
-                logger.debug(f"Retry attempt {retry_attempt}/{self.max_retries}")
-                return self._send_request(method, url, request_options)
-            except Exception as error:
-                logger.warning(f"Retry attempt {retry_attempt} failed: {error}")
-                errors.append(error)
-
-        error_to_raise = MaxRequestRetriesExceededError(method, url, self.max_retries, errors)
-        logger.error(str(error_to_raise))
-        raise error_to_raise from errors[-1]
-
     def _prepare_request_options(self, params: Optional[QueryParamTypes] = None) -> Dict[str, Any]:
         logger.debug("Preparing request options from defaults")
         request_options = self.request_defaults.copy()
@@ -135,17 +120,20 @@ class Yad2Scraper:
 
         return request_options
 
-    def _apply_request_delay(self):
-        delay = random.uniform(*self.random_delay_range)
-        logger.debug(f"Applying request delay of {delay:.2f} seconds")
-        time.sleep(delay)
-
     @staticmethod
     def _set_random_user_agent(request_options: Dict[str, str]):
-        user_agent = …
+        user_agent = fua.random
        request_options.setdefault("headers", {})["User-Agent"] = user_agent
         logger.debug(f"Updated request options with random User-Agent header: '{user_agent}'")
 
+    def _apply_wait_strategy(self, attempt: int):
+        wait_time = self.wait_strategy(attempt)
+        if not wait_time:
+            return
+
+        logger.debug(f"Waiting {wait_time:.2f} seconds before request {self._format_attempt_info(attempt)}")
+        time.sleep(wait_time)
+
     @staticmethod
     def _validate_response(response: httpx.Response):
         response.raise_for_status()
@@ -156,15 +144,18 @@ class Yad2Scraper:
             request=response.request,
             response=response
         )
-        if …
+        if response.request.method == "GET" and PAGE_CONTENT_IDENTIFIER not in response.content:
             raise UnexpectedContentError(
-                "The response does not contain yad2 content",
+                "The GET response does not contain yad2 related content",
                 request=response.request,
                 response=response
             )
 
         logger.debug("Response validation succeeded")
 
+    def _format_attempt_info(self, attempt: int) -> str:
+        return f"(attempt {attempt}/{self.max_request_attempts})"
+
     def __enter__(self):
         logger.debug("Entering scraper context")
         return self
yad2_scraper/utils.py
CHANGED
@@ -1,13 +1,6 @@
-from fake_useragent import FakeUserAgent
 from bs4 import BeautifulSoup, Tag
 from typing import Union, List
 
-fua = FakeUserAgent()
-
-
-def get_random_user_agent() -> str:
-    return fua.random
-
 
 def join_url(url: str, path: str) -> str:
     return url.rstrip("/") + "/" + path.lstrip("/")
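With the user-agent helpers moved out, `join_url` is the module's main remaining utility; its slash handling, for reference (assuming the package is installed):

from yad2_scraper.utils import join_url

# The seam between url and path is normalized whether or not either side has a slash:
assert join_url("https://www.yad2.co.il/", "/vehicles/cars") == "https://www.yad2.co.il/vehicles/cars"
assert join_url("https://www.yad2.co.il", "vehicles/cars") == "https://www.yad2.co.il/vehicles/cars"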
yad2_scraper-0.4.0.dist-info/RECORD
ADDED
@@ -0,0 +1,12 @@
+yad2_scraper/__init__.py,sha256=UUiIk6TAHTAP4IY86bIR4TcY3VVMTCyEF0Sq1MSneMM,141
+yad2_scraper/category.py,sha256=KXLyjMOlPzu3xj08-uRmffAMD83DbqFVm-y1-T83Djw,910
+yad2_scraper/constants.py,sha256=8zXJ31fRqkDIOJp96BRK1PJofGXX8SG64YcfmJnVW8Q,910
+yad2_scraper/exceptions.py,sha256=_IcuDdJPKAznSUp_c3fLEuTnDdKf0NLJqpRPs0IzdXw,979
+yad2_scraper/next_data.py,sha256=-vqvXJqugk-895_kOnwb7J8kUjugg28Aqrh4Z_ct11M,512
+yad2_scraper/query.py,sha256=WaOWUlyNye9MNXv3hkiUaBFDeV9lbkvHiaDHWYKzgtY,1194
+yad2_scraper/scraper.py,sha256=sgDpfnKlBSDIWEb2enpQ5O9E5fJvXz3cDOnGXHGCJL4,6653
+yad2_scraper/utils.py,sha256=e6tqaN5Gw9BXunOQ1V919NkLrZREN7TdMsDuOvZgrcY,713
+yad2_scraper-0.4.0.dist-info/LICENSE,sha256=JCpnDxMx2kE40e0UQ1svSmifrLWg2Gni5VTkJR68thY,1065
+yad2_scraper-0.4.0.dist-info/METADATA,sha256=TDO8cS7t4aGZ-B-XikL4hSneizcb7TzLtBsV7dQGl1k,925
+yad2_scraper-0.4.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+yad2_scraper-0.4.0.dist-info/RECORD,,
yad2_scraper-0.3.0.dist-info/RECORD
DELETED
@@ -1,12 +0,0 @@
-yad2_scraper/__init__.py,sha256=UUiIk6TAHTAP4IY86bIR4TcY3VVMTCyEF0Sq1MSneMM,141
-yad2_scraper/category.py,sha256=KXLyjMOlPzu3xj08-uRmffAMD83DbqFVm-y1-T83Djw,910
-yad2_scraper/constants.py,sha256=RAikaxRILQyiNeZG-_MPAwPi83abK5sscHdzDOrFge8,910
-yad2_scraper/exceptions.py,sha256=Vewa3CmVEdH6Wok3YP2686RoIrA7myKnjDQTNEZAn7w,830
-yad2_scraper/next_data.py,sha256=-vqvXJqugk-895_kOnwb7J8kUjugg28Aqrh4Z_ct11M,512
-yad2_scraper/query.py,sha256=WaOWUlyNye9MNXv3hkiUaBFDeV9lbkvHiaDHWYKzgtY,1194
-yad2_scraper/scraper.py,sha256=QeLNFxwTQSN9Dq3zotBFnnTU5XHQrnEoWOJD3qfj2w8,6564
-yad2_scraper/utils.py,sha256=48flvJPUje3nDHL3F_C3pPw3pf3ycke0f1WoXq2cSeE,837
-yad2_scraper-0.3.0.dist-info/LICENSE,sha256=JCpnDxMx2kE40e0UQ1svSmifrLWg2Gni5VTkJR68thY,1065
-yad2_scraper-0.3.0.dist-info/METADATA,sha256=YOj8J10dvwS7fYdC6nEzu0Ea-D-CpndhzN9o5LujvNk,925
-yad2_scraper-0.3.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
-yad2_scraper-0.3.0.dist-info/RECORD,,
{yad2_scraper-0.3.0.dist-info → yad2_scraper-0.4.0.dist-info}/LICENSE
File without changes
{yad2_scraper-0.3.0.dist-info → yad2_scraper-0.4.0.dist-info}/WHEEL
File without changes