yad2-scraper 0.3.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {yad2_scraper-0.3.0 → yad2_scraper-0.5.0}/PKG-INFO +2 -3
- {yad2_scraper-0.3.0 → yad2_scraper-0.5.0}/pyproject.toml +2 -2
- {yad2_scraper-0.3.0 → yad2_scraper-0.5.0}/yad2_scraper/constants.py +1 -1
- yad2_scraper-0.5.0/yad2_scraper/exceptions.py +32 -0
- yad2_scraper-0.5.0/yad2_scraper/next_data.py +50 -0
- {yad2_scraper-0.3.0 → yad2_scraper-0.5.0}/yad2_scraper/query.py +2 -2
- {yad2_scraper-0.3.0 → yad2_scraper-0.5.0}/yad2_scraper/scraper.py +59 -68
- {yad2_scraper-0.3.0 → yad2_scraper-0.5.0}/yad2_scraper/utils.py +16 -8
- yad2_scraper-0.5.0/yad2_scraper/vehicles/__init__.py +5 -0
- yad2_scraper-0.5.0/yad2_scraper/vehicles/category.py +15 -0
- yad2_scraper-0.5.0/yad2_scraper/vehicles/next_data.py +318 -0
- yad2_scraper-0.5.0/yad2_scraper/vehicles/query.py +25 -0
- yad2_scraper-0.5.0/yad2_scraper/vehicles/tag.py +63 -0
- yad2_scraper-0.5.0/yad2_scraper/vehicles/urls.py +16 -0
- yad2_scraper-0.3.0/yad2_scraper/exceptions.py +0 -31
- yad2_scraper-0.3.0/yad2_scraper/next_data.py +0 -27
- {yad2_scraper-0.3.0 → yad2_scraper-0.5.0}/LICENSE +0 -0
- {yad2_scraper-0.3.0 → yad2_scraper-0.5.0}/README.md +0 -0
- {yad2_scraper-0.3.0 → yad2_scraper-0.5.0}/yad2_scraper/__init__.py +0 -0
- {yad2_scraper-0.3.0 → yad2_scraper-0.5.0}/yad2_scraper/category.py +0 -0
@@ -1,14 +1,13 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: yad2-scraper
|
3
|
-
Version: 0.3.0
|
3
|
+
Version: 0.5.0
|
4
4
|
Summary: Scrape Yad2 in Python.
|
5
5
|
License: LICENSE
|
6
6
|
Author: dav ost
|
7
7
|
Author-email: davidost2003@gmail.com
|
8
|
-
Requires-Python: >=3.7
|
8
|
+
Requires-Python: >=3.8
|
9
9
|
Classifier: License :: Other/Proprietary License
|
10
10
|
Classifier: Programming Language :: Python :: 3
|
11
|
-
Classifier: Programming Language :: Python :: 3.7
|
12
11
|
Classifier: Programming Language :: Python :: 3.8
|
13
12
|
Classifier: Programming Language :: Python :: 3.9
|
14
13
|
Classifier: Programming Language :: Python :: 3.10
|
@@ -1,13 +1,13 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "yad2-scraper"
|
3
|
-
version = "0.3.0"
|
3
|
+
version = "0.5.0"
|
4
4
|
description = "Scrape Yad2 in Python."
|
5
5
|
authors = ["dav ost <davidost2003@gmail.com>"]
|
6
6
|
license = "LICENSE"
|
7
7
|
readme = "README.md"
|
8
8
|
|
9
9
|
[tool.poetry.dependencies]
|
10
|
-
python = ">=3.7"
|
10
|
+
python = ">=3.8"
|
11
11
|
httpx = "^0.24.0"
|
12
12
|
httpcore = ">=0.15.0"
|
13
13
|
fake-useragent = "^0.1.11"
|
@@ -19,7 +19,7 @@ ALLOW_REQUEST_REDIRECTS = True
|
|
19
19
|
VERIFY_REQUEST_SSL = True
|
20
20
|
|
21
21
|
ANTIBOT_CONTENT_IDENTIFIER = b"Are you for real" # robot-captcha
|
22
|
-
|
22
|
+
PAGE_CONTENT_IDENTIFIER = b"https://www.yad2.co.il/"
|
23
23
|
|
24
24
|
FIRST_PAGE_NUMBER = 1
|
25
25
|
NOT_MENTIONED_PRICE_RANGE = 0, 0
|
@@ -0,0 +1,32 @@
|
|
1
|
+
import httpx
|
2
|
+
from typing import List, Union
|
3
|
+
|
4
|
+
|
5
|
+
class ResponseError(Exception):
    """Base error for a response that was received but is not usable.

    Only the message is forwarded to ``Exception``; the request and the
    response it produced are exposed as the ``request`` and ``response``
    attributes so handlers can inspect them.
    """

    def __init__(self, msg: str, request: httpx.Request, response: httpx.Response):
        super().__init__(msg)
        # Keep both HTTP objects reachable from the exception instance.
        self.response = response
        self.request = request
|
10
|
+
|
11
|
+
|
12
|
+
class AntiBotDetectedError(ResponseError):
    """Marker subclass of ``ResponseError`` for responses identified as an anti-bot challenge.

    Adds no behaviour of its own; it exists so callers can catch this case specifically.
    """
|
14
|
+
|
15
|
+
|
16
|
+
class UnexpectedContentError(ResponseError):
    """Marker subclass of ``ResponseError`` for responses lacking the expected page content.

    Adds no behaviour of its own; it exists so callers can catch this case specifically.
    """
|
18
|
+
|
19
|
+
|
20
|
+
def _rebuild_exception(cls, args, state):
    """Recreate an exception from its ``args`` and attributes without calling ``__init__``."""
    error = cls.__new__(cls)
    error.args = args
    error.__dict__.update(state)
    return error


class MaxAttemptsExceededError(Exception):
    """Raised when an operation has used up all of its allowed attempts.

    Attributes:
        max_attempts: The number of attempts that were allowed.
        errors: The errors collected from the failed attempts, or ``None``
            when the caller did not supply any.
    """

    def __init__(self, msg: str, max_attempts: int, errors: Union[List[BaseException], None] = None):
        super().__init__(msg)
        self.max_attempts = max_attempts
        self.errors = errors

    def __reduce__(self):
        # The default ``BaseException.__reduce__`` replays only ``self.args``
        # (the message) into the constructor, which fails here because
        # ``max_attempts`` is required. Rebuilding from args + attributes keeps
        # the exception picklable/copyable, and also works for subclasses whose
        # constructors take different parameters.
        return _rebuild_exception, (type(self), self.args, self.__dict__.copy())
|
25
|
+
|
26
|
+
|
27
|
+
class MaxRequestAttemptsExceededError(MaxAttemptsExceededError):
    """Raised when every attempt of one HTTP request has failed.

    Builds the error message from the request method, URL and attempt count,
    and additionally exposes ``method`` and ``url`` as attributes.
    """

    def __init__(self, method: str, url: str, max_attempts: int, errors: List[Union[httpx.HTTPError, ResponseError]]):
        self.method = method
        self.url = url
        super().__init__(
            f"All {max_attempts} attempts for {method} request to '{url}' have failed",
            max_attempts,
            errors,
        )
|
@@ -0,0 +1,50 @@
|
|
1
|
+
from datetime import datetime
|
2
|
+
from enum import Enum
|
3
|
+
from typing import List, Union
|
4
|
+
|
5
|
+
from yad2_scraper.utils import safe_access
|
6
|
+
|
7
|
+
FieldTypes = Union[str, int]
|
8
|
+
|
9
|
+
safe_access_optional_keys = safe_access(exceptions=(KeyError, TypeError), default=None)
|
10
|
+
|
11
|
+
|
12
|
+
class SafeAccessOptionalKeysMeta(type):
    """Metaclass that wraps a class's accessors with ``safe_access_optional_keys``.

    Functions, static/class methods and property accessors defined directly
    in the class body are wrapped, so a ``KeyError`` or ``TypeError`` raised
    inside them yields ``None`` instead of propagating. Attributes inherited
    from base classes are not touched.
    """

    def __new__(cls, name, bases, dictionary):
        # Iterate over a snapshot so the namespace is not mutated mid-iteration.
        for attr_name, attr_value in list(dictionary.items()):
            dictionary[attr_name] = cls._wrap_attribute(attr_value)
        return super().__new__(cls, name, bases, dictionary)

    @staticmethod
    def _wrap_attribute(attr_value):
        """Return the safe-access version of one class-body attribute."""
        if isinstance(attr_value, property):  # Wrap each accessor of a property
            return property(
                safe_access_optional_keys(attr_value.fget) if attr_value.fget else None,
                safe_access_optional_keys(attr_value.fset) if attr_value.fset else None,
                safe_access_optional_keys(attr_value.fdel) if attr_value.fdel else None,
                attr_value.__doc__,
            )

        if isinstance(attr_value, (staticmethod, classmethod)):
            # staticmethod objects are callable since Python 3.10; wrapping the
            # descriptor itself would turn it into a plain function that
            # receives ``self``. Wrap the underlying function and restore the
            # descriptor type instead.
            return type(attr_value)(safe_access_optional_keys(attr_value.__func__))

        if isinstance(attr_value, type):
            # Nested classes are callable too, but must remain classes.
            return attr_value

        if callable(attr_value):  # Wrap regular methods
            return safe_access_optional_keys(attr_value)

        return attr_value
|
25
|
+
|
26
|
+
|
27
|
+
class Field(str, Enum):
    """String-valued keys naming the sub-fields that can be read from an entry.

    Members are ``str`` instances, so they can be used directly as dictionary keys.
    """

    ID = "id"
    TEXT = "text"
    ENGLISH_TEXT = "textEng"
|
31
|
+
|
32
|
+
|
33
|
+
def convert_string_date_to_datetime(date_string: str) -> datetime:
    """Parse an ISO 8601 date string into a ``datetime``.

    A trailing ``"Z"`` (UTC designator) is accepted on every supported Python
    version and produces a UTC-aware result.

    Raises:
        TypeError: If ``date_string`` is not a string.
        ValueError: If ``date_string`` is not a valid ISO 8601 date.
    """
    # ``datetime.fromisoformat`` only understands the "Z" suffix from
    # Python 3.11 on; rewrite it as an explicit offset so older interpreters
    # parse it to the same value. Non-string input is passed through untouched
    # so it still fails with the original ``TypeError``.
    if isinstance(date_string, str) and date_string.endswith("Z"):
        date_string = f"{date_string[:-1]}+00:00"

    return datetime.fromisoformat(date_string)
|
35
|
+
|
36
|
+
|
37
|
+
class NextData:
    """Thin wrapper around a decoded page-data dictionary.

    Supports ``obj[key]`` lookups on the wrapped dictionary and offers a
    shortcut to the dehydrated-state query list.
    """

    def __init__(self, data: dict):
        self.data = data

    @property
    def json(self) -> dict:
        """The wrapped dictionary itself (same object, not a copy)."""
        return self.data

    @property
    def queries(self) -> List[dict]:
        """The list stored under props -> pageProps -> dehydratedState -> queries."""
        page_props = self.data["props"]["pageProps"]
        dehydrated_state = page_props["dehydratedState"]
        return dehydrated_state["queries"]

    def __getitem__(self, item):
        payload = self.data
        return payload[item]
|
@@ -2,7 +2,7 @@ from pydantic import BaseModel
|
|
2
2
|
from enum import Enum
|
3
3
|
from typing import Optional, Tuple
|
4
4
|
|
5
|
-
|
5
|
+
NumberRange = Tuple[int, int]
|
6
6
|
|
7
7
|
|
8
8
|
class OrderBy(int, Enum):
|
@@ -27,7 +27,7 @@ def format_number_range(number_range: Optional[Tuple[int, int]]) -> Optional[str
|
|
27
27
|
class QueryFilters(BaseModel):
|
28
28
|
page: Optional[int] = None
|
29
29
|
order_by: Optional[OrderBy] = None
|
30
|
-
price_range: Optional[
|
30
|
+
price_range: Optional[NumberRange] = None
|
31
31
|
...
|
32
32
|
|
33
33
|
def to_params(self) -> dict:
|
@@ -1,25 +1,25 @@
|
|
1
1
|
import logging
|
2
2
|
import httpx
|
3
3
|
import time
|
4
|
-
import
|
5
|
-
from typing import Optional, Dict, Any,
|
4
|
+
from fake_useragent import FakeUserAgent
|
5
|
+
from typing import Optional, Dict, Any, Callable, Union, Type, TypeVar
|
6
6
|
|
7
7
|
from yad2_scraper.category import Yad2Category
|
8
8
|
from yad2_scraper.query import QueryFilters
|
9
|
-
from yad2_scraper.
|
10
|
-
from yad2_scraper.exceptions import AntiBotDetectedError, UnexpectedContentError, MaxRequestRetriesExceededError
|
9
|
+
from yad2_scraper.exceptions import AntiBotDetectedError, UnexpectedContentError, MaxRequestAttemptsExceededError
|
11
10
|
from yad2_scraper.constants import (
|
12
11
|
DEFAULT_REQUEST_HEADERS,
|
13
12
|
ALLOW_REQUEST_REDIRECTS,
|
14
13
|
VERIFY_REQUEST_SSL,
|
15
14
|
ANTIBOT_CONTENT_IDENTIFIER,
|
16
|
-
|
15
|
+
PAGE_CONTENT_IDENTIFIER
|
17
16
|
)
|
18
17
|
|
19
18
|
Category = TypeVar("Category", bound=Yad2Category)
|
20
|
-
|
19
|
+
WaitStrategy = Callable[[int], Optional[float]]
|
21
20
|
QueryParamTypes = Union[QueryFilters, Dict[str, Any]]
|
22
21
|
|
22
|
+
fua = FakeUserAgent()
|
23
23
|
logger = logging.getLogger(__name__)
|
24
24
|
|
25
25
|
|
@@ -28,9 +28,9 @@ class Yad2Scraper:
|
|
28
28
|
self,
|
29
29
|
client: Optional[httpx.Client] = None,
|
30
30
|
request_defaults: Optional[Dict[str, Any]] = None,
|
31
|
-
randomize_user_agent: bool =
|
32
|
-
|
33
|
-
|
31
|
+
randomize_user_agent: bool = True,
|
32
|
+
wait_strategy: Optional[WaitStrategy] = None,
|
33
|
+
max_request_attempts: int = 1
|
34
34
|
):
|
35
35
|
self.client = client or httpx.Client(
|
36
36
|
headers=DEFAULT_REQUEST_HEADERS,
|
@@ -39,16 +39,25 @@ class Yad2Scraper:
|
|
39
39
|
)
|
40
40
|
self.request_defaults = request_defaults or {}
|
41
41
|
self.randomize_user_agent = randomize_user_agent
|
42
|
-
self.
|
43
|
-
self.
|
42
|
+
self.wait_strategy = wait_strategy
|
43
|
+
self.max_request_attempts = max_request_attempts
|
44
44
|
|
45
45
|
logger.debug(f"Scraper initialized with client: {self.client}")
|
46
46
|
|
47
|
+
def set_user_agent(self, user_agent: str) -> None:
    """Store ``user_agent`` as the client's ``User-Agent`` header and log the change."""
    client_headers = self.client.headers
    client_headers["User-Agent"] = user_agent
    logger.debug(f"User-Agent client header set to: '{user_agent}'")
|
50
|
+
|
51
|
+
def set_no_script(self, no_script: bool) -> None:
    """Set the client's ``noscript`` cookie to ``"1"`` (truthy flag) or ``"0"`` and log it."""
    if no_script:
        value = "1"
    else:
        value = "0"

    self.client.cookies.set("noscript", value)
    logger.debug(f"NoScript (noscript) client cookie set to: '{value}'")
|
55
|
+
|
47
56
|
def fetch_category(
|
48
57
|
self,
|
49
58
|
url: str,
|
50
|
-
|
51
|
-
|
59
|
+
category_type: Type[Category] = Yad2Category,
|
60
|
+
params: Optional[QueryParamTypes] = None
|
52
61
|
) -> Category:
|
53
62
|
logger.debug(f"Fetching category from URL: '{url}'")
|
54
63
|
response = self.get(url, params)
|
@@ -59,72 +68,48 @@ class Yad2Scraper:
|
|
59
68
|
return self.request("GET", url, params=params)
|
60
69
|
|
61
70
|
def request(self, method: str, url: str, params: Optional[QueryParamTypes] = None) -> httpx.Response:
|
71
|
+
if not isinstance(self.max_request_attempts, int):
|
72
|
+
raise TypeError(f"max_request_attempts must be of type 'int', but got {type(self.max_request_attempts)}")
|
73
|
+
|
74
|
+
if self.max_request_attempts <= 0:
|
75
|
+
raise ValueError(f"max_request_attempts must be a positive integer, but got {self.max_request_attempts}")
|
76
|
+
|
62
77
|
request_options = self._prepare_request_options(params=params)
|
78
|
+
error_list = []
|
63
79
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
80
|
+
for attempt in range(1, self.max_request_attempts + 1):
|
81
|
+
try:
|
82
|
+
return self._send_request(method, url, request_options, attempt)
|
83
|
+
except Exception as error:
|
84
|
+
logger.error(f"{method} request to '{url}' failed {self._format_attempt_info(attempt)}: {error}")
|
85
|
+
error_list.append(error)
|
68
86
|
|
69
|
-
|
70
|
-
|
71
|
-
logger.debug(f"User-Agent client header set to: '{user_agent}'")
|
87
|
+
if self.max_request_attempts == 1:
|
88
|
+
raise error_list[0] # only one error exists, raise it
|
72
89
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
logger.debug(f"noscript client cookie set to: '{value}'")
|
90
|
+
max_attempts_error = MaxRequestAttemptsExceededError(method, url, self.max_request_attempts, error_list)
|
91
|
+
logger.error(str(max_attempts_error))
|
92
|
+
raise max_attempts_error from error_list[-1] # multiple errors exist, raise from the last one
|
77
93
|
|
78
94
|
def close(self) -> None:
    """Close the underlying client, logging before and after."""
    logger.debug("Closing scraper client")
    client = self.client
    client.close()
    logger.info("Scraper client closed")
|
82
98
|
|
83
|
-
def _send_request(self, method: str, url: str, request_options: Dict[str, Any]) -> httpx.Response:
|
99
|
+
def _send_request(self, method: str, url: str, request_options: Dict[str, Any], attempt: int) -> httpx.Response:
|
84
100
|
if self.randomize_user_agent:
|
85
101
|
self._set_random_user_agent(request_options)
|
86
102
|
|
87
|
-
if self.
|
88
|
-
self.
|
103
|
+
if self.wait_strategy:
|
104
|
+
self._apply_wait_strategy(attempt)
|
89
105
|
|
90
|
-
logger.info(f"
|
106
|
+
logger.info(f"Sending {method} request to URL: '{url}' {self._format_attempt_info(attempt)}")
|
91
107
|
response = self.client.request(method, url, **request_options)
|
92
|
-
logger.debug(f"Received response
|
108
|
+
logger.debug(f"Received response {response.status_code} from '{url}' {self._format_attempt_info(attempt)}")
|
93
109
|
self._validate_response(response)
|
94
110
|
|
95
111
|
return response
|
96
112
|
|
97
|
-
def _handle_request_error(
|
98
|
-
self,
|
99
|
-
method: str,
|
100
|
-
url: str,
|
101
|
-
request_options: Dict[str, Any],
|
102
|
-
error: Exception
|
103
|
-
) -> httpx.Response:
|
104
|
-
logger.error(f"{method} request to '{url}' failed: {error}")
|
105
|
-
|
106
|
-
if self.max_retries == 0:
|
107
|
-
raise error
|
108
|
-
|
109
|
-
return self._retry_request(method, url, request_options)
|
110
|
-
|
111
|
-
def _retry_request(self, method: str, url: str, request_options: Dict[str, Any]) -> httpx.Response:
|
112
|
-
logger.info(f"Retrying {method} request to '{url}' (max retries: {self.max_retries})")
|
113
|
-
|
114
|
-
errors = []
|
115
|
-
|
116
|
-
for retry_attempt in range(1, self.max_retries + 1):
|
117
|
-
try:
|
118
|
-
logger.debug(f"Retry attempt {retry_attempt}/{self.max_retries}")
|
119
|
-
return self._send_request(method, url, request_options)
|
120
|
-
except Exception as error:
|
121
|
-
logger.warning(f"Retry attempt {retry_attempt} failed: {error}")
|
122
|
-
errors.append(error)
|
123
|
-
|
124
|
-
error_to_raise = MaxRequestRetriesExceededError(method, url, self.max_retries, errors)
|
125
|
-
logger.error(str(error_to_raise))
|
126
|
-
raise error_to_raise from errors[-1]
|
127
|
-
|
128
113
|
def _prepare_request_options(self, params: Optional[QueryParamTypes] = None) -> Dict[str, Any]:
|
129
114
|
logger.debug("Preparing request options from defaults")
|
130
115
|
request_options = self.request_defaults.copy()
|
@@ -135,17 +120,20 @@ class Yad2Scraper:
|
|
135
120
|
|
136
121
|
return request_options
|
137
122
|
|
138
|
-
def _apply_request_delay(self):
|
139
|
-
delay = random.uniform(*self.random_delay_range)
|
140
|
-
logger.debug(f"Applying request delay of {delay:.2f} seconds")
|
141
|
-
time.sleep(delay)
|
142
|
-
|
143
123
|
@staticmethod
|
144
124
|
def _set_random_user_agent(request_options: Dict[str, str]):
|
145
|
-
user_agent =
|
125
|
+
user_agent = fua.random
|
146
126
|
request_options.setdefault("headers", {})["User-Agent"] = user_agent
|
147
127
|
logger.debug(f"Updated request options with random User-Agent header: '{user_agent}'")
|
148
128
|
|
129
|
+
def _apply_wait_strategy(self, attempt: int):
    """Sleep for the duration the configured wait strategy returns for ``attempt``.

    Nothing happens when the strategy returns a falsy value (``None`` or ``0``).
    """
    wait_time = self.wait_strategy(attempt)

    if wait_time:
        logger.debug(f"Waiting {wait_time:.2f} seconds before request {self._format_attempt_info(attempt)}")
        time.sleep(wait_time)
|
136
|
+
|
149
137
|
@staticmethod
|
150
138
|
def _validate_response(response: httpx.Response):
|
151
139
|
response.raise_for_status()
|
@@ -156,15 +144,18 @@ class Yad2Scraper:
|
|
156
144
|
request=response.request,
|
157
145
|
response=response
|
158
146
|
)
|
159
|
-
if
|
147
|
+
if response.request.method == "GET" and PAGE_CONTENT_IDENTIFIER not in response.content:
|
160
148
|
raise UnexpectedContentError(
|
161
|
-
"The response does not contain yad2 content",
|
149
|
+
"The GET response does not contain yad2 related content",
|
162
150
|
request=response.request,
|
163
151
|
response=response
|
164
152
|
)
|
165
153
|
|
166
154
|
logger.debug("Response validation succeeded")
|
167
155
|
|
156
|
+
def _format_attempt_info(self, attempt: int) -> str:
    """Return the ``(attempt X/Y)`` suffix used in log messages, Y being ``max_request_attempts``."""
    total_attempts = self.max_request_attempts
    return f"(attempt {attempt}/{total_attempts})"
|
158
|
+
|
168
159
|
def __enter__(self):
|
169
160
|
logger.debug("Entering scraper context")
|
170
161
|
return self
|
@@ -1,12 +1,6 @@
|
|
1
|
-
|
1
|
+
import functools
|
2
2
|
from bs4 import BeautifulSoup, Tag
|
3
|
-
from typing import Union, List
|
4
|
-
|
5
|
-
fua = FakeUserAgent()
|
6
|
-
|
7
|
-
|
8
|
-
def get_random_user_agent() -> str:
|
9
|
-
return fua.random
|
3
|
+
from typing import Union, List, Tuple, Any
|
10
4
|
|
11
5
|
|
12
6
|
def join_url(url: str, path: str) -> str:
|
@@ -26,3 +20,17 @@ def find_html_tag_by_class_substring(e: Union[BeautifulSoup, Tag], tag_name: str
|
|
26
20
|
|
27
21
|
def find_all_html_tags_by_class_substring(e: Union[BeautifulSoup, Tag], tag_name: str, substring: str) -> List[Tag]:
|
28
22
|
return e.find_all(tag_name, class_=lambda class_name: class_name and substring in class_name)
|
23
|
+
|
24
|
+
|
25
|
+
def safe_access(exceptions: Tuple = (), default: Any = None):
    """Create a decorator that converts selected exceptions into a fallback value.

    Args:
        exceptions: Exception types to intercept; the empty default intercepts nothing.
        default: Value returned in place of an intercepted exception.

    Returns:
        A decorator whose wrapped function returns the original result, or
        ``default`` when one of ``exceptions`` was raised.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                outcome = func(*args, **kwargs)
            except exceptions:
                outcome = default
            return outcome

        return wrapper

    return decorator
|
@@ -0,0 +1,15 @@
|
|
1
|
+
from typing import List, Optional
|
2
|
+
|
3
|
+
from yad2_scraper.category import Yad2Category
|
4
|
+
from yad2_scraper.vehicles.tag import VehicleTag
|
5
|
+
from yad2_scraper.vehicles.next_data import VehiclesNextData
|
6
|
+
|
7
|
+
|
8
|
+
class Yad2VehiclesCategory(Yad2Category):
    """Category page specialised for vehicle listings."""

    def get_vehicle_tags(self) -> List[VehicleTag]:
        """Wrap every ``div`` whose class contains ``feedItemBox`` in a ``VehicleTag``."""
        feed_items = self.find_all_tags_by_class_substring("div", "feedItemBox")
        return list(map(VehicleTag, feed_items))

    def load_next_data(self) -> Optional[VehiclesNextData]:
        """Return the parent's next-data wrapped as ``VehiclesNextData``, or ``None`` if it is falsy."""
        raw_next_data = super().load_next_data()
        if not raw_next_data:
            return None
        return VehiclesNextData(raw_next_data)
|
@@ -0,0 +1,318 @@
|
|
1
|
+
import itertools
|
2
|
+
from datetime import datetime
|
3
|
+
from typing import List, Any, Iterator, Optional
|
4
|
+
|
5
|
+
from yad2_scraper.next_data import (
|
6
|
+
SafeAccessOptionalKeysMeta,
|
7
|
+
NextData,
|
8
|
+
Field,
|
9
|
+
FieldTypes,
|
10
|
+
convert_string_date_to_datetime
|
11
|
+
)
|
12
|
+
from yad2_scraper.utils import join_url
|
13
|
+
from yad2_scraper.vehicles.urls import VEHICLES_URL
|
14
|
+
|
15
|
+
|
16
|
+
class VehicleData(metaclass=SafeAccessOptionalKeysMeta):
    """Read-only view over the raw dictionary describing one vehicle listing.

    Because of ``SafeAccessOptionalKeysMeta`` every accessor defined here
    (including ``__getitem__``) returns ``None`` instead of raising when a
    key is missing (``KeyError``) or an intermediate value is not
    subscriptable (``TypeError``). Other exceptions, such as the
    ``ValueError`` raised for a malformed date string, still propagate.

    Accessors that take a ``field`` argument are regular methods; it selects
    which sub-field (``Field.ID``, ``Field.TEXT`` or ``Field.ENGLISH_TEXT``)
    of the entry is returned.
    """

    def __init__(self, data: dict):
        self.data = data

    @property
    def token(self) -> Optional[str]:
        return self["token"]

    @property
    def page_link(self) -> Optional[str]:
        """URL of the listing page, or ``None`` when the listing has no token."""
        token = self.token
        if token is None:
            # Without a token the URL would end in "item/None"; report absence instead.
            return None
        return join_url(VEHICLES_URL, f"item/{token}")

    @property
    def price(self) -> Optional[int]:
        return self["price"]

    # --- customer -----------------------------------------------------------

    @property
    def customer(self) -> Optional[dict]:
        return self["customer"]

    @property
    def customer_name(self) -> Optional[str]:
        return self.customer["name"]

    @property
    def customer_phone(self) -> Optional[str]:
        return self.customer["phone"]

    # --- address ------------------------------------------------------------

    @property
    def address(self) -> Optional[dict]:
        return self["address"]

    def top_area(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
        return self["address"]["topArea"][field]

    def area(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
        return self["address"]["area"][field]

    def city(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
        return self["address"]["city"][field]

    # --- metadata -----------------------------------------------------------

    @property
    def metadata(self) -> Optional[dict]:
        return self["metaData"]

    @property
    def video(self) -> Optional[str]:
        return self.metadata["video"]

    @property
    def cover_image(self) -> Optional[str]:
        return self.metadata["coverImage"]

    @property
    def images(self) -> Optional[str]:
        # NOTE(review): annotated as str in the original; the key name suggests
        # a collection - confirm against real data before tightening the type.
        return self.metadata["images"]

    @property
    def description(self) -> Optional[str]:
        return self.metadata["description"]

    # --- listing dates ------------------------------------------------------

    @property
    def dates(self) -> Optional[dict]:
        return self["dates"]

    @property
    def updated_at(self) -> Optional[datetime]:
        return convert_string_date_to_datetime(self.dates["updatedAt"])

    @property
    def created_at(self) -> Optional[datetime]:
        return convert_string_date_to_datetime(self.dates["createdAt"])

    @property
    def ends_at(self) -> Optional[datetime]:
        return convert_string_date_to_datetime(self.dates["endsAt"])

    @property
    def rebounced_at(self) -> Optional[datetime]:
        return convert_string_date_to_datetime(self.dates["rebouncedAt"])

    # --- general vehicle details ---------------------------------------------

    def manufacturer(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
        return self["manufacturer"][field]

    def color(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
        return self["color"][field]

    @property
    def km(self) -> Optional[int]:
        return self["km"]

    @property
    def hand(self) -> Optional[FieldTypes]:
        """The ``id`` sub-field of the ``hand`` entry.

        A property getter cannot receive extra arguments, so the sub-field is
        fixed to ``Field.ID`` (the value the original default always selected).
        """
        return self["hand"][Field.ID]

    @property
    def engine_volume(self) -> Optional[int]:
        return self["engineVolume"]

    @property
    def horse_power(self) -> Optional[int]:
        return self["horsePower"]

    @property
    def previous_owner(self) -> Optional[FieldTypes]:
        """The ``text`` sub-field of the ``previousOwner`` entry.

        A property getter cannot receive extra arguments, so the sub-field is
        fixed to ``Field.TEXT`` (the value the original default always selected).
        """
        return self["previousOwner"][Field.TEXT]

    @property
    def above_price(self) -> Optional[int]:
        return self["abovePrice"]

    @property
    def tags(self) -> Optional[List[dict]]:
        return self["tags"]

    @property
    def is_contact_lead_supported(self) -> Optional[bool]:
        return self["isContactLeadSupported"]

    # --- vehicle dates --------------------------------------------------------

    @property
    def vehicle_dates(self) -> Optional[dict]:
        return self["vehicleDates"]

    @property
    def year_of_production(self) -> Optional[int]:
        return self.vehicle_dates["yearOfProduction"]

    @property
    def month_of_production(self) -> Optional[int]:
        return self.vehicle_dates["monthOfProduction"]["id"]

    @property
    def test_date(self) -> Optional[datetime]:
        return convert_string_date_to_datetime(self.vehicle_dates["testDate"])

    # --- model and body ---------------------------------------------------------

    def model(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
        return self["model"][field]

    @property
    def sub_model(self) -> Optional[str]:
        return self["subModel"]

    def gear_box(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
        return self["gearBox"][field]

    def car_family_types(self, field: Field = Field.TEXT) -> Optional[List[FieldTypes]]:
        return [obj[field] for obj in self["carFamilyType"]]

    def engine_type(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
        return self["engineType"][field]

    @property
    def seats(self) -> Optional[int]:
        return self["seats"]

    @property
    def number_of_doors(self) -> Optional[int]:
        return self["numberOfDoors"]

    @property
    def owner(self) -> Optional[str]:
        return self["owner"]["text"]

    @property
    def body_type(self) -> Optional[str]:
        return self["bodyType"]["text"]

    @property
    def combined_fuel_consumption(self) -> Optional[float]:
        return self["combinedFuelConsumption"]

    @property
    def power_train_architecture(self) -> Optional[str]:
        return self["powertrainArchitecture"]

    def car_tags(self, field: Field = Field.TEXT) -> Optional[List[FieldTypes]]:
        return [obj[field] for obj in self["carTag"]]

    # --- specification ------------------------------------------------------------

    @property
    def specification(self) -> Optional[dict]:
        return self["specification"]

    @property
    def has_air_conditioner(self) -> Optional[bool]:
        return self.specification["airConditioner"]

    @property
    def has_power_steering(self) -> Optional[bool]:
        return self.specification["powerSteering"]

    @property
    def has_magnesium_wheel(self) -> Optional[bool]:
        return self.specification["magnesiumWheel"]

    @property
    def has_tire_pressure_monitoring_system(self) -> Optional[bool]:
        return self.specification["tirePressureMonitoringSystem"]

    @property
    def has_abs(self) -> Optional[bool]:
        return self.specification["abs"]

    @property
    def air_bags(self) -> Optional[int]:
        return self.specification["airBags"]

    @property
    def has_control_stability(self) -> Optional[bool]:
        return self.specification["controlStability"]

    @property
    def has_electric_window(self) -> Optional[int]:
        return self.specification["electricWindow"]

    @property
    def has_breaking_assist_system(self) -> Optional[bool]:
        return self.specification["breakingAssistSystem"]

    @property
    def has_reverse_camera(self) -> Optional[bool]:
        return self.specification["reverseCamera"]

    @property
    def has_adaptive_cruise_control(self) -> Optional[bool]:
        return self.specification["adaptiveCruiseControl"]

    @property
    def has_high_beams_auto_control(self) -> Optional[bool]:
        return self.specification["highBeamsAutoControl"]

    @property
    def has_blind_spot_assist(self) -> Optional[bool]:
        return self.specification["blindSpotAssist"]

    @property
    def has_identify_pedestrians(self) -> Optional[bool]:
        return self.specification["identifyPedestrians"]

    @property
    def has_seat_belts_sensors(self) -> Optional[bool]:
        return self.specification["seatBeltsSensors"]

    @property
    def has_identifying_dangerous_nearing(self) -> Optional[bool]:
        return self.specification["identifyingDangerousNearing"]

    @property
    def has_auto_lighting_in_forward(self) -> Optional[bool]:
        return self.specification["autoLightingInForward"]

    @property
    def has_identify_traffic_signs(self) -> Optional[bool]:
        return self.specification["identifyTrafficSigns"]

    def ignition(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
        return self.specification["ignition"][field]

    @property
    def safety_points(self) -> Optional[int]:
        return self.specification["safetyPoints"]

    @property
    def is_handicapped_friendly(self) -> Optional[bool]:
        return self.specification["isHandicappedFriendly"]

    @property
    def has_sun_roof(self) -> Optional[bool]:
        return self.specification["sunRoof"]

    @property
    def is_turbo(self) -> Optional[bool]:
        return self.specification["isTurbo"]

    @property
    def has_road_deviation_control(self) -> Optional[bool]:
        return self.specification["roadDeviationControl"]

    @property
    def has_forward_distance_monitor(self) -> Optional[bool]:
        return self.specification["forwardDistanceMonitor"]

    @property
    def has_box(self) -> Optional[bool]:
        return self.specification["box"]

    def __getitem__(self, key: str) -> Any:
        # Wrapped by the metaclass as well: a missing key yields None here.
        return self.data[key]
|
303
|
+
|
304
|
+
|
305
|
+
class VehiclesNextData(NextData):
    """``NextData`` variant that can enumerate the vehicles found in the query states."""

    def iterate_vehicles(self) -> Iterator[VehicleData]:
        """Yield a ``VehicleData`` for every dictionary found inside the queries' data.

        Queries whose ``state["data"]`` is missing, falsy or a list are
        skipped. For the rest, each value of the data mapping is iterated and
        only its dictionary elements are yielded.
        """
        for query in self.queries:
            data = query["state"].get("data")

            if isinstance(data, list) or not data:
                continue

            for group in data.values():
                for candidate in group:
                    if isinstance(candidate, dict):
                        yield VehicleData(candidate)

    def __getitem__(self, item):
        payload = self.data
        return payload[item]
|
@@ -0,0 +1,25 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
from yad2_scraper.query import QueryFilters, OrderBy, NumberRange, format_number_range
|
5
|
+
|
6
|
+
|
7
|
+
class OrderVehiclesBy(int, Enum):
    """Integer sort-order codes for vehicle queries.

    The first three members reuse the generic ``OrderBy`` values; the last
    two are defined only here.
    """

    # Shared with the generic ordering enum.
    DATE = OrderBy.DATE
    PRICE_LOWEST_TO_HIGHEST = OrderBy.PRICE_LOWEST_TO_HIGHEST
    PRICE_HIGHEST_TO_LOWEST = OrderBy.PRICE_HIGHEST_TO_LOWEST

    # Vehicle-specific orderings.
    DISTANCE_LOWEST_TO_HIGHEST = 5
    YEAR_HIGHEST_TO_LOWEST = 6
|
13
|
+
|
14
|
+
|
15
|
+
class VehiclesQueryFilters(QueryFilters):
    """Query filters for vehicles: the base filters plus a production-year range."""

    year_range: Optional[NumberRange] = None
    ...

    def to_params(self) -> dict:
        """Return the base parameters with ``"year"`` set to the formatted year range."""
        params = dict(super().to_params())
        params["year"] = format_number_range(self.year_range)
        return params
|
24
|
+
|
25
|
+
# TODO: add QueryParams class for each vehicle type (some share the same attributes - sometimes with different enums)
|
@@ -0,0 +1,63 @@
|
|
1
|
+
from functools import cached_property
|
2
|
+
from bs4 import Tag
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from yad2_scraper.utils import join_url, find_html_tag_by_class_substring
|
6
|
+
from yad2_scraper.vehicles.urls import VEHICLES_URL
|
7
|
+
|
8
|
+
YEAR_AND_HAND_TAG_SEPARATOR = " • "
|
9
|
+
|
10
|
+
|
11
|
+
class VehicleTag:
    """Accessor over the HTML tag of a single vehicle feed item.

    Values read from the markup are cached after the first access
    (``cached_property``); values derived from them are recomputed each time.
    """

    def __init__(self, tag: Tag):
        self.tag = tag

    @cached_property
    def relative_link(self) -> str:
        """``href`` of the ``a`` element whose class contains ``itemLink``."""
        anchor = self.find_tag_by_class_substring("a", "itemLink")
        return anchor["href"]

    @property
    def page_link(self) -> str:
        """``relative_link`` joined onto the vehicles base URL."""
        return join_url(VEHICLES_URL, self.relative_link)

    @cached_property
    def image_url(self) -> str:
        """``src`` of the ``img`` element whose class contains ``image``."""
        image = self.find_tag_by_class_substring("img", "image")
        return image["src"]

    @cached_property
    def model(self) -> str:
        heading = self.find_tag_by_class_substring("span", "heading")
        return heading.text.strip()

    @cached_property
    def marketing_text(self) -> str:
        marketing = self.find_tag_by_class_substring("span", "marketingText")
        return marketing.text.strip()

    @cached_property
    def year_and_hand_string(self) -> str:
        """Stripped text of the ``yearAndHand`` span, parsed by ``year`` and ``hand``."""
        year_and_hand = self.find_tag_by_class_substring("span", "yearAndHand")
        return year_and_hand.text.strip()

    @property
    def year(self) -> int:
        """Integer value of the part before the separator in ``year_and_hand_string``."""
        year_part, _ = self.year_and_hand_string.split(YEAR_AND_HAND_TAG_SEPARATOR)
        return int(year_part)

    @property
    def hand(self) -> int:
        """Integer value of the second word after the separator in ``year_and_hand_string``."""
        _, hand_part = self.year_and_hand_string.split(YEAR_AND_HAND_TAG_SEPARATOR)
        _, hand_number = hand_part.split()
        return int(hand_number)

    @cached_property
    def price_string(self) -> str:
        price_tag = self.find_tag_by_class_substring("span", "price")
        return price_tag.text.strip()

    @property
    def price(self) -> Optional[int]:
        """Price as an integer, or ``None`` when the text is not '<number> <suffix>'."""
        try:
            amount, _ = self.price_string.split()
            return int(amount.replace(",", ""))
        except ValueError:
            # Raised by a failed unpack or by int(): treated as "no price".
            return None

    def find_tag_by_class_substring(self, tag_name: str, substring: str) -> Tag:
        """Delegate to ``find_html_tag_by_class_substring`` on this item's tag."""
        return find_html_tag_by_class_substring(self.tag, tag_name, substring)
|
@@ -0,0 +1,16 @@
|
|
1
|
+
from typing import Literal, get_args
|
2
|
+
|
3
|
+
from yad2_scraper.utils import join_url
|
4
|
+
from yad2_scraper.constants import BASE_URL
|
5
|
+
|
6
|
+
VEHICLES_URL = join_url(BASE_URL, "vehicles")
|
7
|
+
|
8
|
+
VehicleType = Literal["cars", "motorcycles", "scooters", "trucks", "watercraft", "others"]
|
9
|
+
|
10
|
+
_VALID_VEHICLE_TYPES = get_args(VehicleType)
|
11
|
+
|
12
|
+
|
13
|
+
def get_vehicle_url(vehicle_type: VehicleType) -> str:
    """Return the vehicles URL extended with ``vehicle_type``.

    Raises:
        ValueError: If ``vehicle_type`` is not one of the ``VehicleType`` literals.
    """
    if vehicle_type in _VALID_VEHICLE_TYPES:
        return join_url(VEHICLES_URL, vehicle_type)

    raise ValueError(f"Invalid vehicle type: {repr(vehicle_type)}. Expected one of {_VALID_VEHICLE_TYPES}")
|
@@ -1,31 +0,0 @@
|
|
1
|
-
import httpx
|
2
|
-
from typing import List
|
3
|
-
|
4
|
-
|
5
|
-
class ResponseError(httpx.HTTPStatusError):
|
6
|
-
# This adds the request/response objects to the error
|
7
|
-
pass
|
8
|
-
|
9
|
-
|
10
|
-
class AntiBotDetectedError(ResponseError):
|
11
|
-
pass
|
12
|
-
|
13
|
-
|
14
|
-
class UnexpectedContentError(ResponseError):
|
15
|
-
pass
|
16
|
-
|
17
|
-
|
18
|
-
class MaxRetriesExceededError(Exception):
|
19
|
-
def __init__(self, msg: str, errors: List[Exception] = None):
|
20
|
-
super().__init__(msg)
|
21
|
-
self.errors = errors
|
22
|
-
|
23
|
-
|
24
|
-
class MaxRequestRetriesExceededError(MaxRetriesExceededError):
|
25
|
-
def __init__(self, method: str, url: str, max_retries: int, errors: List[Exception] = None):
|
26
|
-
self.method = method
|
27
|
-
self.url = url
|
28
|
-
self.max_retries = max_retries
|
29
|
-
|
30
|
-
msg = f"All {self.max_retries} retry attempts for {self.method} request to '{self.url}' have failed"
|
31
|
-
super().__init__(msg, errors)
|
@@ -1,27 +0,0 @@
|
|
1
|
-
from enum import Enum
|
2
|
-
from typing import List, Union
|
3
|
-
|
4
|
-
|
5
|
-
class Field(str, Enum):
|
6
|
-
ID = "id"
|
7
|
-
TEXT = "text"
|
8
|
-
ENGLISH_TEXT = "textEng"
|
9
|
-
|
10
|
-
|
11
|
-
FieldTypes = Union[str, int]
|
12
|
-
|
13
|
-
|
14
|
-
class NextData:
|
15
|
-
def __init__(self, data: dict):
|
16
|
-
self.data = data
|
17
|
-
|
18
|
-
@property
|
19
|
-
def json(self) -> dict:
|
20
|
-
return self.data
|
21
|
-
|
22
|
-
@property
|
23
|
-
def queries(self) -> List[dict]:
|
24
|
-
return self.data["props"]["pageProps"]["dehydratedState"]["queries"]
|
25
|
-
|
26
|
-
def __getitem__(self, item):
|
27
|
-
return self.data[item]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|