yad2-scraper 0.3.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,13 @@
  Metadata-Version: 2.3
  Name: yad2-scraper
- Version: 0.3.0
+ Version: 0.5.0
  Summary: Scrape Yad2 in Python.
  License: LICENSE
  Author: dav ost
  Author-email: davidost2003@gmail.com
- Requires-Python: >=3.7
+ Requires-Python: >=3.8
  Classifier: License :: Other/Proprietary License
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.7
  Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
@@ -1,13 +1,13 @@
  [tool.poetry]
  name = "yad2-scraper"
- version = "0.3.0"
+ version = "0.5.0"
  description = "Scrape Yad2 in Python."
  authors = ["dav ost <davidost2003@gmail.com>"]
  license = "LICENSE"
  readme = "README.md"

  [tool.poetry.dependencies]
- python = ">=3.7"
+ python = ">=3.8"
  httpx = "^0.24.0"
  httpcore = ">=0.15.0"
  fake-useragent = "^0.1.11"
@@ -19,7 +19,7 @@ ALLOW_REQUEST_REDIRECTS = True
  VERIFY_REQUEST_SSL = True

  ANTIBOT_CONTENT_IDENTIFIER = b"Are you for real" # robot-captcha
- YAD2_CONTENT_IDENTIFIER = b"https://www.yad2.co.il/"
+ PAGE_CONTENT_IDENTIFIER = b"https://www.yad2.co.il/"

  FIRST_PAGE_NUMBER = 1
  NOT_MENTIONED_PRICE_RANGE = 0, 0
@@ -0,0 +1,32 @@
+ import httpx
+ from typing import List, Union
+
+
+ class ResponseError(Exception):
+     def __init__(self, msg: str, request: httpx.Request, response: httpx.Response):
+         super().__init__(msg)
+         self.request = request
+         self.response = response
+
+
+ class AntiBotDetectedError(ResponseError):
+     pass
+
+
+ class UnexpectedContentError(ResponseError):
+     pass
+
+
+ class MaxAttemptsExceededError(Exception):
+     def __init__(self, msg: str, max_attempts: int, errors: List[BaseException] = None):
+         super().__init__(msg)
+         self.max_attempts = max_attempts
+         self.errors = errors
+
+
+ class MaxRequestAttemptsExceededError(MaxAttemptsExceededError):
+     def __init__(self, method: str, url: str, max_attempts: int, errors: List[Union[httpx.HTTPError, ResponseError]]):
+         msg = f"All {max_attempts} attempts for {method} request to '{url}' have failed"
+         super().__init__(msg, max_attempts, errors)
+         self.method = method
+         self.url = url
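A minimal caller-side sketch of the reworked exception hierarchy (illustrative only, not part of the package contents); the import path matches the scraper hunk further down, while the scraper object and URL are assumed:

    from yad2_scraper.exceptions import AntiBotDetectedError, MaxRequestAttemptsExceededError

    def fetch_or_report(scraper, url):
        try:
            return scraper.get(url)  # assumes a Yad2Scraper instance (see the scraper hunks below)
        except AntiBotDetectedError as error:
            # ResponseError subclasses now carry the originating request/response pair
            print(f"Anti-bot page served for {error.request.url}: HTTP {error.response.status_code}")
        except MaxRequestAttemptsExceededError as error:
            # Raised only when max_request_attempts > 1; aggregates every per-attempt error
            print(f"{error.method} {error.url} failed {error.max_attempts} times: {error.errors}")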
@@ -0,0 +1,50 @@
+ from datetime import datetime
+ from enum import Enum
+ from typing import List, Union
+
+ from yad2_scraper.utils import safe_access
+
+ FieldTypes = Union[str, int]
+
+ safe_access_optional_keys = safe_access(exceptions=(KeyError, TypeError), default=None)
+
+
+ class SafeAccessOptionalKeysMeta(type):
+     def __new__(cls, name, bases, dictionary):
+         for attr_name, attr_value in dictionary.items():
+             if callable(attr_value): # Wrap methods
+                 dictionary[attr_name] = safe_access_optional_keys(attr_value)
+             elif isinstance(attr_value, property): # Wrap properties
+                 dictionary[attr_name] = property(
+                     safe_access_optional_keys(attr_value.fget) if attr_value.fget else None,
+                     safe_access_optional_keys(attr_value.fset) if attr_value.fset else None,
+                     safe_access_optional_keys(attr_value.fdel) if attr_value.fdel else None,
+                     attr_value.__doc__,
+                 )
+         return super().__new__(cls, name, bases, dictionary)
+
+
+ class Field(str, Enum):
+     ID = "id"
+     TEXT = "text"
+     ENGLISH_TEXT = "textEng"
+
+
+ def convert_string_date_to_datetime(date_string: str) -> datetime:
+     return datetime.fromisoformat(date_string)
+
+
+ class NextData:
+     def __init__(self, data: dict):
+         self.data = data
+
+     @property
+     def json(self) -> dict:
+         return self.data
+
+     @property
+     def queries(self) -> List[dict]:
+         return self.data["props"]["pageProps"]["dehydratedState"]["queries"]
+
+     def __getitem__(self, item):
+         return self.data[item]
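A sketch of what SafeAccessOptionalKeysMeta does to a class, assuming it is applied exactly as above; the "Example" class and its data are illustrative only. Any wrapped method or property returns the safe_access default (None) instead of raising KeyError/TypeError on missing optional keys:

    from yad2_scraper.next_data import SafeAccessOptionalKeysMeta

    class Example(metaclass=SafeAccessOptionalKeysMeta):
        def __init__(self, data: dict):
            self.data = data

        @property
        def inner(self):
            return self.data["outer"]["inner"]

    print(Example({"outer": {"inner": 1}}).inner)  # 1
    print(Example({}).inner)                       # None - the wrapped KeyError falls back to the default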
@@ -2,7 +2,7 @@ from pydantic import BaseModel
  from enum import Enum
  from typing import Optional, Tuple

- PriceRange = Tuple[int, int]
+ NumberRange = Tuple[int, int]


  class OrderBy(int, Enum):
@@ -27,7 +27,7 @@ def format_number_range(number_range: Optional[Tuple[int, int]]) -> Optional[str
  class QueryFilters(BaseModel):
      page: Optional[int] = None
      order_by: Optional[OrderBy] = None
-     price_range: Optional[PriceRange] = None
+     price_range: Optional[NumberRange] = None
      ...

      def to_params(self) -> dict:
@@ -1,25 +1,25 @@
  import logging
  import httpx
  import time
- import random
- from typing import Optional, Dict, Any, Tuple, Union, Type, TypeVar
+ from fake_useragent import FakeUserAgent
+ from typing import Optional, Dict, Any, Callable, Union, Type, TypeVar

  from yad2_scraper.category import Yad2Category
  from yad2_scraper.query import QueryFilters
- from yad2_scraper.utils import get_random_user_agent
- from yad2_scraper.exceptions import AntiBotDetectedError, UnexpectedContentError, MaxRequestRetriesExceededError
+ from yad2_scraper.exceptions import AntiBotDetectedError, UnexpectedContentError, MaxRequestAttemptsExceededError
  from yad2_scraper.constants import (
      DEFAULT_REQUEST_HEADERS,
      ALLOW_REQUEST_REDIRECTS,
      VERIFY_REQUEST_SSL,
      ANTIBOT_CONTENT_IDENTIFIER,
-     YAD2_CONTENT_IDENTIFIER
+     PAGE_CONTENT_IDENTIFIER
  )

  Category = TypeVar("Category", bound=Yad2Category)
- DelayRange = Tuple[float, float]
+ WaitStrategy = Callable[[int], Optional[float]]
  QueryParamTypes = Union[QueryFilters, Dict[str, Any]]

+ fua = FakeUserAgent()
  logger = logging.getLogger(__name__)

@@ -28,9 +28,9 @@ class Yad2Scraper:
          self,
          client: Optional[httpx.Client] = None,
          request_defaults: Optional[Dict[str, Any]] = None,
-         randomize_user_agent: bool = False,
-         random_delay_range: Optional[DelayRange] = None,
-         max_retries: int = 0
+         randomize_user_agent: bool = True,
+         wait_strategy: Optional[WaitStrategy] = None,
+         max_request_attempts: int = 1
      ):
          self.client = client or httpx.Client(
              headers=DEFAULT_REQUEST_HEADERS,
@@ -39,16 +39,25 @@ class Yad2Scraper:
          )
          self.request_defaults = request_defaults or {}
          self.randomize_user_agent = randomize_user_agent
-         self.random_delay_range = random_delay_range
-         self.max_retries = max_retries
+         self.wait_strategy = wait_strategy
+         self.max_request_attempts = max_request_attempts

          logger.debug(f"Scraper initialized with client: {self.client}")

+     def set_user_agent(self, user_agent: str) -> None:
+         self.client.headers["User-Agent"] = user_agent
+         logger.debug(f"User-Agent client header set to: '{user_agent}'")
+
+     def set_no_script(self, no_script: bool) -> None:
+         value = "1" if no_script else "0"
+         self.client.cookies.set("noscript", value)
+         logger.debug(f"NoScript (noscript) client cookie set to: '{value}'")
+
      def fetch_category(
          self,
          url: str,
-         params: Optional[QueryParamTypes] = None,
-         category_type: Type[Category] = Yad2Category
+         category_type: Type[Category] = Yad2Category,
+         params: Optional[QueryParamTypes] = None
      ) -> Category:
          logger.debug(f"Fetching category from URL: '{url}'")
          response = self.get(url, params)
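The constructor changes above replace the old random_delay_range/max_retries pair with a WaitStrategy callable (attempt number in, optional sleep seconds out) and a total attempt count. A usage sketch, not part of the diff; the yad2_scraper.scraper module path and the backoff function are assumptions:

    from yad2_scraper.scraper import Yad2Scraper  # module path assumed, not shown in this diff

    def backoff(attempt: int):
        # WaitStrategy = Callable[[int], Optional[float]]: seconds to sleep before the given attempt
        return None if attempt == 1 else min(2 ** (attempt - 1), 30)

    scraper = Yad2Scraper(
        randomize_user_agent=True,   # now the default
        wait_strategy=backoff,       # replaces random_delay_range
        max_request_attempts=3,      # replaces max_retries; must be a positive int
    )

Note that fetch_category now takes category_type before params, so 0.3.0 callers passing both positionally need updating.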
@@ -59,72 +68,48 @@ class Yad2Scraper:
          return self.request("GET", url, params=params)

      def request(self, method: str, url: str, params: Optional[QueryParamTypes] = None) -> httpx.Response:
+         if not isinstance(self.max_request_attempts, int):
+             raise TypeError(f"max_request_attempts must be of type 'int', but got {type(self.max_request_attempts)}")
+
+         if self.max_request_attempts <= 0:
+             raise ValueError(f"max_request_attempts must be a positive integer, but got {self.max_request_attempts}")
+
          request_options = self._prepare_request_options(params=params)
+         error_list = []

-         try:
-             return self._send_request(method, url, request_options)
-         except Exception as error:
-             return self._handle_request_error(method, url, request_options, error)
+         for attempt in range(1, self.max_request_attempts + 1):
+             try:
+                 return self._send_request(method, url, request_options, attempt)
+             except Exception as error:
+                 logger.error(f"{method} request to '{url}' failed {self._format_attempt_info(attempt)}: {error}")
+                 error_list.append(error)

-     def set_user_agent(self, user_agent: str) -> None:
-         self.client.headers["User-Agent"] = user_agent
-         logger.debug(f"User-Agent client header set to: '{user_agent}'")
+         if self.max_request_attempts == 1:
+             raise error_list[0] # only one error exists, raise it

-     def set_no_script(self, no_script: bool) -> None:
-         value = "1" if no_script else "0"
-         self.client.cookies.set("noscript", value)
-         logger.debug(f"noscript client cookie set to: '{value}'")
+         max_attempts_error = MaxRequestAttemptsExceededError(method, url, self.max_request_attempts, error_list)
+         logger.error(str(max_attempts_error))
+         raise max_attempts_error from error_list[-1] # multiple errors exist, raise from the last one

      def close(self) -> None:
          logger.debug("Closing scraper client")
          self.client.close()
          logger.info("Scraper client closed")

-     def _send_request(self, method: str, url: str, request_options: Dict[str, Any]) -> httpx.Response:
+     def _send_request(self, method: str, url: str, request_options: Dict[str, Any], attempt: int) -> httpx.Response:
          if self.randomize_user_agent:
              self._set_random_user_agent(request_options)

-         if self.random_delay_range:
-             self._apply_request_delay()
+         if self.wait_strategy:
+             self._apply_wait_strategy(attempt)

-         logger.info(f"Making {method} request to URL: '{url}'")
+         logger.info(f"Sending {method} request to URL: '{url}' {self._format_attempt_info(attempt)}")
          response = self.client.request(method, url, **request_options)
-         logger.debug(f"Received response with status code: {response.status_code}")
+         logger.debug(f"Received response {response.status_code} from '{url}' {self._format_attempt_info(attempt)}")
          self._validate_response(response)

          return response

-     def _handle_request_error(
-         self,
-         method: str,
-         url: str,
-         request_options: Dict[str, Any],
-         error: Exception
-     ) -> httpx.Response:
-         logger.error(f"{method} request to '{url}' failed: {error}")
-
-         if self.max_retries == 0:
-             raise error
-
-         return self._retry_request(method, url, request_options)
-
-     def _retry_request(self, method: str, url: str, request_options: Dict[str, Any]) -> httpx.Response:
-         logger.info(f"Retrying {method} request to '{url}' (max retries: {self.max_retries})")
-
-         errors = []
-
-         for retry_attempt in range(1, self.max_retries + 1):
-             try:
-                 logger.debug(f"Retry attempt {retry_attempt}/{self.max_retries}")
-                 return self._send_request(method, url, request_options)
-             except Exception as error:
-                 logger.warning(f"Retry attempt {retry_attempt} failed: {error}")
-                 errors.append(error)
-
-         error_to_raise = MaxRequestRetriesExceededError(method, url, self.max_retries, errors)
-         logger.error(str(error_to_raise))
-         raise error_to_raise from errors[-1]
-
      def _prepare_request_options(self, params: Optional[QueryParamTypes] = None) -> Dict[str, Any]:
          logger.debug("Preparing request options from defaults")
          request_options = self.request_defaults.copy()
@@ -135,17 +120,20 @@ class Yad2Scraper:

          return request_options

-     def _apply_request_delay(self):
-         delay = random.uniform(*self.random_delay_range)
-         logger.debug(f"Applying request delay of {delay:.2f} seconds")
-         time.sleep(delay)
-
      @staticmethod
      def _set_random_user_agent(request_options: Dict[str, str]):
-         user_agent = get_random_user_agent()
+         user_agent = fua.random
          request_options.setdefault("headers", {})["User-Agent"] = user_agent
          logger.debug(f"Updated request options with random User-Agent header: '{user_agent}'")

+     def _apply_wait_strategy(self, attempt: int):
+         wait_time = self.wait_strategy(attempt)
+         if not wait_time:
+             return
+
+         logger.debug(f"Waiting {wait_time:.2f} seconds before request {self._format_attempt_info(attempt)}")
+         time.sleep(wait_time)
+
      @staticmethod
      def _validate_response(response: httpx.Response):
          response.raise_for_status()
@@ -156,15 +144,18 @@ class Yad2Scraper:
                  request=response.request,
                  response=response
              )
-         if YAD2_CONTENT_IDENTIFIER not in response.content:
+         if response.request.method == "GET" and PAGE_CONTENT_IDENTIFIER not in response.content:
              raise UnexpectedContentError(
-                 "The response does not contain yad2 content",
+                 "The GET response does not contain yad2 related content",
                  request=response.request,
                  response=response
              )

          logger.debug("Response validation succeeded")

+     def _format_attempt_info(self, attempt: int) -> str:
+         return f"(attempt {attempt}/{self.max_request_attempts})"
+
      def __enter__(self):
          logger.debug("Entering scraper context")
          return self
@@ -1,12 +1,6 @@
- from fake_useragent import FakeUserAgent
+ import functools
  from bs4 import BeautifulSoup, Tag
- from typing import Union, List
-
- fua = FakeUserAgent()
-
-
- def get_random_user_agent() -> str:
-     return fua.random
+ from typing import Union, List, Tuple, Any


  def join_url(url: str, path: str) -> str:
@@ -26,3 +20,17 @@ def find_html_tag_by_class_substring(e: Union[BeautifulSoup, Tag], tag_name: str

  def find_all_html_tags_by_class_substring(e: Union[BeautifulSoup, Tag], tag_name: str, substring: str) -> List[Tag]:
      return e.find_all(tag_name, class_=lambda class_name: class_name and substring in class_name)
+
+
+ def safe_access(exceptions: Tuple = (), default: Any = None):
+     def decorator(func):
+         @functools.wraps(func)
+         def wrapper(*args, **kwargs):
+             try:
+                 return func(*args, **kwargs)
+             except exceptions:
+                 return default
+
+         return wrapper
+
+     return decorator
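A small sketch of the new safe_access helper in isolation (function name and values are illustrative); it builds a decorator that swallows the listed exceptions and returns the given default instead:

    @safe_access(exceptions=(KeyError, IndexError), default="n/a")
    def first_name(record: dict) -> str:
        return record["names"][0]

    first_name({"names": ["dana"]})  # -> "dana"
    first_name({})                   # -> "n/a" instead of raising KeyError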
@@ -0,0 +1,5 @@
+ from .urls import VEHICLES_URL, VehicleType, get_vehicle_url
+ from .query import VehiclesQueryFilters, OrderVehiclesBy
+ from .category import Yad2VehiclesCategory
+ from .tag import VehicleTag
+ from .next_data import VehiclesNextData
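Taken together, the new vehicles subpackage suggests an end-to-end flow like the following sketch; it is not part of the diff, and the top-level import paths (yad2_scraper.scraper, yad2_scraper.vehicles) are assumptions based on the module names shown here:

    from yad2_scraper.scraper import Yad2Scraper                 # module path assumed
    from yad2_scraper.vehicles import get_vehicle_url, Yad2VehiclesCategory

    scraper = Yad2Scraper(max_request_attempts=3)
    cars_url = get_vehicle_url("cars")                            # validated against the VehicleType literal
    category = scraper.fetch_category(cars_url, Yad2VehiclesCategory)
    for tag in category.get_vehicle_tags():
        print(tag.model, tag.page_link)
    scraper.close()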
@@ -0,0 +1,15 @@
+ from typing import List, Optional
+
+ from yad2_scraper.category import Yad2Category
+ from yad2_scraper.vehicles.tag import VehicleTag
+ from yad2_scraper.vehicles.next_data import VehiclesNextData
+
+
+ class Yad2VehiclesCategory(Yad2Category):
+     def get_vehicle_tags(self) -> List[VehicleTag]:
+         tags = self.find_all_tags_by_class_substring("div", "feedItemBox")
+         return [VehicleTag(tag) for tag in tags]
+
+     def load_next_data(self) -> Optional[VehiclesNextData]:
+         next_data = super().load_next_data()
+         return VehiclesNextData(next_data) if next_data else None
@@ -0,0 +1,318 @@
+ import itertools
+ from datetime import datetime
+ from typing import List, Any, Iterator, Optional
+
+ from yad2_scraper.next_data import (
+     SafeAccessOptionalKeysMeta,
+     NextData,
+     Field,
+     FieldTypes,
+     convert_string_date_to_datetime
+ )
+ from yad2_scraper.utils import join_url
+ from yad2_scraper.vehicles.urls import VEHICLES_URL
+
+
+ class VehicleData(metaclass=SafeAccessOptionalKeysMeta):
+     def __init__(self, data: dict):
+         self.data = data
+
+     @property
+     def token(self) -> str:
+         return self["token"]
+
+     @property
+     def page_link(self) -> str:
+         return join_url(VEHICLES_URL, f"item/{self.token}")
+
+     @property
+     def price(self) -> int:
+         return self["price"]
+
+     @property
+     def customer(self) -> dict:
+         return self["customer"]
+
+     @property
+     def customer_name(self) -> str:
+         return self.customer["name"]
+
+     @property
+     def customer_phone(self) -> str:
+         return self.customer["phone"]
+
+     @property
+     def address(self) -> dict:
+         return self["address"]
+
+     def top_area(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
+         return self["address"]["topArea"][field]
+
+     def area(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
+         return self["address"]["area"][field]
+
+     def city(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
+         return self["address"]["city"][field]
+
+     @property
+     def metadata(self) -> dict:
+         return self["metaData"]
+
+     @property
+     def video(self) -> str:
+         return self.metadata["video"]
+
+     @property
+     def cover_image(self) -> str:
+         return self.metadata["coverImage"]
+
+     @property
+     def images(self) -> str:
+         return self.metadata["images"]
+
+     @property
+     def description(self) -> str:
+         return self.metadata["description"]
+
+     @property
+     def dates(self) -> dict:
+         return self["dates"]
+
+     @property
+     def updated_at(self) -> datetime:
+         return convert_string_date_to_datetime(self.dates["updatedAt"])
+
+     @property
+     def created_at(self) -> datetime:
+         return convert_string_date_to_datetime(self.dates["createdAt"])
+
+     @property
+     def ends_at(self) -> datetime:
+         return convert_string_date_to_datetime(self.dates["endsAt"])
+
+     @property
+     def rebounced_at(self) -> datetime:
+         return convert_string_date_to_datetime(self.dates["rebouncedAt"])
+
+     def manufacturer(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
+         return self["manufacturer"][field]
+
+     def color(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
+         return self["color"][field]
+
+     @property
+     def km(self) -> Optional[int]:
+         return self["km"]
+
+     @property
+     def hand(self, field: Field = Field.ID) -> Optional[FieldTypes]:
+         return self["hand"][field]
+
+     @property
+     def engine_volume(self) -> Optional[int]:
+         return self["engineVolume"]
+
+     @property
+     def horse_power(self) -> Optional[int]:
+         return self["horsePower"]
+
+     @property
+     def previous_owner(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
+         return self["previousOwner"][field]
+
+     @property
+     def above_price(self) -> Optional[int]:
+         return self["abovePrice"]
+
+     @property
+     def tags(self) -> List[dict]:
+         return self["tags"]
+
+     @property
+     def is_contact_lead_supported(self) -> Optional[bool]:
+         return self["isContactLeadSupported"]
+
+     @property
+     def vehicle_dates(self) -> dict:
+         return self["vehicleDates"]
+
+     @property
+     def year_of_production(self) -> Optional[int]:
+         return self.vehicle_dates["yearOfProduction"]
+
+     @property
+     def month_of_production(self) -> Optional[int]:
+         return self.vehicle_dates["monthOfProduction"]["id"]
+
+     @property
+     def test_date(self) -> Optional[datetime]:
+         return convert_string_date_to_datetime(self.vehicle_dates["testDate"])
+
+     def model(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
+         return self["model"][field]
+
+     @property
+     def sub_model(self) -> Optional[str]:
+         return self["subModel"]
+
+     def gear_box(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
+         return self["gearBox"][field]
+
+     def car_family_types(self, field: Field = Field.TEXT) -> Optional[List[FieldTypes]]:
+         return [obj[field] for obj in self["carFamilyType"]]
+
+     def engine_type(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
+         return self["engineType"][field]
+
+     @property
+     def seats(self) -> Optional[int]:
+         return self["seats"]
+
+     @property
+     def number_of_doors(self) -> Optional[int]:
+         return self["numberOfDoors"]
+
+     @property
+     def owner(self) -> Optional[str]:
+         return self["owner"]["text"]
+
+     @property
+     def body_type(self) -> Optional[str]:
+         return self["bodyType"]["text"]
+
+     @property
+     def combined_fuel_consumption(self) -> Optional[float]:
+         return self["combinedFuelConsumption"]
+
+     @property
+     def power_train_architecture(self) -> Optional[str]:
+         return self["powertrainArchitecture"]
+
+     def car_tags(self, field: Field = Field.TEXT) -> Optional[List[FieldTypes]]:
+         return [obj[field] for obj in self["carTag"]]
+
+     @property
+     def specification(self) -> dict:
+         return self["specification"]
+
+     @property
+     def has_air_conditioner(self) -> Optional[bool]:
+         return self.specification["airConditioner"]
+
+     @property
+     def has_power_steering(self) -> Optional[bool]:
+         return self.specification["powerSteering"]
+
+     @property
+     def has_magnesium_wheel(self) -> Optional[bool]:
+         return self.specification["magnesiumWheel"]
+
+     @property
+     def has_tire_pressure_monitoring_system(self) -> Optional[bool]:
+         return self.specification["tirePressureMonitoringSystem"]
+
+     @property
+     def has_abs(self) -> Optional[bool]:
+         return self.specification["abs"]
+
+     @property
+     def air_bags(self) -> Optional[int]:
+         return self.specification["airBags"]
+
+     @property
+     def has_control_stability(self) -> Optional[bool]:
+         return self.specification["controlStability"]
+
+     @property
+     def has_electric_window(self) -> Optional[int]:
+         return self.specification["electricWindow"]
+
+     @property
+     def has_breaking_assist_system(self) -> Optional[bool]:
+         return self.specification["breakingAssistSystem"]
+
+     @property
+     def has_reverse_camera(self) -> Optional[bool]:
+         return self.specification["reverseCamera"]
+
+     @property
+     def has_adaptive_cruise_control(self) -> Optional[bool]:
+         return self.specification["adaptiveCruiseControl"]
+
+     @property
+     def has_high_beams_auto_control(self) -> Optional[bool]:
+         return self.specification["highBeamsAutoControl"]
+
+     @property
+     def has_blind_spot_assist(self) -> Optional[bool]:
+         return self.specification["blindSpotAssist"]
+
+     @property
+     def has_identify_pedestrians(self) -> Optional[bool]:
+         return self.specification["identifyPedestrians"]
+
+     @property
+     def has_seat_belts_sensors(self) -> Optional[bool]:
+         return self.specification["seatBeltsSensors"]
+
+     @property
+     def has_identifying_dangerous_nearing(self) -> Optional[bool]:
+         return self.specification["identifyingDangerousNearing"]
+
+     @property
+     def has_auto_lighting_in_forward(self) -> Optional[bool]:
+         return self.specification["autoLightingInForward"]
+
+     @property
+     def has_identify_traffic_signs(self) -> Optional[bool]:
+         return self.specification["identifyTrafficSigns"]
+
+     def ignition(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
+         return self.specification["ignition"][field]
+
+     @property
+     def safety_points(self) -> Optional[int]:
+         return self.specification["safetyPoints"]
+
+     @property
+     def is_handicapped_friendly(self) -> Optional[bool]:
+         return self.specification["isHandicappedFriendly"]
+
+     @property
+     def has_sun_roof(self) -> Optional[bool]:
+         return self.specification["sunRoof"]
+
+     @property
+     def is_turbo(self) -> Optional[bool]:
+         return self.specification["isTurbo"]
+
+     @property
+     def has_road_deviation_control(self) -> Optional[bool]:
+         return self.specification["roadDeviationControl"]
+
+     @property
+     def has_forward_distance_monitor(self) -> Optional[bool]:
+         return self.specification["forwardDistanceMonitor"]
+
+     @property
+     def has_box(self) -> Optional[bool]:
+         return self.specification["box"]
+
+     def __getitem__(self, key: str) -> Any:
+         return self.data[key]
+
+
+ class VehiclesNextData(NextData):
+     def iterate_vehicles(self) -> Iterator[VehicleData]:
+         for query in self.queries:
+             data = query["state"].get("data")
+
+             if not data or isinstance(data, list):
+                 continue
+
+             for vehicle_data in itertools.chain.from_iterable(data.values()):
+                 if isinstance(vehicle_data, dict):
+                     yield VehicleData(vehicle_data)
+
+     def __getitem__(self, item):
+         return self.data[item]
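A sketch of consuming the Next.js payload once a vehicles category page has been loaded (illustrative, not part of the diff); "category" is the Yad2VehiclesCategory from the earlier sketch, and the accessors follow the VehicleData definitions above:

    from yad2_scraper.next_data import Field

    next_data = category.load_next_data()  # Optional[VehiclesNextData]
    if next_data:
        for vehicle in next_data.iterate_vehicles():
            print(
                vehicle.token,
                vehicle.manufacturer(Field.ENGLISH_TEXT),  # Field selects the id/text/textEng variant
                vehicle.model(),
                vehicle.price,
                vehicle.page_link,
            )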
@@ -0,0 +1,25 @@
+ from enum import Enum
+ from typing import Optional
+
+ from yad2_scraper.query import QueryFilters, OrderBy, NumberRange, format_number_range
+
+
+ class OrderVehiclesBy(int, Enum):
+     DATE = OrderBy.DATE
+     PRICE_LOWEST_TO_HIGHEST = OrderBy.PRICE_LOWEST_TO_HIGHEST
+     PRICE_HIGHEST_TO_LOWEST = OrderBy.PRICE_HIGHEST_TO_LOWEST
+     DISTANCE_LOWEST_TO_HIGHEST = 5
+     YEAR_HIGHEST_TO_LOWEST = 6
+
+
+ class VehiclesQueryFilters(QueryFilters):
+     year_range: Optional[NumberRange] = None
+     ...
+
+     def to_params(self) -> dict:
+         return {
+             **super().to_params(),
+             "year": format_number_range(self.year_range)
+         }
+
+ # TODO: add QueryParams class for each vehicle type (some share the same attributes - sometimes with different enums)
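A sketch of building query parameters with the new filters (illustrative only). Only the year mapping is asserted by this hunk; the keys produced for page, order_by, and price_range come from QueryFilters.to_params, which is not shown in this diff:

    filters = VehiclesQueryFilters(
        page=1,
        order_by=OrderVehiclesBy.PRICE_LOWEST_TO_HIGHEST,  # mirrors the base OrderBy value, so it should coerce
        price_range=(10_000, 50_000),
        year_range=(2018, 2024),
    )
    params = filters.to_params()  # adds "year" via format_number_range on top of the base params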
@@ -0,0 +1,63 @@
+ from functools import cached_property
+ from bs4 import Tag
+ from typing import Optional
+
+ from yad2_scraper.utils import join_url, find_html_tag_by_class_substring
+ from yad2_scraper.vehicles.urls import VEHICLES_URL
+
+ YEAR_AND_HAND_TAG_SEPARATOR = " • "
+
+
+ class VehicleTag:
+     def __init__(self, tag: Tag):
+         self.tag = tag
+
+     @cached_property
+     def relative_link(self) -> str:
+         return self.find_tag_by_class_substring("a", "itemLink")["href"]
+
+     @property
+     def page_link(self) -> str:
+         return join_url(VEHICLES_URL, self.relative_link)
+
+     @cached_property
+     def image_url(self) -> str:
+         return self.find_tag_by_class_substring("img", "image")["src"]
+
+     @cached_property
+     def model(self) -> str:
+         return self.find_tag_by_class_substring("span", "heading").text.strip()
+
+     @cached_property
+     def marketing_text(self) -> str:
+         return self.find_tag_by_class_substring("span", "marketingText").text.strip()
+
+     @cached_property
+     def year_and_hand_string(self) -> str:
+         return self.find_tag_by_class_substring("span", "yearAndHand").text.strip()
+
+     @property
+     def year(self) -> int:
+         year, _ = self.year_and_hand_string.split(YEAR_AND_HAND_TAG_SEPARATOR)
+         return int(year)
+
+     @property
+     def hand(self) -> int:
+         _, hand_string = self.year_and_hand_string.split(YEAR_AND_HAND_TAG_SEPARATOR)
+         _, hand = hand_string.split()
+         return int(hand)
+
+     @cached_property
+     def price_string(self) -> str:
+         return self.find_tag_by_class_substring("span", "price").text.strip()
+
+     @property
+     def price(self) -> Optional[int]:
+         try:
+             price, _ = self.price_string.split()
+             return int(price.replace(",", ""))
+         except ValueError:
+             return None
+
+     def find_tag_by_class_substring(self, tag_name: str, substring: str) -> Tag:
+         return find_html_tag_by_class_substring(self.tag, tag_name, substring)
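A sketch of reading listing fields from a feed item tag (illustrative, not part of the diff); tags come from Yad2VehiclesCategory.get_vehicle_tags() as in the earlier sketch:

    for tag in category.get_vehicle_tags():
        print(tag.model)                   # heading text
        print(tag.year, tag.hand)          # parsed from the "<year> • <hand>" string
        print(tag.price)                   # None when the price text is not a plain number
        print(tag.image_url, tag.page_link)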
@@ -0,0 +1,16 @@
+ from typing import Literal, get_args
+
+ from yad2_scraper.utils import join_url
+ from yad2_scraper.constants import BASE_URL
+
+ VEHICLES_URL = join_url(BASE_URL, "vehicles")
+
+ VehicleType = Literal["cars", "motorcycles", "scooters", "trucks", "watercraft", "others"]
+
+ _VALID_VEHICLE_TYPES = get_args(VehicleType)
+
+
+ def get_vehicle_url(vehicle_type: VehicleType) -> str:
+     if vehicle_type not in _VALID_VEHICLE_TYPES:
+         raise ValueError(f"Invalid vehicle type: {repr(vehicle_type)}. Expected one of {_VALID_VEHICLE_TYPES}")
+     return join_url(VEHICLES_URL, vehicle_type)
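For illustration (not part of the diff); BASE_URL is defined in the constants module and its value is not shown here:

    get_vehicle_url("cars")      # BASE_URL + "/vehicles/cars"
    get_vehicle_url("bicycles")  # raises ValueError: not one of the VehicleType literals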
@@ -1,31 +0,0 @@
- import httpx
- from typing import List
-
-
- class ResponseError(httpx.HTTPStatusError):
-     # This adds the request/response objects to the error
-     pass
-
-
- class AntiBotDetectedError(ResponseError):
-     pass
-
-
- class UnexpectedContentError(ResponseError):
-     pass
-
-
- class MaxRetriesExceededError(Exception):
-     def __init__(self, msg: str, errors: List[Exception] = None):
-         super().__init__(msg)
-         self.errors = errors
-
-
- class MaxRequestRetriesExceededError(MaxRetriesExceededError):
-     def __init__(self, method: str, url: str, max_retries: int, errors: List[Exception] = None):
-         self.method = method
-         self.url = url
-         self.max_retries = max_retries
-
-         msg = f"All {self.max_retries} retry attempts for {self.method} request to '{self.url}' have failed"
-         super().__init__(msg, errors)
@@ -1,27 +0,0 @@
- from enum import Enum
- from typing import List, Union
-
-
- class Field(str, Enum):
-     ID = "id"
-     TEXT = "text"
-     ENGLISH_TEXT = "textEng"
-
-
- FieldTypes = Union[str, int]
-
-
- class NextData:
-     def __init__(self, data: dict):
-         self.data = data
-
-     @property
-     def json(self) -> dict:
-         return self.data
-
-     @property
-     def queries(self) -> List[dict]:
-         return self.data["props"]["pageProps"]["dehydratedState"]["queries"]
-
-     def __getitem__(self, item):
-         return self.data[item]