yad2-scraper 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
yad2_scraper/__init__.py CHANGED
@@ -1,4 +1,99 @@
1
- from .scraper import Yad2Scraper
2
- from .query import QueryFilters, OrderBy
1
+ from typing import Optional, Type
2
+
3
+ from .scraper import Yad2Scraper, Category
4
+ from .query import QueryFilters, OrderBy, NumberRange
3
5
  from .category import Yad2Category
4
- from .next_data import NextData
6
+ from .next_data import NextData, Field
7
+ from .utils import any_param_specified
8
+ from .vehicles import (
9
+ Yad2VehiclesCategory,
10
+ VehiclesQueryFilters,
11
+ OrderVehiclesBy,
12
+ VehicleCategory,
13
+ get_vehicle_category_url
14
+ )
15
+
16
+ _default_scraper = None
17
+
18
+
19
+ def get_default_scraper() -> Yad2Scraper:
20
+ """
21
+ Retrieves the default instance of the Yad2Scraper. If an instance does not already exist, it will be created.
22
+
23
+ Returns:
24
+ Yad2Scraper: The default instance of the Yad2Scraper.
25
+
26
+ Notes:
27
+ The default scraper is a singleton instance that is reused across multiple calls.
28
+ """
29
+ global _default_scraper
30
+
31
+ if not _default_scraper:
32
+ _default_scraper = Yad2Scraper()
33
+
34
+ return _default_scraper
35
+
36
+
37
+ def fetch_category(
38
+ url: str,
39
+ category_type: Type[Category] = Yad2Category,
40
+ page: Optional[int] = None,
41
+ order_by: Optional[OrderBy] = None,
42
+ price_range: Optional[NumberRange] = None
43
+ ) -> Category:
44
+ """
45
+ Fetches a specific category from the given URL, while applying optional filters.
46
+
47
+ Args:
48
+ url (str): The URL of the category to fetch.
49
+ category_type (Type[Category], optional): The type of category to return (default is `Yad2Category`).
50
+ page (Optional[int], optional): The page number for pagination (default is None).
51
+ order_by (Optional[OrderBy], optional): The sorting order for the results (default is None).
52
+ price_range (Optional[NumberRange], optional): The price range filter for the results (default is None).
53
+
54
+ Returns:
55
+ Category: An instance of the specified `category_type`, populated with the fetched data.
56
+
57
+ Notes:
58
+ This method uses the default scraper to retrieve the category.
59
+ """
60
+ if any_param_specified(page, order_by, price_range):
61
+ params = QueryFilters(page=page, order_by=order_by, price_range=price_range)
62
+ else:
63
+ params = None
64
+
65
+ default_scraper = get_default_scraper()
66
+ return default_scraper.fetch_category(url, category_type, params=params)
67
+
68
+
69
+ def fetch_vehicle_category(
70
+ vehicle_category: VehicleCategory,
71
+ page: Optional[int] = None,
72
+ order_by: Optional[OrderVehiclesBy] = None,
73
+ price_range: Optional[NumberRange] = None,
74
+ year_range: Optional[NumberRange] = None
75
+ ) -> Yad2VehiclesCategory:
76
+ """
77
+ Fetches a specific vehicle category, while applying optional filters.
78
+
79
+ Args:
80
+ vehicle_category (VehicleCategory): The vehicle category to fetch.
81
+ page (Optional[int], optional): The page number for pagination (default is None).
82
+ order_by (Optional[OrderVehiclesBy], optional): The sorting order for the results (default is None).
83
+ price_range (Optional[NumberRange], optional): The price range filter for the results (default is None).
84
+ year_range (Optional[NumberRange], optional): The year range filter for the results (default is None).
85
+
86
+ Returns:
87
+ Yad2VehiclesCategory: An instance of `Yad2VehiclesCategory`, populated with the fetched vehicle category data.
88
+
89
+ Notes:
90
+ This method uses the default scraper to fetch the vehicle category.
91
+ """
92
+ if any_param_specified(page, order_by, price_range, year_range):
93
+ params = VehiclesQueryFilters(page=page, order_by=order_by, price_range=price_range, year_range=year_range)
94
+ else:
95
+ params = None
96
+
97
+ url = get_vehicle_category_url(vehicle_category)
98
+ default_scraper = get_default_scraper()
99
+ return default_scraper.fetch_category(url, Yad2VehiclesCategory, params=params)
yad2_scraper/category.py CHANGED
@@ -8,18 +8,24 @@ from yad2_scraper.constants import NEXT_DATA_SCRIPT_ID
8
8
 
9
9
 
10
10
  class Yad2Category:
11
+ """Represents a Yad2 category parsed from an HTML page."""
12
+
11
13
  def __init__(self, soup: BeautifulSoup):
14
+ """Initialize with a BeautifulSoup object."""
12
15
  self.soup = soup
13
16
 
14
17
  @classmethod
15
18
  def from_html_io(cls, html_io: Union[TextIO, BinaryIO]):
19
+ """Create an instance from an HTML file-like object."""
16
20
  html = html_io.read()
17
21
  soup = BeautifulSoup(html, "html.parser")
18
22
  return cls(soup)
19
23
 
20
24
  def load_next_data(self) -> Optional[NextData]:
25
+ """Extract and parse Next.js data from the page."""
21
26
  tag = self.soup.find("script", id=NEXT_DATA_SCRIPT_ID)
22
27
  return NextData(json.loads(tag.string)) if tag else None
23
28
 
24
29
  def find_all_tags_by_class_substring(self, tag_name: str, substring: str) -> List[Tag]:
30
+ """Find all HTML tags with a class containing the given substring."""
25
31
  return find_all_html_tags_by_class_substring(self.soup, tag_name, substring)
@@ -3,6 +3,8 @@ from typing import List, Union
3
3
 
4
4
 
5
5
  class ResponseError(Exception):
6
+ """Represents an error response from an HTTP request."""
7
+
6
8
  def __init__(self, msg: str, request: httpx.Request, response: httpx.Response):
7
9
  super().__init__(msg)
8
10
  self.request = request
@@ -10,14 +12,18 @@ class ResponseError(Exception):
10
12
 
11
13
 
12
14
  class AntiBotDetectedError(ResponseError):
15
+ """Raised when an anti-bot mechanism is detected."""
13
16
  pass
14
17
 
15
18
 
16
19
  class UnexpectedContentError(ResponseError):
20
+ """Raised when the response content is not as expected."""
17
21
  pass
18
22
 
19
23
 
20
24
  class MaxAttemptsExceededError(Exception):
25
+ """Raised when the maximum number of attempts is exceeded."""
26
+
21
27
  def __init__(self, msg: str, max_attempts: int, errors: List[BaseException] = None):
22
28
  super().__init__(msg)
23
29
  self.max_attempts = max_attempts
@@ -25,6 +31,8 @@ class MaxAttemptsExceededError(Exception):
25
31
 
26
32
 
27
33
  class MaxRequestAttemptsExceededError(MaxAttemptsExceededError):
34
+ """Raised when all HTTP request attempts fail."""
35
+
28
36
  def __init__(self, method: str, url: str, max_attempts: int, errors: List[Union[httpx.HTTPError, ResponseError]]):
29
37
  msg = f"All {max_attempts} attempts for {method} request to '{url}' have failed"
30
38
  super().__init__(msg, max_attempts, errors)
yad2_scraper/next_data.py CHANGED
@@ -6,45 +6,55 @@ from yad2_scraper.utils import safe_access
6
6
 
7
7
  FieldTypes = Union[str, int]
8
8
 
9
- safe_access_optional_keys = safe_access(exceptions=(KeyError, TypeError), default=None)
9
+ _safe_access_optional_keys = safe_access(exceptions=(KeyError, TypeError), default=None)
10
10
 
11
11
 
12
12
  class SafeAccessOptionalKeysMeta(type):
13
+ """Metaclass that wraps methods and properties with safe access handling."""
14
+
13
15
  def __new__(cls, name, bases, dictionary):
14
16
  for attr_name, attr_value in dictionary.items():
15
17
  if callable(attr_value): # Wrap methods
16
- dictionary[attr_name] = safe_access_optional_keys(attr_value)
18
+ dictionary[attr_name] = _safe_access_optional_keys(attr_value)
17
19
  elif isinstance(attr_value, property): # Wrap properties
18
20
  dictionary[attr_name] = property(
19
- safe_access_optional_keys(attr_value.fget) if attr_value.fget else None,
20
- safe_access_optional_keys(attr_value.fset) if attr_value.fset else None,
21
- safe_access_optional_keys(attr_value.fdel) if attr_value.fdel else None,
21
+ _safe_access_optional_keys(attr_value.fget) if attr_value.fget else None,
22
+ _safe_access_optional_keys(attr_value.fset) if attr_value.fset else None,
23
+ _safe_access_optional_keys(attr_value.fdel) if attr_value.fdel else None,
22
24
  attr_value.__doc__,
23
25
  )
24
26
  return super().__new__(cls, name, bases, dictionary)
25
27
 
26
28
 
27
29
  class Field(str, Enum):
30
+ """Enum representing different field types for data."""
28
31
  ID = "id"
29
32
  TEXT = "text"
30
33
  ENGLISH_TEXT = "textEng"
31
34
 
32
35
 
33
36
  def convert_string_date_to_datetime(date_string: str) -> datetime:
37
+ """Convert an ISO format string to a datetime object."""
34
38
  return datetime.fromisoformat(date_string)
35
39
 
36
40
 
37
41
  class NextData:
42
+ """Represents structured Next.js data."""
43
+
38
44
  def __init__(self, data: dict):
45
+ """Initialize with Next.js data dictionary."""
39
46
  self.data = data
40
47
 
41
48
  @property
42
49
  def json(self) -> dict:
50
+ """Return raw JSON data."""
43
51
  return self.data
44
52
 
45
53
  @property
46
54
  def queries(self) -> List[dict]:
55
+ """Extract query data from Next.js state."""
47
56
  return self.data["props"]["pageProps"]["dehydratedState"]["queries"]
48
57
 
49
58
  def __getitem__(self, item):
59
+ """Allow dictionary-style access to data."""
50
60
  return self.data[item]
yad2_scraper/query.py CHANGED
@@ -6,6 +6,7 @@ NumberRange = Tuple[int, int]
6
6
 
7
7
 
8
8
  class OrderBy(int, Enum):
9
+ """Enum representing different order options for sorting."""
9
10
  DATE = 1
10
11
  PRICE_LOWEST_TO_HIGHEST = 3
11
12
  PRICE_HIGHEST_TO_LOWEST = 4
@@ -13,6 +14,7 @@ class OrderBy(int, Enum):
13
14
 
14
15
 
15
16
  def format_number_range(number_range: Optional[Tuple[int, int]]) -> Optional[str]:
17
+ """Format a number range as 'min_value-max_value'."""
16
18
  if number_range is None:
17
19
  return None
18
20
 
@@ -25,12 +27,13 @@ def format_number_range(number_range: Optional[Tuple[int, int]]) -> Optional[str
25
27
 
26
28
 
27
29
  class QueryFilters(BaseModel):
30
+ """Pydantic model representing query filters for querying a resource."""
28
31
  page: Optional[int] = None
29
32
  order_by: Optional[OrderBy] = None
30
33
  price_range: Optional[NumberRange] = None
31
- ...
32
34
 
33
35
  def to_params(self) -> dict:
36
+ """Convert filter fields to query parameters."""
34
37
  return {
35
38
  "page": self.page,
36
39
  "Order": self.order_by,
@@ -38,9 +41,9 @@ class QueryFilters(BaseModel):
38
41
  }
39
42
 
40
43
  def to_clean_params(self):
44
+ """Return query parameters excluding None values."""
41
45
  return {key: value for key, value in self.to_params().items() if value is not None}
42
46
 
43
- # TODO: add helper methods for managing the attribute values
44
-
45
47
  def __iter__(self):
48
+ """Allow iteration over the clean query parameters."""
46
49
  yield from self.to_clean_params().items()
yad2_scraper/scraper.py CHANGED
@@ -24,6 +24,8 @@ logger = logging.getLogger(__name__)
24
24
 
25
25
 
26
26
  class Yad2Scraper:
27
+ """A scraper for fetching data from the Yad2 website, with robust features"""
28
+
27
29
  def __init__(
28
30
  self,
29
31
  client: Optional[httpx.Client] = None,
@@ -32,6 +34,16 @@ class Yad2Scraper:
32
34
  wait_strategy: Optional[WaitStrategy] = None,
33
35
  max_request_attempts: int = 1
34
36
  ):
37
+ """
38
+ Initializes the Yad2Scraper with provided parameters.
39
+
40
+ Args:
41
+ client (Optional[httpx.Client]): An optional custom HTTP client. If not provided, a default client is used.
42
+ request_defaults (Optional[Dict[str, Any]]): Default parameters for requests such as headers, params, etc.
43
+ randomize_user_agent (bool): If True, a random User-Agent will be set for each request. Defaults to True.
44
+ wait_strategy (Optional[WaitStrategy]): A function to determine the wait time between requests.
45
+ max_request_attempts (int): The maximum number of retry attempts for failed requests. Defaults to 1.
46
+ """
35
47
  self.client = client or httpx.Client(
36
48
  headers=DEFAULT_REQUEST_HEADERS,
37
49
  follow_redirects=ALLOW_REQUEST_REDIRECTS,
@@ -41,14 +53,32 @@ class Yad2Scraper:
41
53
  self.randomize_user_agent = randomize_user_agent
42
54
  self.wait_strategy = wait_strategy
43
55
  self.max_request_attempts = max_request_attempts
56
+ self._request_count = 0
44
57
 
45
58
  logger.debug(f"Scraper initialized with client: {self.client}")
46
59
 
60
+ @property
61
+ def request_count(self) -> int:
62
+ """Returns the number of requests made by the scraper so far."""
63
+ return self._request_count
64
+
47
65
  def set_user_agent(self, user_agent: str) -> None:
66
+ """
67
+ Sets the User-Agent header for requests.
68
+
69
+ Args:
70
+ user_agent (str): The User-Agent string to be used in HTTP requests.
71
+ """
48
72
  self.client.headers["User-Agent"] = user_agent
49
73
  logger.debug(f"User-Agent client header set to: '{user_agent}'")
50
74
 
51
75
  def set_no_script(self, no_script: bool) -> None:
76
+ """
77
+ Sets the "noscript" cookie in the client's cookies to control JavaScript content.
78
+
79
+ Args:
80
+ no_script (bool): If True, the "noscript" cookie is set to "1". If False, it's set to "0".
81
+ """
52
82
  value = "1" if no_script else "0"
53
83
  self.client.cookies.set("noscript", value)
54
84
  logger.debug(f"NoScript (noscript) client cookie set to: '{value}'")
@@ -56,18 +86,44 @@ class Yad2Scraper:
56
86
  def fetch_category(
57
87
  self,
58
88
  url: str,
59
- category_type: Type[Category] = Yad2Category,
89
+ category_type: Type[Category],
60
90
  params: Optional[QueryParamTypes] = None
61
91
  ) -> Category:
92
+ """
93
+ Fetches and returns a category page from a given URL.
94
+
95
+ Args:
96
+ url (str): The URL of the category page.
97
+ category_type (Type[Category]): The class type of the category to be fetched.
98
+ params (Optional[QueryParamTypes]): Query parameters to be included in the request.
99
+
100
+ Returns:
101
+ Category: The fetched category, parsed from HTML.
102
+ """
62
103
  logger.debug(f"Fetching category from URL: '{url}'")
63
104
  response = self.get(url, params)
64
105
  logger.debug(f"Category fetched successfully from URL: '{url}'")
65
106
  return category_type.from_html_io(response)
66
107
 
67
108
  def get(self, url: str, params: Optional[QueryParamTypes] = None) -> httpx.Response:
109
+ """Sends a GET request to the specified URL."""
68
110
  return self.request("GET", url, params=params)
69
111
 
70
112
  def request(self, method: str, url: str, params: Optional[QueryParamTypes] = None) -> httpx.Response:
113
+ """
114
+ Sends an HTTP request with multiple attempts logic.
115
+
116
+ Args:
117
+ method (str): The HTTP method (e.g., "GET", "POST").
118
+ url (str): The URL to send the request to.
119
+ params (Optional[QueryParamTypes]): Query parameters to be included in the request.
120
+
121
+ Returns:
122
+ httpx.Response: The HTTP response object.
123
+
124
+ Raises:
125
+ MaxRequestAttemptsExceededError: If the request exceeds the maximum number of attempts.
126
+ """
71
127
  if not isinstance(self.max_request_attempts, int):
72
128
  raise TypeError(f"max_request_attempts must be of type 'int', but got {type(self.max_request_attempts)}")
73
129
 
@@ -92,11 +148,28 @@ class Yad2Scraper:
92
148
  raise max_attempts_error from error_list[-1] # multiple errors exist, raise from the last one
93
149
 
94
150
  def close(self) -> None:
151
+ """Closes the HTTP client and logs the closure."""
95
152
  logger.debug("Closing scraper client")
96
153
  self.client.close()
97
154
  logger.info("Scraper client closed")
98
155
 
99
156
  def _send_request(self, method: str, url: str, request_options: Dict[str, Any], attempt: int) -> httpx.Response:
157
+ """
158
+ Sends an HTTP request with the specified method to the given URL, applying all necessary actions.
159
+
160
+ Args:
161
+ method (str): The HTTP method (e.g., 'GET', 'POST').
162
+ url (str): The target URL for the request.
163
+ request_options (Dict[str, Any]): Additional request options, including headers and parameters.
164
+ attempt (int): The current attempt number for the request.
165
+
166
+ Returns:
167
+ httpx.Response: The HTTP response object received from the server.
168
+
169
+ Raises:
170
+ AntiBotDetectedError: If the response contains Anti-Bot content.
171
+ UnexpectedContentError: If a GET request does not contain expected content.
172
+ """
100
173
  if self.randomize_user_agent:
101
174
  self._set_random_user_agent(request_options)
102
175
 
@@ -105,12 +178,22 @@ class Yad2Scraper:
105
178
 
106
179
  logger.info(f"Sending {method} request to URL: '{url}' {self._format_attempt_info(attempt)}")
107
180
  response = self.client.request(method, url, **request_options)
181
+ self._request_count += 1
108
182
  logger.debug(f"Received response {response.status_code} from '{url}' {self._format_attempt_info(attempt)}")
109
183
  self._validate_response(response)
110
184
 
111
185
  return response
112
186
 
113
187
  def _prepare_request_options(self, params: Optional[QueryParamTypes] = None) -> Dict[str, Any]:
188
+ """
189
+ Prepares the request options to be passed to the HTTP client's request method, based on the default options.
190
+
191
+ Args:
192
+ params (Optional[QueryParamTypes]): Optional query parameters to include in the request.
193
+
194
+ Returns:
195
+ Dict[str, Any]: A dictionary of the request options, including headers and query parameters.
196
+ """
114
197
  logger.debug("Preparing request options from defaults")
115
198
  request_options = self.request_defaults.copy()
116
199
 
@@ -122,11 +205,23 @@ class Yad2Scraper:
122
205
 
123
206
  @staticmethod
124
207
  def _set_random_user_agent(request_options: Dict[str, str]):
208
+ """
209
+ Sets a random User-Agent header in the request options.
210
+
211
+ Args:
212
+ request_options (Dict[str, str]): The request options to update with the random User-Agent.
213
+ """
125
214
  user_agent = fua.random
126
215
  request_options.setdefault("headers", {})["User-Agent"] = user_agent
127
216
  logger.debug(f"Updated request options with random User-Agent header: '{user_agent}'")
128
217
 
129
218
  def _apply_wait_strategy(self, attempt: int):
219
+ """
220
+ Applies a wait time before making a request based on the wait strategy for the given attempt.
221
+
222
+ Args:
223
+ attempt (int): The current attempt number to calculate the wait time.
224
+ """
130
225
  wait_time = self.wait_strategy(attempt)
131
226
  if not wait_time:
132
227
  return
@@ -136,6 +231,17 @@ class Yad2Scraper:
136
231
 
137
232
  @staticmethod
138
233
  def _validate_response(response: httpx.Response):
234
+ """
235
+ Validates the response to ensure it is successful.
236
+
237
+ Args:
238
+ response (httpx.Response): The HTTP response object to validate.
239
+
240
+ Raises:
241
+ httpx.HTTPStatusError: If a status error occurred.
242
+ AntiBotDetectedError: If the response contains Anti-Bot content.
243
+ UnexpectedContentError: If a GET response does not contain expected content.
244
+ """
139
245
  response.raise_for_status()
140
246
 
141
247
  if ANTIBOT_CONTENT_IDENTIFIER in response.content:
@@ -154,12 +260,35 @@ class Yad2Scraper:
154
260
  logger.debug("Response validation succeeded")
155
261
 
156
262
  def _format_attempt_info(self, attempt: int) -> str:
263
+ """
264
+ Formats a string representing the current attempt number and total attempt count.
265
+
266
+ Args:
267
+ attempt (int): The current attempt number.
268
+
269
+ Returns:
270
+ str: A formatted string representing the attempt info, e.g., "(attempt 1/5)".
271
+ """
157
272
  return f"(attempt {attempt}/{self.max_request_attempts})"
158
273
 
159
274
  def __enter__(self):
275
+ """
276
+ Prepares the scraper to be used in a `with` statement, allowing for resource management.
277
+
278
+ Returns:
279
+ Yad2Scraper: The scraper instance to be used within the `with` block.
280
+ """
160
281
  logger.debug("Entering scraper context")
161
282
  return self
162
283
 
163
284
  def __exit__(self, exc_type, exc_val, exc_tb):
285
+ """
286
+ Cleans up resources and closes the scraper client when exiting the `with` statement.
287
+
288
+ Args:
289
+ exc_type: The exception type (if any).
290
+ exc_val: The exception value (if any).
291
+ exc_tb: The traceback object (if any).
292
+ """
164
293
  logger.debug("Exiting scraper context")
165
294
  self.close()
yad2_scraper/utils.py CHANGED
@@ -2,27 +2,30 @@ import functools
2
2
  from bs4 import BeautifulSoup, Tag
3
3
  from typing import Union, List, Tuple, Any
4
4
 
5
+ def any_param_specified(*params: Any) -> bool:
6
+ """Check if any parameter is not None."""
7
+ return any(param is not None for param in params)
5
8
 
6
9
  def join_url(url: str, path: str) -> str:
10
+ """Join a base URL with a path, ensuring proper slashes."""
7
11
  return url.rstrip("/") + "/" + path.lstrip("/")
8
12
 
9
-
10
13
  def get_parent_url(url: str) -> str:
14
+ """Return the parent URL by removing the last segment."""
11
15
  if url.count("/") <= 2:
12
16
  return url
13
-
14
17
  return url.rstrip("/").rsplit("/", 1)[0]
15
18
 
16
-
17
19
  def find_html_tag_by_class_substring(e: Union[BeautifulSoup, Tag], tag_name: str, substring: str) -> Tag:
20
+ """Find the first HTML tag with a class containing the given substring."""
18
21
  return e.find(tag_name, class_=lambda class_name: class_name and substring in class_name)
19
22
 
20
-
21
23
  def find_all_html_tags_by_class_substring(e: Union[BeautifulSoup, Tag], tag_name: str, substring: str) -> List[Tag]:
24
+ """Find all HTML tags with a class containing the given substring."""
22
25
  return e.find_all(tag_name, class_=lambda class_name: class_name and substring in class_name)
23
26
 
24
-
25
27
  def safe_access(exceptions: Tuple = (), default: Any = None):
28
+ """Decorator to safely execute a function, returning a default value on exception."""
26
29
  def decorator(func):
27
30
  @functools.wraps(func)
28
31
  def wrapper(*args, **kwargs):
@@ -30,7 +33,5 @@ def safe_access(exceptions: Tuple = (), default: Any = None):
30
33
  return func(*args, **kwargs)
31
34
  except exceptions:
32
35
  return default
33
-
34
36
  return wrapper
35
-
36
- return decorator
37
+ return decorator
@@ -1,4 +1,4 @@
1
- from .urls import VEHICLES_URL, VehicleType, get_vehicle_url
1
+ from .urls import VEHICLES_URL, VehicleCategory, get_vehicle_category_url
2
2
  from .query import VehiclesQueryFilters, OrderVehiclesBy
3
3
  from .category import Yad2VehiclesCategory
4
4
  from .tag import VehicleTag
@@ -6,10 +6,13 @@ from yad2_scraper.vehicles.next_data import VehiclesNextData
6
6
 
7
7
 
8
8
  class Yad2VehiclesCategory(Yad2Category):
9
+ """Represents a Yad2 vehicles category parsed from an HTML page."""
9
10
  def get_vehicle_tags(self) -> List[VehicleTag]:
11
+ """Retrieve a and return list of vehicle tags from the current category."""
10
12
  tags = self.find_all_tags_by_class_substring("div", "feedItemBox")
11
13
  return [VehicleTag(tag) for tag in tags]
12
14
 
13
15
  def load_next_data(self) -> Optional[VehiclesNextData]:
16
+ """Extract and parse Next.js data from the current vehicle page."""
14
17
  next_data = super().load_next_data()
15
18
  return VehiclesNextData(next_data) if next_data else None
@@ -14,6 +14,8 @@ from yad2_scraper.vehicles.urls import VEHICLES_URL
14
14
 
15
15
 
16
16
  class VehicleData(metaclass=SafeAccessOptionalKeysMeta):
17
+ """Represents the data for a single vehicle."""
18
+
17
19
  def __init__(self, data: dict):
18
20
  self.data = data
19
21
 
@@ -303,7 +305,10 @@ class VehicleData(metaclass=SafeAccessOptionalKeysMeta):
303
305
 
304
306
 
305
307
  class VehiclesNextData(NextData):
308
+ """Represents structured Next.js data of a specific vehicle category."""
309
+
306
310
  def iterate_vehicles(self) -> Iterator[VehicleData]:
311
+ """Iterates through the queries and yields `VehicleData` objects."""
307
312
  for query in self.queries:
308
313
  data = query["state"].get("data")
309
314
 
@@ -313,6 +318,3 @@ class VehiclesNextData(NextData):
313
318
  for vehicle_data in itertools.chain.from_iterable(data.values()):
314
319
  if isinstance(vehicle_data, dict):
315
320
  yield VehicleData(vehicle_data)
316
-
317
- def __getitem__(self, item):
318
- return self.data[item]
@@ -5,6 +5,7 @@ from yad2_scraper.query import QueryFilters, OrderBy, NumberRange, format_number
5
5
 
6
6
 
7
7
  class OrderVehiclesBy(int, Enum):
8
+ """Enum representing different order options for sorting vehicles."""
8
9
  DATE = OrderBy.DATE
9
10
  PRICE_LOWEST_TO_HIGHEST = OrderBy.PRICE_LOWEST_TO_HIGHEST
10
11
  PRICE_HIGHEST_TO_LOWEST = OrderBy.PRICE_HIGHEST_TO_LOWEST
@@ -13,13 +14,14 @@ class OrderVehiclesBy(int, Enum):
13
14
 
14
15
 
15
16
  class VehiclesQueryFilters(QueryFilters):
17
+ """Pydantic model representing query filters for querying a vehicle resource."""
16
18
  year_range: Optional[NumberRange] = None
17
- ...
18
19
 
19
20
  def to_params(self) -> dict:
21
+ """Convert filter fields to query parameters, including 'year'."""
20
22
  return {
21
23
  **super().to_params(),
22
24
  "year": format_number_range(self.year_range)
23
25
  }
24
26
 
25
- # TODO: add QueryParams class for each vehicle type (some share the same attributes - sometimes with different enums)
27
+ # TODO: add QueryParams class for each vehicle category (some share the same attributes, sometimes with different enums)
@@ -9,6 +9,8 @@ YEAR_AND_HAND_TAG_SEPARATOR = " • "
9
9
 
10
10
 
11
11
  class VehicleTag:
12
+ """Represents a vehicle listing on the webpage, providing access to various details"""
13
+
12
14
  def __init__(self, tag: Tag):
13
15
  self.tag = tag
14
16
 
@@ -5,12 +5,16 @@ from yad2_scraper.constants import BASE_URL
5
5
 
6
6
  VEHICLES_URL = join_url(BASE_URL, "vehicles")
7
7
 
8
- VehicleType = Literal["cars", "motorcycles", "scooters", "trucks", "watercraft", "others"]
8
+ VehicleCategory = Literal["cars", "motorcycles", "scooters", "trucks", "watercraft", "others"]
9
9
 
10
- _VALID_VEHICLE_TYPES = get_args(VehicleType)
10
+ _VALID_VEHICLE_CATEGORIES = get_args(VehicleCategory)
11
11
 
12
12
 
13
- def get_vehicle_url(vehicle_type: VehicleType) -> str:
14
- if vehicle_type not in _VALID_VEHICLE_TYPES:
15
- raise ValueError(f"Invalid vehicle type: {repr(vehicle_type)}. Expected one of {_VALID_VEHICLE_TYPES}")
16
- return join_url(VEHICLES_URL, vehicle_type)
13
+ def get_vehicle_category_url(vehicle_category: VehicleCategory) -> str:
14
+ """Generate the URL for the specified vehicle category."""
15
+ if vehicle_category not in _VALID_VEHICLE_CATEGORIES:
16
+ raise ValueError(
17
+ f"Invalid vehicle category: {repr(vehicle_category)}. Expected one of {_VALID_VEHICLE_CATEGORIES}"
18
+ )
19
+
20
+ return join_url(VEHICLES_URL, vehicle_category)
@@ -0,0 +1,164 @@
1
+ Metadata-Version: 2.3
2
+ Name: yad2-scraper
3
+ Version: 0.5.2
4
+ Summary: Scrape Yad2 in Python.
5
+ License: LICENSE
6
+ Author: dav ost
7
+ Author-email: davidost2003@gmail.com
8
+ Requires-Python: >=3.8
9
+ Classifier: License :: Other/Proprietary License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.8
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Requires-Dist: beautifulsoup4 (>=4.11.1,<5.0.0)
18
+ Requires-Dist: fake-useragent (>=0.1.11,<0.2.0)
19
+ Requires-Dist: httpcore (>=0.15.0)
20
+ Requires-Dist: httpx (>=0.24.0,<0.25.0)
21
+ Requires-Dist: pydantic (>=1.10.0,<2.0.0)
22
+ Description-Content-Type: text/markdown
23
+
24
+ # Yad2 Scraper
25
+
26
+ A Python package for scraping listings from [Yad2](https://www.yad2.co.il/), Israel's leading classifieds platform.
27
+ This package provides a simple and flexible interface to fetch data, filter results, and extract relevant information.
28
+
29
+ __NOTE__: Currently, the package primarily supports the **vehicles category**.
30
+ Support for additional categories may be added in future updates.
31
+
32
+ ---
33
+
34
+ ## Features
35
+
36
+ - **Fetch Listings**: Retrieve listings by category (e.g., vehicles, real-estate, etc.).
37
+ - **Filter Results**: Apply filters such as price range, year range, and sorting order.
38
+ - **Dynamic URL Generation**: Generate URLs for specific categories and filters.
39
+ - **Type-Safe API**: Uses Python type hints (`Literal`, `Optional`, etc.) for better code clarity and safety.
40
+ - **Extensible**: Easily extendable to support additional categories and filters.
41
+
42
+ ---
43
+
44
+ ## Installation
45
+
46
+ Install the package using `pip`:
47
+
48
+ ```bash
49
+ pip install yad2-scraper
50
+ ```
51
+
52
+ ## Usage
53
+
54
+ ### Fetching Category Listings
55
+
56
+ To fetch any category, use the `fetch_category` function:
57
+
58
+ ```python
59
+ from yad2_scraper import fetch_category, Yad2Category
60
+
61
+ # Fetch real estate category (returns a generic Yad2Category object)
62
+ real_estate_category_page1 = fetch_category("https://www.yad2.co.il/realestate/forsale", page=1)
63
+ ...
64
+ real_estate_category_page2 = fetch_category("https://www.yad2.co.il/realestate/forsale", page=2)
65
+ ...
66
+ ```
67
+
68
+ ### Fetching Vehicle Listings
69
+
70
+ To fetch vehicle listings for a specific category, use the `fetch_vehicle_category` function:
71
+
72
+ ```python
73
+ from yad2_scraper import fetch_vehicle_category, OrderVehiclesBy, Field
74
+
75
+ # Fetch cars category
76
+ cars_category = fetch_vehicle_category("cars")
77
+
78
+ for car_data in cars_category.load_next_data().iterate_vehicles():
79
+ print(car_data.model(Field.ENGLISH_TEXT))
80
+ print(car_data.test_date)
81
+ print(car_data.price)
82
+ ...
83
+
84
+ # Fetch motorcycles category
85
+ motorcycle_categories = fetch_vehicle_category(
86
+ "motorcycles",
87
+ price_range=(5000, 15000),
88
+ year_range=(2010, 2020),
89
+ order_by=OrderVehiclesBy.PRICE_LOWEST_TO_HIGHEST
90
+ )
91
+
92
+ for motorcycle_tag in motorcycle_categories.get_vehicle_tags():
93
+ print(motorcycle_tag.page_link)
94
+ print(motorcycle_tag.hand)
95
+ print(motorcycle_tag.price)
96
+ ...
97
+ ```
98
+
99
+ ### The Scraper Object
100
+
101
+ The `Yad2Scraper` class is the core of the package.
102
+ It handles HTTP requests, parses responses, and provides methods to fetch and filter vehicle listings.
103
+
104
+ #### Creating a Scraper Instance
105
+
106
+ You can create a `Yad2Scraper` instance manually or use the default scraper provided by the package:
107
+
108
+ ```python
109
+ from yad2_scraper import Yad2Scraper, get_default_scraper
110
+
111
+ # Create a custom scraper instance
112
+ scraper = Yad2Scraper()
113
+
114
+ # Use the default scraper
115
+ default_scraper = get_default_scraper()
116
+ ```
117
+
118
+ #### Fetching Category Listings
119
+
120
+ The `fetch_category` method is used to fetch listings for a specific category.
121
+ It takes a URL, a `Category` type, and optionally query params as arguments:
122
+
123
+ ```python
124
+ from yad2_scraper import Yad2Scraper, Yad2Category, QueryFilters, OrderBy
125
+ from yad2_scraper.vehicles import (
126
+ Yad2VehiclesCategory,
127
+ VehiclesQueryFilters,
128
+ OrderVehiclesBy,
129
+ get_vehicle_category_url
130
+ )
131
+
132
+ # Fetch businesses for sale category with filters
133
+ scraper = Yad2Scraper()
134
+ url = "https://www.yad2.co.il/products/businesses-for-sale"
135
+ query_filters = QueryFilters(price_range=(10000, 250000), order_by=OrderBy.PRICE_LOWEST_TO_HIGHEST)
136
+ business_for_sale_category = scraper.fetch_category(url, Yad2Category, params=query_filters)
137
+
138
+ # Fetch watercraft (vehicle) category with filters
139
+ url = get_vehicle_category_url("watercraft")
140
+ query_filters = VehiclesQueryFilters(year_range=(2010, 2020), order_by=OrderVehiclesBy.DATE)
141
+ watercraft_category = scraper.fetch_category(url, Yad2VehiclesCategory, params=query_filters)
142
+ ```
143
+
144
+ #### Attributes & Methods
145
+
146
+ The `Yad2Scraper` object contains a lot of additional attributes & methods which you can use.
147
+ Please check out the actual code documentation for more details.
148
+
149
+ ## Contributing
150
+
151
+ Contributions are welcomed! Here’s how you can get started:
152
+
153
+ 1. Fork the repository.
154
+ 2. Create a new branch for your feature or bugfix.
155
+ 3. Write tests for your changes.
156
+ 4. Submit a pull request.
157
+
158
+ ## License
159
+
160
+ This project is licensed under the MIT License. See the LICENSE file for details.
161
+
162
+ ## Support
163
+
164
+ For questions, issues, or feature requests, please open an issue on the GitHub repository.
@@ -0,0 +1,18 @@
1
+ yad2_scraper/__init__.py,sha256=oLANQo7jrtR5ex1tv4sM5ppaW9JpHS70Knsp0ZgVzm0,3708
2
+ yad2_scraper/category.py,sha256=SQ2eg0-fQ9hEaNryYpWVFaJqCx1d65t2_E_S3qpuw9g,1230
3
+ yad2_scraper/constants.py,sha256=8zXJ31fRqkDIOJp96BRK1PJofGXX8SG64YcfmJnVW8Q,910
4
+ yad2_scraper/exceptions.py,sha256=CC7LUy5hMQRTI48UqLZBvYJAYkVZD6n05HXeGWAIO5w,1283
5
+ yad2_scraper/next_data.py,sha256=k8Hkd_fMaAvVWHC6cizuv2osi9c_pJoKjo6mKqfJNEY,2037
6
+ yad2_scraper/query.py,sha256=6-Xc2qvHYLbejEUij85xWB4mHX3MF1XxPup9oUIkU3w,1503
7
+ yad2_scraper/scraper.py,sha256=VA-P24Gvn1y5Pkn_n3hDdpVl1aeEnLoC82eBYteAbWQ,11816
8
+ yad2_scraper/utils.py,sha256=UDpFKel_TJa0dJv1FV-CVqA8-uaFo_hDcooiFAkSZI8,1578
9
+ yad2_scraper/vehicles/__init__.py,sha256=dxjZcNv3ExnN3fKW-m1oqKiX9YC7gj8lqpIa3uWo9iI,242
10
+ yad2_scraper/vehicles/category.py,sha256=KGx_0Rh3QfC0kUf8ndRrFqCVuzzl8mhKi0Fkxxcp1bA,816
11
+ yad2_scraper/vehicles/next_data.py,sha256=4GofUS9InAY7g7xtg6MKVr3tWN6-LS9dGBYC5vwx6QU,9268
12
+ yad2_scraper/vehicles/query.py,sha256=N0kA1Ci_MexPXfNpa9EUkXf8XLYOr6np9jMMMpVOlaM,986
13
+ yad2_scraper/vehicles/tag.py,sha256=Wj7v2c8IPQLYHVkfzP1UiulKKJE4yLqnbeh81nvWZhU,2052
14
+ yad2_scraper/vehicles/urls.py,sha256=zxipWjm0SXn2gGOBWw9VqKAJ59mhIGpzd_fTYitpW8c,715
15
+ yad2_scraper-0.5.2.dist-info/LICENSE,sha256=JCpnDxMx2kE40e0UQ1svSmifrLWg2Gni5VTkJR68thY,1065
16
+ yad2_scraper-0.5.2.dist-info/METADATA,sha256=Yk4fZ8_OxjL0MsTYhXO4DA8mP2opOQGDBuQFhJA0ivw,5231
17
+ yad2_scraper-0.5.2.dist-info/WHEEL,sha256=7dDg4QLnNKTvwIDR9Ac8jJaAmBC_owJrckbC0jjThyA,88
18
+ yad2_scraper-0.5.2.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.0.1
2
+ Generator: poetry-core 2.1.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,26 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: yad2-scraper
3
- Version: 0.5.0
4
- Summary: Scrape Yad2 in Python.
5
- License: LICENSE
6
- Author: dav ost
7
- Author-email: davidost2003@gmail.com
8
- Requires-Python: >=3.8
9
- Classifier: License :: Other/Proprietary License
10
- Classifier: Programming Language :: Python :: 3
11
- Classifier: Programming Language :: Python :: 3.8
12
- Classifier: Programming Language :: Python :: 3.9
13
- Classifier: Programming Language :: Python :: 3.10
14
- Classifier: Programming Language :: Python :: 3.11
15
- Classifier: Programming Language :: Python :: 3.12
16
- Classifier: Programming Language :: Python :: 3.13
17
- Requires-Dist: beautifulsoup4 (>=4.11.1,<5.0.0)
18
- Requires-Dist: fake-useragent (>=0.1.11,<0.2.0)
19
- Requires-Dist: httpcore (>=0.15.0)
20
- Requires-Dist: httpx (>=0.24.0,<0.25.0)
21
- Requires-Dist: pydantic (>=1.10.0,<2.0.0)
22
- Description-Content-Type: text/markdown
23
-
24
- # yad2-scraper
25
- Scrape Yad2 in Python.
26
-
@@ -1,18 +0,0 @@
1
- yad2_scraper/__init__.py,sha256=UUiIk6TAHTAP4IY86bIR4TcY3VVMTCyEF0Sq1MSneMM,141
2
- yad2_scraper/category.py,sha256=KXLyjMOlPzu3xj08-uRmffAMD83DbqFVm-y1-T83Djw,910
3
- yad2_scraper/constants.py,sha256=8zXJ31fRqkDIOJp96BRK1PJofGXX8SG64YcfmJnVW8Q,910
4
- yad2_scraper/exceptions.py,sha256=_IcuDdJPKAznSUp_c3fLEuTnDdKf0NLJqpRPs0IzdXw,979
5
- yad2_scraper/next_data.py,sha256=fOatioaBxR7LZgRnXp35CoOkR7-Adv6rW_YKBQpSYj8,1585
6
- yad2_scraper/query.py,sha256=nURdupTnMbxgglJz7tdWSqnp4UG61nwWM1vjWQaylWE,1196
7
- yad2_scraper/scraper.py,sha256=sgDpfnKlBSDIWEb2enpQ5O9E5fJvXz3cDOnGXHGCJL4,6653
8
- yad2_scraper/utils.py,sha256=y6ErH2HcoCJn7OreNj4lvW--iOA7dv1LUIPa537GVjg,1070
9
- yad2_scraper/vehicles/__init__.py,sha256=4-4vVFu836nLzaTf1KTlddrjSk7dX3Nu9hm3cj1EKIU,229
10
- yad2_scraper/vehicles/category.py,sha256=BrH-aZY6hNlHtSqBmleifb7yY5R-76J2GAj9Bfd0Ulw,584
11
- yad2_scraper/vehicles/next_data.py,sha256=0xUbEwmj8CsWc0uqoW9hbM4FW26e4IWBiv-UcraSwrw,9125
12
- yad2_scraper/vehicles/query.py,sha256=VhL-E-sgpLxenZVvNgdCNWY15hMtoP0Oyv6SH_N3e04,757
13
- yad2_scraper/vehicles/tag.py,sha256=YTeCfVnaPnHz9CYRnfcQljEbNqynBDdlbX0HNPiB-XY,1960
14
- yad2_scraper/vehicles/urls.py,sha256=-aEtV_1elqHFdLIxBZglY0e0-UHGqQab5Rh5qKUyBtg,573
15
- yad2_scraper-0.5.0.dist-info/LICENSE,sha256=JCpnDxMx2kE40e0UQ1svSmifrLWg2Gni5VTkJR68thY,1065
16
- yad2_scraper-0.5.0.dist-info/METADATA,sha256=00MRqHUY9r2qiSRFgIZPCwli-aZgc_FJS2c0lshmKdY,875
17
- yad2_scraper-0.5.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
18
- yad2_scraper-0.5.0.dist-info/RECORD,,