yad2-scraper 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yad2_scraper/__init__.py +98 -3
- yad2_scraper/category.py +6 -0
- yad2_scraper/exceptions.py +11 -0
- yad2_scraper/next_data.py +33 -1
- yad2_scraper/query.py +7 -5
- yad2_scraper/scraper.py +130 -1
- yad2_scraper/utils.py +21 -5
- yad2_scraper/vehicles/__init__.py +5 -0
- yad2_scraper/vehicles/category.py +17 -0
- yad2_scraper/vehicles/next_data.py +320 -0
- yad2_scraper/vehicles/query.py +26 -0
- yad2_scraper/vehicles/tag.py +65 -0
- yad2_scraper/vehicles/urls.py +20 -0
- yad2_scraper-0.5.1.dist-info/METADATA +164 -0
- yad2_scraper-0.5.1.dist-info/RECORD +18 -0
- {yad2_scraper-0.4.0.dist-info → yad2_scraper-0.5.1.dist-info}/WHEEL +1 -1
- yad2_scraper-0.4.0.dist-info/METADATA +0 -27
- yad2_scraper-0.4.0.dist-info/RECORD +0 -12
- {yad2_scraper-0.4.0.dist-info → yad2_scraper-0.5.1.dist-info}/LICENSE +0 -0
yad2_scraper/__init__.py
CHANGED
@@ -1,4 +1,99 @@
|
|
1
|
-
from
|
2
|
-
|
1
|
+
from typing import Optional, Type
|
2
|
+
|
3
|
+
from .scraper import Yad2Scraper, Category
|
4
|
+
from .query import QueryFilters, OrderBy, NumberRange
|
3
5
|
from .category import Yad2Category
|
4
|
-
from .next_data import NextData
|
6
|
+
from .next_data import NextData, Field
|
7
|
+
from .utils import any_param_specified
|
8
|
+
from .vehicles import (
|
9
|
+
Yad2VehiclesCategory,
|
10
|
+
VehiclesQueryFilters,
|
11
|
+
OrderVehiclesBy,
|
12
|
+
VehicleCategory,
|
13
|
+
get_vehicle_category_url
|
14
|
+
)
|
15
|
+
|
16
|
+
_default_scraper = None
|
17
|
+
|
18
|
+
|
19
|
+
def get_default_scraper() -> Yad2Scraper:
    """
    Return the shared default Yad2Scraper, creating it on first use.

    Returns:
        Yad2Scraper: The module-level default scraper instance.

    Notes:
        The instance is stored in the module-level ``_default_scraper`` variable
        and is reused by every later call.
    """
    global _default_scraper

    # Compare against None explicitly: the cached scraper must never be replaced
    # just because an instance happens to evaluate as falsy.
    if _default_scraper is None:
        _default_scraper = Yad2Scraper()

    return _default_scraper
|
35
|
+
|
36
|
+
|
37
|
+
def fetch_category(
        url: str,
        category_type: Type[Category] = Yad2Category,
        page: Optional[int] = None,
        order_by: Optional[OrderBy] = None,
        price_range: Optional[NumberRange] = None
) -> Category:
    """
    Fetch a category page from the given URL with the default scraper, applying optional filters.

    Args:
        url (str): The URL of the category to fetch.
        category_type (Type[Category], optional): The category class used to parse the page
            (default is `Yad2Category`).
        page (Optional[int], optional): The page number for pagination (default is None).
        order_by (Optional[OrderBy], optional): The sorting order for the results (default is None).
        price_range (Optional[NumberRange], optional): The price range filter as a
            ``(min, max)`` tuple (default is None).

    Returns:
        Category: An instance of the specified `category_type`, built from the fetched page.

    Notes:
        Query filters are only constructed when at least one of `page`, `order_by` or
        `price_range` is not None; otherwise the request is sent without parameters.
    """
    if any_param_specified(page, order_by, price_range):
        params = QueryFilters(page=page, order_by=order_by, price_range=price_range)
    else:
        params = None

    default_scraper = get_default_scraper()
    return default_scraper.fetch_category(url, category_type, params=params)
|
67
|
+
|
68
|
+
|
69
|
+
def fetch_vehicle_category(
        vehicle_category: VehicleCategory,
        page: Optional[int] = None,
        order_by: Optional[OrderVehiclesBy] = None,
        price_range: Optional[NumberRange] = None,
        year_range: Optional[NumberRange] = None
) -> Yad2VehiclesCategory:
    """
    Fetch a vehicle category with the default scraper, applying optional filters.

    Args:
        vehicle_category (VehicleCategory): The vehicle category to fetch.
        page (Optional[int], optional): The page number for pagination (default is None).
        order_by (Optional[OrderVehiclesBy], optional): The sorting order for the results (default is None).
        price_range (Optional[NumberRange], optional): The price range filter as a
            ``(min, max)`` tuple (default is None).
        year_range (Optional[NumberRange], optional): The year range filter as a
            ``(min, max)`` tuple (default is None).

    Returns:
        Yad2VehiclesCategory: The fetched vehicle category page.

    Notes:
        Query filters are only constructed when at least one filter argument is not None.
        The request URL is resolved from `vehicle_category` via `get_vehicle_category_url`.
    """
    if any_param_specified(page, order_by, price_range, year_range):
        params = VehiclesQueryFilters(page=page, order_by=order_by, price_range=price_range, year_range=year_range)
    else:
        params = None

    url = get_vehicle_category_url(vehicle_category)
    default_scraper = get_default_scraper()
    return default_scraper.fetch_category(url, Yad2VehiclesCategory, params=params)
|
yad2_scraper/category.py
CHANGED
@@ -8,18 +8,24 @@ from yad2_scraper.constants import NEXT_DATA_SCRIPT_ID
|
|
8
8
|
|
9
9
|
|
10
10
|
class Yad2Category:
    """A Yad2 category page, wrapped around its parsed HTML document."""

    def __init__(self, soup: BeautifulSoup):
        """Keep the parsed page available as ``self.soup``."""
        self.soup = soup

    @classmethod
    def from_html_io(cls, html_io: Union[TextIO, BinaryIO]):
        """Read everything from ``html_io``, parse it with "html.parser" and build an instance."""
        return cls(BeautifulSoup(html_io.read(), "html.parser"))

    def load_next_data(self) -> Optional[NextData]:
        """Return the page's Next.js data, or None when its script tag is not found."""
        script_tag = self.soup.find("script", id=NEXT_DATA_SCRIPT_ID)
        if not script_tag:
            return None

        # The script tag's text is a JSON document; decode it before wrapping.
        return NextData(json.loads(script_tag.string))

    def find_all_tags_by_class_substring(self, tag_name: str, substring: str) -> List[Tag]:
        """Return every ``tag_name`` tag in the page whose class contains ``substring``."""
        return find_all_html_tags_by_class_substring(self.soup, tag_name, substring)
|
yad2_scraper/exceptions.py
CHANGED
@@ -3,29 +3,40 @@ from typing import List, Union
|
|
3
3
|
|
4
4
|
|
5
5
|
class ResponseError(Exception):
    """Error tied to an HTTP response; carries the request and response involved."""

    def __init__(self, msg: str, request: httpx.Request, response: httpx.Response):
        """
        Build the error.

        Args:
            msg (str): The error message passed on to ``Exception``.
            request (httpx.Request): Stored as ``self.request``.
            response (httpx.Response): Stored as ``self.response``.
        """
        self.request, self.response = request, response
        super().__init__(msg)
|
10
13
|
|
11
14
|
|
12
15
|
class AntiBotDetectedError(ResponseError):
    """ResponseError subclass signalling that an anti-bot mechanism was detected."""
|
14
18
|
|
15
19
|
|
16
20
|
class UnexpectedContentError(ResponseError):
    """ResponseError subclass signalling that the response content was not as expected."""
|
18
23
|
|
19
24
|
|
20
25
|
class MaxAttemptsExceededError(Exception):
    """Raised when the maximum number of attempts is exceeded."""

    def __init__(self, msg: str, max_attempts: int, errors: Union[List[BaseException], None] = None):
        """
        Build the error.

        Args:
            msg (str): The error message passed on to ``Exception``.
            max_attempts (int): The attempt limit that was exceeded; stored as ``self.max_attempts``.
            errors (Optional list of BaseException): The errors collected along the way, stored
                as-is in ``self.errors`` (None when not supplied).
        """
        super().__init__(msg)
        self.max_attempts = max_attempts
        self.errors = errors
|
25
33
|
|
26
34
|
|
27
35
|
class MaxRequestAttemptsExceededError(MaxAttemptsExceededError):
|
36
|
+
"""Raised when all HTTP request attempts fail."""
|
37
|
+
|
28
38
|
def __init__(self, method: str, url: str, max_attempts: int, errors: List[Union[httpx.HTTPError, ResponseError]]):
|
39
|
+
"""Initialize with request method, URL, max attempts, and error list."""
|
29
40
|
msg = f"All {max_attempts} attempts for {method} request to '{url}' have failed"
|
30
41
|
super().__init__(msg, max_attempts, errors)
|
31
42
|
self.method = method
|
yad2_scraper/next_data.py
CHANGED
@@ -1,6 +1,30 @@
|
|
1
|
+
from datetime import datetime
|
1
2
|
from enum import Enum
|
2
3
|
from typing import List, Union
|
3
4
|
|
5
|
+
from yad2_scraper.utils import safe_access
|
6
|
+
|
7
|
+
FieldTypes = Union[str, int]
|
8
|
+
|
9
|
+
_safe_access_optional_keys = safe_access(exceptions=(KeyError, TypeError), default=None)
|
10
|
+
|
11
|
+
|
12
|
+
class SafeAccessOptionalKeysMeta(type):
    """Metaclass that passes a class's callables and properties through `_safe_access_optional_keys`."""

    def __new__(cls, name, bases, dictionary):
        """
        Create the class after wrapping its namespace entries.

        Every callable entry is replaced by its wrapped version; every property is rebuilt
        with wrapped getter/setter/deleter and its original docstring.
        """

        def guard(accessor):
            # A property may lack a setter or deleter; keep missing accessors as None.
            return _safe_access_optional_keys(accessor) if accessor else None

        for attr_name, attr_value in dictionary.items():
            if callable(attr_value):
                dictionary[attr_name] = _safe_access_optional_keys(attr_value)
                continue

            if isinstance(attr_value, property):
                dictionary[attr_name] = property(
                    guard(attr_value.fget),
                    guard(attr_value.fset),
                    guard(attr_value.fdel),
                    attr_value.__doc__,
                )

        return super().__new__(cls, name, bases, dictionary)
|
27
|
+
|
4
28
|
|
5
29
|
class Field(str, Enum):
|
6
30
|
ID = "id"
|
@@ -8,20 +32,28 @@ class Field(str, Enum):
|
|
8
32
|
ENGLISH_TEXT = "textEng"
|
9
33
|
|
10
34
|
|
11
|
-
|
35
|
+
def convert_string_date_to_datetime(date_string: str) -> datetime:
    """Parse an ISO-formatted date string with ``datetime.fromisoformat`` and return the result."""
    parsed = datetime.fromisoformat(date_string)
    return parsed
|
12
38
|
|
13
39
|
|
14
40
|
class NextData:
    """Thin wrapper around the Next.js data dictionary of a page."""

    def __init__(self, data: dict):
        """Keep the given dictionary as ``self.data``."""
        self.data = data

    @property
    def json(self) -> dict:
        """The wrapped dictionary itself (not a copy)."""
        return self.data

    @property
    def queries(self) -> List[dict]:
        """The value stored under props -> pageProps -> dehydratedState -> queries."""
        page_props = self.data["props"]["pageProps"]
        return page_props["dehydratedState"]["queries"]

    def __getitem__(self, item):
        """Look up ``item`` directly in the wrapped dictionary."""
        return self.data[item]
|
yad2_scraper/query.py
CHANGED
@@ -2,7 +2,7 @@ from pydantic import BaseModel
|
|
2
2
|
from enum import Enum
|
3
3
|
from typing import Optional, Tuple
|
4
4
|
|
5
|
-
|
5
|
+
NumberRange = Tuple[int, int]
|
6
6
|
|
7
7
|
|
8
8
|
class OrderBy(int, Enum):
|
@@ -13,6 +13,7 @@ class OrderBy(int, Enum):
|
|
13
13
|
|
14
14
|
|
15
15
|
def format_number_range(number_range: Optional[Tuple[int, int]]) -> Optional[str]:
|
16
|
+
"""Format a number range as 'min_value-max_value'."""
|
16
17
|
if number_range is None:
|
17
18
|
return None
|
18
19
|
|
@@ -25,12 +26,13 @@ def format_number_range(number_range: Optional[Tuple[int, int]]) -> Optional[str
|
|
25
26
|
|
26
27
|
|
27
28
|
class QueryFilters(BaseModel):
|
29
|
+
"""Pydantic model representing query filters for querying a resource."""
|
28
30
|
page: Optional[int] = None
|
29
31
|
order_by: Optional[OrderBy] = None
|
30
|
-
price_range: Optional[
|
31
|
-
...
|
32
|
+
price_range: Optional[NumberRange] = None
|
32
33
|
|
33
34
|
def to_params(self) -> dict:
|
35
|
+
"""Convert filter fields to query parameters."""
|
34
36
|
return {
|
35
37
|
"page": self.page,
|
36
38
|
"Order": self.order_by,
|
@@ -38,9 +40,9 @@ class QueryFilters(BaseModel):
|
|
38
40
|
}
|
39
41
|
|
40
42
|
def to_clean_params(self):
|
43
|
+
"""Return query parameters excluding None values."""
|
41
44
|
return {key: value for key, value in self.to_params().items() if value is not None}
|
42
45
|
|
43
|
-
# TODO: add helper methods for managing the attribute values
|
44
|
-
|
45
46
|
def __iter__(self):
|
47
|
+
"""Allow iteration over the clean query parameters."""
|
46
48
|
yield from self.to_clean_params().items()
|
yad2_scraper/scraper.py
CHANGED
@@ -24,6 +24,8 @@ logger = logging.getLogger(__name__)
|
|
24
24
|
|
25
25
|
|
26
26
|
class Yad2Scraper:
|
27
|
+
"""A scraper for fetching data from the Yad2 website, with robust features"""
|
28
|
+
|
27
29
|
def __init__(
|
28
30
|
self,
|
29
31
|
client: Optional[httpx.Client] = None,
|
@@ -32,6 +34,16 @@ class Yad2Scraper:
|
|
32
34
|
wait_strategy: Optional[WaitStrategy] = None,
|
33
35
|
max_request_attempts: int = 1
|
34
36
|
):
|
37
|
+
"""
|
38
|
+
Initializes the Yad2Scraper with provided parameters.
|
39
|
+
|
40
|
+
Args:
|
41
|
+
client (Optional[httpx.Client]): An optional custom HTTP client. If not provided, a default client is used.
|
42
|
+
request_defaults (Optional[Dict[str, Any]]): Default parameters for requests such as headers, params, etc.
|
43
|
+
randomize_user_agent (bool): If True, a random User-Agent will be set for each request. Defaults to True.
|
44
|
+
wait_strategy (Optional[WaitStrategy]): A function to determine the wait time between requests.
|
45
|
+
max_request_attempts (int): The maximum number of retry attempts for failed requests. Defaults to 1.
|
46
|
+
"""
|
35
47
|
self.client = client or httpx.Client(
|
36
48
|
headers=DEFAULT_REQUEST_HEADERS,
|
37
49
|
follow_redirects=ALLOW_REQUEST_REDIRECTS,
|
@@ -41,14 +53,32 @@ class Yad2Scraper:
|
|
41
53
|
self.randomize_user_agent = randomize_user_agent
|
42
54
|
self.wait_strategy = wait_strategy
|
43
55
|
self.max_request_attempts = max_request_attempts
|
56
|
+
self._request_count = 0
|
44
57
|
|
45
58
|
logger.debug(f"Scraper initialized with client: {self.client}")
|
46
59
|
|
60
|
+
@property
|
61
|
+
def request_count(self) -> int:
|
62
|
+
"""Returns the number of requests made by the scraper so far."""
|
63
|
+
return self._request_count
|
64
|
+
|
47
65
|
def set_user_agent(self, user_agent: str) -> None:
|
66
|
+
"""
|
67
|
+
Sets the User-Agent header for requests.
|
68
|
+
|
69
|
+
Args:
|
70
|
+
user_agent (str): The User-Agent string to be used in HTTP requests.
|
71
|
+
"""
|
48
72
|
self.client.headers["User-Agent"] = user_agent
|
49
73
|
logger.debug(f"User-Agent client header set to: '{user_agent}'")
|
50
74
|
|
51
75
|
def set_no_script(self, no_script: bool) -> None:
|
76
|
+
"""
|
77
|
+
Sets the "noscript" cookie in the client's cookies to control JavaScript content.
|
78
|
+
|
79
|
+
Args:
|
80
|
+
no_script (bool): If True, the "noscript" cookie is set to "1". If False, it's set to "0".
|
81
|
+
"""
|
52
82
|
value = "1" if no_script else "0"
|
53
83
|
self.client.cookies.set("noscript", value)
|
54
84
|
logger.debug(f"NoScript (noscript) client cookie set to: '{value}'")
|
@@ -56,18 +86,44 @@ class Yad2Scraper:
|
|
56
86
|
def fetch_category(
|
57
87
|
self,
|
58
88
|
url: str,
|
59
|
-
category_type: Type[Category]
|
89
|
+
category_type: Type[Category],
|
60
90
|
params: Optional[QueryParamTypes] = None
|
61
91
|
) -> Category:
|
92
|
+
"""
|
93
|
+
Fetches and returns a category page from a given URL.
|
94
|
+
|
95
|
+
Args:
|
96
|
+
url (str): The URL of the category page.
|
97
|
+
category_type (Type[Category]): The class type of the category to be fetched.
|
98
|
+
params (Optional[QueryParamTypes]): Query parameters to be included in the request.
|
99
|
+
|
100
|
+
Returns:
|
101
|
+
Category: The fetched category, parsed from HTML.
|
102
|
+
"""
|
62
103
|
logger.debug(f"Fetching category from URL: '{url}'")
|
63
104
|
response = self.get(url, params)
|
64
105
|
logger.debug(f"Category fetched successfully from URL: '{url}'")
|
65
106
|
return category_type.from_html_io(response)
|
66
107
|
|
67
108
|
def get(self, url: str, params: Optional[QueryParamTypes] = None) -> httpx.Response:
|
109
|
+
"""Sends a GET request to the specified URL."""
|
68
110
|
return self.request("GET", url, params=params)
|
69
111
|
|
70
112
|
def request(self, method: str, url: str, params: Optional[QueryParamTypes] = None) -> httpx.Response:
|
113
|
+
"""
|
114
|
+
Sends an HTTP request with multiple attempts logic.
|
115
|
+
|
116
|
+
Args:
|
117
|
+
method (str): The HTTP method (e.g., "GET", "POST").
|
118
|
+
url (str): The URL to send the request to.
|
119
|
+
params (Optional[QueryParamTypes]): Query parameters to be included in the request.
|
120
|
+
|
121
|
+
Returns:
|
122
|
+
httpx.Response: The HTTP response object.
|
123
|
+
|
124
|
+
Raises:
|
125
|
+
MaxRequestAttemptsExceededError: If the request exceeds the maximum number of attempts.
|
126
|
+
"""
|
71
127
|
if not isinstance(self.max_request_attempts, int):
|
72
128
|
raise TypeError(f"max_request_attempts must be of type 'int', but got {type(self.max_request_attempts)}")
|
73
129
|
|
@@ -92,11 +148,28 @@ class Yad2Scraper:
|
|
92
148
|
raise max_attempts_error from error_list[-1] # multiple errors exist, raise from the last one
|
93
149
|
|
94
150
|
def close(self) -> None:
|
151
|
+
"""Closes the HTTP client and logs the closure."""
|
95
152
|
logger.debug("Closing scraper client")
|
96
153
|
self.client.close()
|
97
154
|
logger.info("Scraper client closed")
|
98
155
|
|
99
156
|
def _send_request(self, method: str, url: str, request_options: Dict[str, Any], attempt: int) -> httpx.Response:
|
157
|
+
"""
|
158
|
+
Sends an HTTP request with the specified method to the given URL, applying all necessary actions.
|
159
|
+
|
160
|
+
Args:
|
161
|
+
method (str): The HTTP method (e.g., 'GET', 'POST').
|
162
|
+
url (str): The target URL for the request.
|
163
|
+
request_options (Dict[str, Any]): Additional request options, including headers and parameters.
|
164
|
+
attempt (int): The current attempt number for the request.
|
165
|
+
|
166
|
+
Returns:
|
167
|
+
httpx.Response: The HTTP response object received from the server.
|
168
|
+
|
169
|
+
Raises:
|
170
|
+
AntiBotDetectedError: If the response contains Anti-Bot content.
|
171
|
+
UnexpectedContentError: If a GET request does not contain expected content.
|
172
|
+
"""
|
100
173
|
if self.randomize_user_agent:
|
101
174
|
self._set_random_user_agent(request_options)
|
102
175
|
|
@@ -105,12 +178,22 @@ class Yad2Scraper:
|
|
105
178
|
|
106
179
|
logger.info(f"Sending {method} request to URL: '{url}' {self._format_attempt_info(attempt)}")
|
107
180
|
response = self.client.request(method, url, **request_options)
|
181
|
+
self._request_count += 1
|
108
182
|
logger.debug(f"Received response {response.status_code} from '{url}' {self._format_attempt_info(attempt)}")
|
109
183
|
self._validate_response(response)
|
110
184
|
|
111
185
|
return response
|
112
186
|
|
113
187
|
def _prepare_request_options(self, params: Optional[QueryParamTypes] = None) -> Dict[str, Any]:
|
188
|
+
"""
|
189
|
+
Prepares the request options to be passed to the HTTP client's request method, based on the default options.
|
190
|
+
|
191
|
+
Args:
|
192
|
+
params (Optional[QueryParamTypes]): Optional query parameters to include in the request.
|
193
|
+
|
194
|
+
Returns:
|
195
|
+
Dict[str, Any]: A dictionary of the request options, including headers and query parameters.
|
196
|
+
"""
|
114
197
|
logger.debug("Preparing request options from defaults")
|
115
198
|
request_options = self.request_defaults.copy()
|
116
199
|
|
@@ -122,11 +205,23 @@ class Yad2Scraper:
|
|
122
205
|
|
123
206
|
@staticmethod
|
124
207
|
def _set_random_user_agent(request_options: Dict[str, str]):
|
208
|
+
"""
|
209
|
+
Sets a random User-Agent header in the request options.
|
210
|
+
|
211
|
+
Args:
|
212
|
+
request_options (Dict[str, str]): The request options to update with the random User-Agent.
|
213
|
+
"""
|
125
214
|
user_agent = fua.random
|
126
215
|
request_options.setdefault("headers", {})["User-Agent"] = user_agent
|
127
216
|
logger.debug(f"Updated request options with random User-Agent header: '{user_agent}'")
|
128
217
|
|
129
218
|
def _apply_wait_strategy(self, attempt: int):
|
219
|
+
"""
|
220
|
+
Applies a wait time before making a request based on the wait strategy for the given attempt.
|
221
|
+
|
222
|
+
Args:
|
223
|
+
attempt (int): The current attempt number to calculate the wait time.
|
224
|
+
"""
|
130
225
|
wait_time = self.wait_strategy(attempt)
|
131
226
|
if not wait_time:
|
132
227
|
return
|
@@ -136,6 +231,17 @@ class Yad2Scraper:
|
|
136
231
|
|
137
232
|
@staticmethod
|
138
233
|
def _validate_response(response: httpx.Response):
|
234
|
+
"""
|
235
|
+
Validates the response to ensure it is successful.
|
236
|
+
|
237
|
+
Args:
|
238
|
+
response (httpx.Response): The HTTP response object to validate.
|
239
|
+
|
240
|
+
Raises:
|
241
|
+
httpx.HTTPStatusError: If a status error occurred.
|
242
|
+
AntiBotDetectedError: If the response contains Anti-Bot content.
|
243
|
+
UnexpectedContentError: If a GET response does not contain expected content.
|
244
|
+
"""
|
139
245
|
response.raise_for_status()
|
140
246
|
|
141
247
|
if ANTIBOT_CONTENT_IDENTIFIER in response.content:
|
@@ -154,12 +260,35 @@ class Yad2Scraper:
|
|
154
260
|
logger.debug("Response validation succeeded")
|
155
261
|
|
156
262
|
def _format_attempt_info(self, attempt: int) -> str:
|
263
|
+
"""
|
264
|
+
Formats a string representing the current attempt number and total attempt count.
|
265
|
+
|
266
|
+
Args:
|
267
|
+
attempt (int): The current attempt number.
|
268
|
+
|
269
|
+
Returns:
|
270
|
+
str: A formatted string representing the attempt info, e.g., "(attempt 1/5)".
|
271
|
+
"""
|
157
272
|
return f"(attempt {attempt}/{self.max_request_attempts})"
|
158
273
|
|
159
274
|
def __enter__(self):
|
275
|
+
"""
|
276
|
+
Prepares the scraper to be used in a `with` statement, allowing for resource management.
|
277
|
+
|
278
|
+
Returns:
|
279
|
+
Yad2Scraper: The scraper instance to be used within the `with` block.
|
280
|
+
"""
|
160
281
|
logger.debug("Entering scraper context")
|
161
282
|
return self
|
162
283
|
|
163
284
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
285
|
+
"""
|
286
|
+
Cleans up resources and closes the scraper client when exiting the `with` statement.
|
287
|
+
|
288
|
+
Args:
|
289
|
+
exc_type: The exception type (if any).
|
290
|
+
exc_val: The exception value (if any).
|
291
|
+
exc_tb: The traceback object (if any).
|
292
|
+
"""
|
164
293
|
logger.debug("Exiting scraper context")
|
165
294
|
self.close()
|
yad2_scraper/utils.py
CHANGED
@@ -1,21 +1,37 @@
|
|
1
|
+
import functools
|
1
2
|
from bs4 import BeautifulSoup, Tag
|
2
|
-
from typing import Union, List
|
3
|
+
from typing import Union, List, Tuple, Any
|
3
4
|
|
5
|
+
def any_param_specified(*params: Any) -> bool:
    """Return True when at least one of the given arguments is not None."""
    for candidate in params:
        if candidate is not None:
            return True
    return False
|
4
8
|
|
5
9
|
def join_url(url: str, path: str) -> str:
    """Concatenate ``url`` and ``path`` with exactly one slash between them."""
    base = url.rstrip("/")
    suffix = path.lstrip("/")
    return "/".join((base, suffix))
|
7
12
|
|
8
|
-
|
9
13
|
def get_parent_url(url: str) -> str:
    """
    Return the parent URL by removing the last path segment.

    A URL with no path segment to remove (at most two slashes once trailing slashes
    are ignored, e.g. "https://example.com" or "https://example.com/") is returned unchanged.
    """
    # Strip trailing slashes before counting; otherwise a root URL such as
    # "https://example.com/" would be cut down to "https:/".
    stripped = url.rstrip("/")
    if stripped.count("/") <= 2:
        return url
    return stripped.rsplit("/", 1)[0]
|
14
18
|
|
15
|
-
|
16
19
|
def find_html_tag_by_class_substring(e: Union[BeautifulSoup, Tag], tag_name: str, substring: str) -> Tag:
    """Return the result of ``e.find`` for ``tag_name`` tags whose class contains ``substring``."""

    def class_matches(class_name):
        # Elements without a class are rejected before the substring test.
        return class_name and substring in class_name

    return e.find(tag_name, class_=class_matches)
|
18
22
|
|
19
|
-
|
20
23
|
def find_all_html_tags_by_class_substring(e: Union[BeautifulSoup, Tag], tag_name: str, substring: str) -> List[Tag]:
    """Return the result of ``e.find_all`` for ``tag_name`` tags whose class contains ``substring``."""

    def class_matches(class_name):
        # Elements without a class are rejected before the substring test.
        return class_name and substring in class_name

    return e.find_all(tag_name, class_=class_matches)
|
26
|
+
|
27
|
+
def safe_access(exceptions: Tuple = (), default: Any = None):
    """
    Build a decorator that turns the listed exceptions into a default return value.

    Args:
        exceptions (Tuple): Exception types to catch; with the empty default nothing is caught.
        default (Any): The value returned when one of ``exceptions`` is raised.

    Returns:
        A decorator; the decorated function keeps its metadata via ``functools.wraps``.
    """

    def decorator(func):
        def wrapper(*args, **kwargs):
            try:
                outcome = func(*args, **kwargs)
            except exceptions:
                outcome = default
            return outcome

        return functools.wraps(func)(wrapper)

    return decorator
|
@@ -0,0 +1,17 @@
|
|
1
|
+
from typing import List, Optional
|
2
|
+
|
3
|
+
from yad2_scraper.category import Yad2Category
|
4
|
+
from yad2_scraper.vehicles.tag import VehicleTag
|
5
|
+
from yad2_scraper.vehicles.next_data import VehiclesNextData
|
6
|
+
|
7
|
+
|
8
|
+
class Yad2VehiclesCategory(Yad2Category):
    """Yad2Category specialization that exposes vehicle tags and vehicle Next.js data."""

    def get_vehicle_tags(self) -> List[VehicleTag]:
        """Retrieve and return a list of vehicle tags from the current category."""
        vehicle_tags = []
        # Each "div" whose class contains "feedItemBox" is wrapped as one VehicleTag.
        for tag in self.find_all_tags_by_class_substring("div", "feedItemBox"):
            vehicle_tags.append(VehicleTag(tag))
        return vehicle_tags

    def load_next_data(self) -> Optional[VehiclesNextData]:
        """Return the parent's Next.js data wrapped in VehiclesNextData, or None when there is none."""
        next_data = super().load_next_data()
        if not next_data:
            return None
        return VehiclesNextData(next_data)
|
@@ -0,0 +1,320 @@
|
|
1
|
+
import itertools
|
2
|
+
from datetime import datetime
|
3
|
+
from typing import List, Any, Iterator, Optional
|
4
|
+
|
5
|
+
from yad2_scraper.next_data import (
|
6
|
+
SafeAccessOptionalKeysMeta,
|
7
|
+
NextData,
|
8
|
+
Field,
|
9
|
+
FieldTypes,
|
10
|
+
convert_string_date_to_datetime
|
11
|
+
)
|
12
|
+
from yad2_scraper.utils import join_url
|
13
|
+
from yad2_scraper.vehicles.urls import VEHICLES_URL
|
14
|
+
|
15
|
+
|
16
|
+
class VehicleData(metaclass=SafeAccessOptionalKeysMeta):
|
17
|
+
"""Represents structured Next.js data of a specific vehicle category."""
|
18
|
+
|
19
|
+
def __init__(self, data: dict):
|
20
|
+
self.data = data
|
21
|
+
|
22
|
+
@property
|
23
|
+
def token(self) -> str:
|
24
|
+
return self["token"]
|
25
|
+
|
26
|
+
@property
|
27
|
+
def page_link(self) -> str:
|
28
|
+
return join_url(VEHICLES_URL, f"item/{self.token}")
|
29
|
+
|
30
|
+
@property
|
31
|
+
def price(self) -> int:
|
32
|
+
return self["price"]
|
33
|
+
|
34
|
+
@property
|
35
|
+
def customer(self) -> dict:
|
36
|
+
return self["customer"]
|
37
|
+
|
38
|
+
@property
|
39
|
+
def customer_name(self) -> str:
|
40
|
+
return self.customer["name"]
|
41
|
+
|
42
|
+
@property
|
43
|
+
def customer_phone(self) -> str:
|
44
|
+
return self.customer["phone"]
|
45
|
+
|
46
|
+
@property
|
47
|
+
def address(self) -> dict:
|
48
|
+
return self["address"]
|
49
|
+
|
50
|
+
def top_area(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
|
51
|
+
return self["address"]["topArea"][field]
|
52
|
+
|
53
|
+
def area(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
|
54
|
+
return self["address"]["area"][field]
|
55
|
+
|
56
|
+
def city(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
|
57
|
+
return self["address"]["city"][field]
|
58
|
+
|
59
|
+
@property
|
60
|
+
def metadata(self) -> dict:
|
61
|
+
return self["metaData"]
|
62
|
+
|
63
|
+
@property
|
64
|
+
def video(self) -> str:
|
65
|
+
return self.metadata["video"]
|
66
|
+
|
67
|
+
@property
|
68
|
+
def cover_image(self) -> str:
|
69
|
+
return self.metadata["coverImage"]
|
70
|
+
|
71
|
+
@property
|
72
|
+
def images(self) -> str:
|
73
|
+
return self.metadata["images"]
|
74
|
+
|
75
|
+
@property
|
76
|
+
def description(self) -> str:
|
77
|
+
return self.metadata["description"]
|
78
|
+
|
79
|
+
@property
|
80
|
+
def dates(self) -> dict:
|
81
|
+
return self["dates"]
|
82
|
+
|
83
|
+
@property
|
84
|
+
def updated_at(self) -> datetime:
|
85
|
+
return convert_string_date_to_datetime(self.dates["updatedAt"])
|
86
|
+
|
87
|
+
@property
|
88
|
+
def created_at(self) -> datetime:
|
89
|
+
return convert_string_date_to_datetime(self.dates["createdAt"])
|
90
|
+
|
91
|
+
@property
|
92
|
+
def ends_at(self) -> datetime:
|
93
|
+
return convert_string_date_to_datetime(self.dates["endsAt"])
|
94
|
+
|
95
|
+
@property
|
96
|
+
def rebounced_at(self) -> datetime:
|
97
|
+
return convert_string_date_to_datetime(self.dates["rebouncedAt"])
|
98
|
+
|
99
|
+
def manufacturer(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
|
100
|
+
return self["manufacturer"][field]
|
101
|
+
|
102
|
+
def color(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
|
103
|
+
return self["color"][field]
|
104
|
+
|
105
|
+
@property
|
106
|
+
def km(self) -> Optional[int]:
|
107
|
+
return self["km"]
|
108
|
+
|
109
|
+
@property
|
110
|
+
def hand(self, field: Field = Field.ID) -> Optional[FieldTypes]:
|
111
|
+
return self["hand"][field]
|
112
|
+
|
113
|
+
@property
|
114
|
+
def engine_volume(self) -> Optional[int]:
|
115
|
+
return self["engineVolume"]
|
116
|
+
|
117
|
+
@property
|
118
|
+
def horse_power(self) -> Optional[int]:
|
119
|
+
return self["horsePower"]
|
120
|
+
|
121
|
+
@property
|
122
|
+
def previous_owner(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
|
123
|
+
return self["previousOwner"][field]
|
124
|
+
|
125
|
+
@property
|
126
|
+
def above_price(self) -> Optional[int]:
|
127
|
+
return self["abovePrice"]
|
128
|
+
|
129
|
+
@property
|
130
|
+
def tags(self) -> List[dict]:
|
131
|
+
return self["tags"]
|
132
|
+
|
133
|
+
@property
|
134
|
+
def is_contact_lead_supported(self) -> Optional[bool]:
|
135
|
+
return self["isContactLeadSupported"]
|
136
|
+
|
137
|
+
@property
|
138
|
+
def vehicle_dates(self) -> dict:
|
139
|
+
return self["vehicleDates"]
|
140
|
+
|
141
|
+
@property
|
142
|
+
def year_of_production(self) -> Optional[int]:
|
143
|
+
return self.vehicle_dates["yearOfProduction"]
|
144
|
+
|
145
|
+
@property
|
146
|
+
def month_of_production(self) -> Optional[int]:
|
147
|
+
return self.vehicle_dates["monthOfProduction"]["id"]
|
148
|
+
|
149
|
+
@property
|
150
|
+
def test_date(self) -> Optional[datetime]:
|
151
|
+
return convert_string_date_to_datetime(self.vehicle_dates["testDate"])
|
152
|
+
|
153
|
+
def model(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
|
154
|
+
return self["model"][field]
|
155
|
+
|
156
|
+
@property
|
157
|
+
def sub_model(self) -> Optional[str]:
|
158
|
+
return self["subModel"]
|
159
|
+
|
160
|
+
def gear_box(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
|
161
|
+
return self["gearBox"][field]
|
162
|
+
|
163
|
+
def car_family_types(self, field: Field = Field.TEXT) -> Optional[List[FieldTypes]]:
|
164
|
+
return [obj[field] for obj in self["carFamilyType"]]
|
165
|
+
|
166
|
+
def engine_type(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
|
167
|
+
return self["engineType"][field]
|
168
|
+
|
169
|
+
@property
|
170
|
+
def seats(self) -> Optional[int]:
|
171
|
+
return self["seats"]
|
172
|
+
|
173
|
+
@property
|
174
|
+
def number_of_doors(self) -> Optional[int]:
|
175
|
+
return self["numberOfDoors"]
|
176
|
+
|
177
|
+
@property
|
178
|
+
def owner(self) -> Optional[str]:
|
179
|
+
return self["owner"]["text"]
|
180
|
+
|
181
|
+
@property
|
182
|
+
def body_type(self) -> Optional[str]:
|
183
|
+
return self["bodyType"]["text"]
|
184
|
+
|
185
|
+
@property
|
186
|
+
def combined_fuel_consumption(self) -> Optional[float]:
|
187
|
+
return self["combinedFuelConsumption"]
|
188
|
+
|
189
|
+
@property
|
190
|
+
def power_train_architecture(self) -> Optional[str]:
|
191
|
+
return self["powertrainArchitecture"]
|
192
|
+
|
193
|
+
def car_tags(self, field: Field = Field.TEXT) -> Optional[List[FieldTypes]]:
|
194
|
+
return [obj[field] for obj in self["carTag"]]
|
195
|
+
|
196
|
+
@property
|
197
|
+
def specification(self) -> dict:
|
198
|
+
return self["specification"]
|
199
|
+
|
200
|
+
@property
def has_air_conditioner(self) -> Optional[bool]:
    """Return the ``"airConditioner"`` entry of ``specification``."""
    spec = self.specification
    return spec["airConditioner"]
|
203
|
+
|
204
|
+
@property
def has_power_steering(self) -> Optional[bool]:
    """Return the ``"powerSteering"`` entry of ``specification``."""
    spec = self.specification
    return spec["powerSteering"]
|
207
|
+
|
208
|
+
@property
def has_magnesium_wheel(self) -> Optional[bool]:
    """Return the ``"magnesiumWheel"`` entry of ``specification``."""
    spec = self.specification
    return spec["magnesiumWheel"]
|
211
|
+
|
212
|
+
@property
def has_tire_pressure_monitoring_system(self) -> Optional[bool]:
    """Return the ``"tirePressureMonitoringSystem"`` entry of ``specification``."""
    spec = self.specification
    return spec["tirePressureMonitoringSystem"]
|
215
|
+
|
216
|
+
@property
def has_abs(self) -> Optional[bool]:
    """Return the ``"abs"`` entry of ``specification``."""
    spec = self.specification
    return spec["abs"]
|
219
|
+
|
220
|
+
@property
def air_bags(self) -> Optional[int]:
    """Return the ``"airBags"`` entry of ``specification``."""
    spec = self.specification
    return spec["airBags"]
|
223
|
+
|
224
|
+
@property
def has_control_stability(self) -> Optional[bool]:
    """Return the ``"controlStability"`` entry of ``specification``."""
    spec = self.specification
    return spec["controlStability"]
|
227
|
+
|
228
|
+
@property
def has_electric_window(self) -> Optional[int]:
    """Return the ``"electricWindow"`` entry of ``specification``.

    NOTE(review): the name reads like a flag but the annotation is
    ``Optional[int]`` - confirm the actual value type against real data.
    """
    spec = self.specification
    return spec["electricWindow"]
|
231
|
+
|
232
|
+
@property
def has_breaking_assist_system(self) -> Optional[bool]:
    """Return the ``"breakingAssistSystem"`` entry of ``specification``."""
    spec = self.specification
    return spec["breakingAssistSystem"]
|
235
|
+
|
236
|
+
@property
def has_reverse_camera(self) -> Optional[bool]:
    """Return the ``"reverseCamera"`` entry of ``specification``."""
    spec = self.specification
    return spec["reverseCamera"]
|
239
|
+
|
240
|
+
@property
def has_adaptive_cruise_control(self) -> Optional[bool]:
    """Return the ``"adaptiveCruiseControl"`` entry of ``specification``."""
    spec = self.specification
    return spec["adaptiveCruiseControl"]
|
243
|
+
|
244
|
+
@property
def has_high_beams_auto_control(self) -> Optional[bool]:
    """Return the ``"highBeamsAutoControl"`` entry of ``specification``."""
    spec = self.specification
    return spec["highBeamsAutoControl"]
|
247
|
+
|
248
|
+
@property
def has_blind_spot_assist(self) -> Optional[bool]:
    """Return the ``"blindSpotAssist"`` entry of ``specification``."""
    spec = self.specification
    return spec["blindSpotAssist"]
|
251
|
+
|
252
|
+
@property
def has_identify_pedestrians(self) -> Optional[bool]:
    """Return the ``"identifyPedestrians"`` entry of ``specification``."""
    spec = self.specification
    return spec["identifyPedestrians"]
|
255
|
+
|
256
|
+
@property
def has_seat_belts_sensors(self) -> Optional[bool]:
    """Return the ``"seatBeltsSensors"`` entry of ``specification``."""
    spec = self.specification
    return spec["seatBeltsSensors"]
|
259
|
+
|
260
|
+
@property
def has_identifying_dangerous_nearing(self) -> Optional[bool]:
    """Return the ``"identifyingDangerousNearing"`` entry of ``specification``."""
    spec = self.specification
    return spec["identifyingDangerousNearing"]
|
263
|
+
|
264
|
+
@property
def has_auto_lighting_in_forward(self) -> Optional[bool]:
    """Return the ``"autoLightingInForward"`` entry of ``specification``."""
    spec = self.specification
    return spec["autoLightingInForward"]
|
267
|
+
|
268
|
+
@property
def has_identify_traffic_signs(self) -> Optional[bool]:
    """Return the ``"identifyTrafficSigns"`` entry of ``specification``."""
    spec = self.specification
    return spec["identifyTrafficSigns"]
|
271
|
+
|
272
|
+
def ignition(self, field: Field = Field.TEXT) -> Optional[FieldTypes]:
    """Return the requested ``field`` of the ``"ignition"`` entry of ``specification``.

    :param field: Key used to index the ``"ignition"`` entry (defaults to ``Field.TEXT``).
    """
    ignition_entry = self.specification["ignition"]
    return ignition_entry[field]
|
274
|
+
|
275
|
+
@property
def safety_points(self) -> Optional[int]:
    """Return the ``"safetyPoints"`` entry of ``specification``."""
    spec = self.specification
    return spec["safetyPoints"]
|
278
|
+
|
279
|
+
@property
def is_handicapped_friendly(self) -> Optional[bool]:
    """Return the ``"isHandicappedFriendly"`` entry of ``specification``."""
    spec = self.specification
    return spec["isHandicappedFriendly"]
|
282
|
+
|
283
|
+
@property
def has_sun_roof(self) -> Optional[bool]:
    """Return the ``"sunRoof"`` entry of ``specification``."""
    spec = self.specification
    return spec["sunRoof"]
|
286
|
+
|
287
|
+
@property
def is_turbo(self) -> Optional[bool]:
    """Return the ``"isTurbo"`` entry of ``specification``."""
    spec = self.specification
    return spec["isTurbo"]
|
290
|
+
|
291
|
+
@property
def has_road_deviation_control(self) -> Optional[bool]:
    """Return the ``"roadDeviationControl"`` entry of ``specification``."""
    spec = self.specification
    return spec["roadDeviationControl"]
|
294
|
+
|
295
|
+
@property
def has_forward_distance_monitor(self) -> Optional[bool]:
    """Return the ``"forwardDistanceMonitor"`` entry of ``specification``."""
    spec = self.specification
    return spec["forwardDistanceMonitor"]
|
298
|
+
|
299
|
+
@property
def has_box(self) -> Optional[bool]:
    """Return the ``"box"`` entry of ``specification``."""
    spec = self.specification
    return spec["box"]
|
302
|
+
|
303
|
+
def __getitem__(self, key: str) -> Any:
    """Look up ``key`` in ``self.data`` so the object supports ``obj[key]`` access."""
    raw_data = self.data
    return raw_data[key]
|
305
|
+
|
306
|
+
|
307
|
+
class VehiclesNextData(NextData):
    """``NextData`` specialisation that exposes the vehicle entries found in its queries."""

    def iterate_vehicles(self) -> Iterator[VehicleData]:
        """Yield a ``VehicleData`` for every dict found in the list values of each query payload.

        For each entry in ``self.queries`` the ``state["data"]`` payload is
        inspected. Only dict payloads are considered, and only their
        list-valued entries are walked, so scalar entries (numbers, booleans,
        ``None``) no longer raise ``TypeError`` during iteration.

        :return: An iterator of ``VehicleData`` objects, one per dict item.
        """
        for query in self.queries:
            data = query["state"].get("data")

            # Empty, missing, list-shaped or scalar payloads carry no vehicle groups.
            if not isinstance(data, dict):
                continue

            for group in data.values():
                # Iterating a dict or string value could never yield a dict,
                # and iterating a scalar would raise, so only lists are walked.
                if not isinstance(group, list):
                    continue

                for vehicle_data in group:
                    if isinstance(vehicle_data, dict):
                        yield VehicleData(vehicle_data)

    def __getitem__(self, item):
        """Return ``self.data[item]`` so the object supports ``obj[item]`` access."""
        return self.data[item]
|
@@ -0,0 +1,26 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
from yad2_scraper.query import QueryFilters, OrderBy, NumberRange, format_number_range
|
5
|
+
|
6
|
+
|
7
|
+
class OrderVehiclesBy(int, Enum):
    """Integer-valued ordering options for vehicle queries."""

    # Options re-exported from the generic ``OrderBy`` enum.
    DATE = OrderBy.DATE
    PRICE_LOWEST_TO_HIGHEST = OrderBy.PRICE_LOWEST_TO_HIGHEST
    PRICE_HIGHEST_TO_LOWEST = OrderBy.PRICE_HIGHEST_TO_LOWEST

    # Options declared only for vehicles.
    DISTANCE_LOWEST_TO_HIGHEST = 5
    YEAR_HIGHEST_TO_LOWEST = 6
|
13
|
+
|
14
|
+
|
15
|
+
class VehiclesQueryFilters(QueryFilters):
    """Pydantic model representing query filters for querying a vehicle resource."""

    # Optional range filter, emitted as the "year" query parameter.
    year_range: Optional[NumberRange] = None

    def to_params(self) -> dict:
        """Return the parent's query parameters plus a ``"year"`` entry.

        The ``"year"`` value is ``format_number_range(self.year_range)``.
        """
        params = dict(super().to_params())
        params["year"] = format_number_range(self.year_range)
        return params
|
25
|
+
|
26
|
+
# TODO: add QueryParams class for each vehicle category (some share the same attributes, sometimes with different enums)
|
@@ -0,0 +1,65 @@
|
|
1
|
+
from functools import cached_property
|
2
|
+
from bs4 import Tag
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from yad2_scraper.utils import join_url, find_html_tag_by_class_substring
|
6
|
+
from yad2_scraper.vehicles.urls import VEHICLES_URL
|
7
|
+
|
8
|
+
YEAR_AND_HAND_TAG_SEPARATOR = " • "
|
9
|
+
|
10
|
+
|
11
|
+
class VehicleTag:
    """Wrap the HTML tag of one vehicle listing and expose its details.

    Lookups inside the tag are delegated to ``find_tag_by_class_substring``.
    The raw string lookups are stored per instance via ``cached_property``.
    """

    def __init__(self, tag: Tag):
        """Keep a reference to the listing's ``tag`` for later lookups."""
        self.tag = tag

    @cached_property
    def relative_link(self) -> str:
        """Return the ``href`` of the ``a`` tag looked up with ``"itemLink"``."""
        link_tag = self.find_tag_by_class_substring("a", "itemLink")
        return link_tag["href"]

    @property
    def page_link(self) -> str:
        """Return ``relative_link`` joined onto ``VEHICLES_URL``."""
        relative = self.relative_link
        return join_url(VEHICLES_URL, relative)

    @cached_property
    def image_url(self) -> str:
        """Return the ``src`` of the ``img`` tag looked up with ``"image"``."""
        image_tag = self.find_tag_by_class_substring("img", "image")
        return image_tag["src"]

    @cached_property
    def model(self) -> str:
        """Return the stripped text of the ``span`` tag looked up with ``"heading"``."""
        heading_tag = self.find_tag_by_class_substring("span", "heading")
        return heading_tag.text.strip()

    @cached_property
    def marketing_text(self) -> str:
        """Return the stripped text of the ``span`` tag looked up with ``"marketingText"``."""
        marketing_tag = self.find_tag_by_class_substring("span", "marketingText")
        return marketing_tag.text.strip()

    @cached_property
    def year_and_hand_string(self) -> str:
        """Return the stripped text of the ``span`` tag looked up with ``"yearAndHand"``."""
        year_and_hand_tag = self.find_tag_by_class_substring("span", "yearAndHand")
        return year_and_hand_tag.text.strip()

    @property
    def year(self) -> int:
        """Return the part before the separator in ``year_and_hand_string`` as an int.

        Raises ``ValueError`` when the string does not split into exactly two
        parts or the first part is not an integer.
        """
        year_text, _hand_text = self.year_and_hand_string.split(YEAR_AND_HAND_TAG_SEPARATOR)
        return int(year_text)

    @property
    def hand(self) -> int:
        """Return the last of the two words after the separator as an int.

        Raises ``ValueError`` when either split does not produce exactly two
        parts or the final word is not an integer.
        """
        _year_text, hand_text = self.year_and_hand_string.split(YEAR_AND_HAND_TAG_SEPARATOR)
        _label, hand_number = hand_text.split()
        return int(hand_number)

    @cached_property
    def price_string(self) -> str:
        """Return the stripped text of the ``span`` tag looked up with ``"price"``."""
        price_tag = self.find_tag_by_class_substring("span", "price")
        return price_tag.text.strip()

    @property
    def price(self) -> Optional[int]:
        """Return the first of the two words in ``price_string`` as an int.

        Commas are removed before conversion. ``None`` is returned when the
        string does not consist of exactly two whitespace-separated words or
        the first word is not an integer.
        """
        try:
            words = self.price_string.split()
            if len(words) != 2:
                return None
            return int(words[0].replace(",", ""))
        except ValueError:
            return None

    def find_tag_by_class_substring(self, tag_name: str, substring: str) -> Tag:
        """Call ``find_html_tag_by_class_substring`` on this listing's tag."""
        root_tag = self.tag
        return find_html_tag_by_class_substring(root_tag, tag_name, substring)
|
@@ -0,0 +1,20 @@
|
|
1
|
+
from typing import Literal, get_args
|
2
|
+
|
3
|
+
from yad2_scraper.utils import join_url
|
4
|
+
from yad2_scraper.constants import BASE_URL
|
5
|
+
|
6
|
+
VEHICLES_URL = join_url(BASE_URL, "vehicles")
|
7
|
+
|
8
|
+
VehicleCategory = Literal["cars", "motorcycles", "scooters", "trucks", "watercraft", "others"]
|
9
|
+
|
10
|
+
_VALID_VEHICLE_CATEGORIES = get_args(VehicleCategory)
|
11
|
+
|
12
|
+
|
13
|
+
def get_vehicle_category_url(vehicle_category: VehicleCategory) -> str:
    """Return ``VEHICLES_URL`` joined with ``vehicle_category``.

    :param vehicle_category: One of the values in ``_VALID_VEHICLE_CATEGORIES``.
    :raises ValueError: If ``vehicle_category`` is not one of those values.
    """
    if vehicle_category in _VALID_VEHICLE_CATEGORIES:
        return join_url(VEHICLES_URL, vehicle_category)

    raise ValueError(
        f"Invalid vehicle category: {repr(vehicle_category)}. Expected one of {_VALID_VEHICLE_CATEGORIES}"
    )
|
@@ -0,0 +1,164 @@
|
|
1
|
+
Metadata-Version: 2.3
|
2
|
+
Name: yad2-scraper
|
3
|
+
Version: 0.5.1
|
4
|
+
Summary: Scrape Yad2 in Python.
|
5
|
+
License: LICENSE
|
6
|
+
Author: dav ost
|
7
|
+
Author-email: davidost2003@gmail.com
|
8
|
+
Requires-Python: >=3.8
|
9
|
+
Classifier: License :: Other/Proprietary License
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
17
|
+
Requires-Dist: beautifulsoup4 (>=4.11.1,<5.0.0)
|
18
|
+
Requires-Dist: fake-useragent (>=0.1.11,<0.2.0)
|
19
|
+
Requires-Dist: httpcore (>=0.15.0)
|
20
|
+
Requires-Dist: httpx (>=0.24.0,<0.25.0)
|
21
|
+
Requires-Dist: pydantic (>=1.10.0,<2.0.0)
|
22
|
+
Description-Content-Type: text/markdown
|
23
|
+
|
24
|
+
# Yad2 Scraper
|
25
|
+
|
26
|
+
A Python package for scraping listings from [Yad2](https://www.yad2.co.il/), Israel's leading classifieds platform.
|
27
|
+
This package provides a simple and flexible interface to fetch data, filter results, and extract relevant information.
|
28
|
+
|
29
|
+
__NOTE__: Currently, the package primarily supports the **vehicles category**.
|
30
|
+
Support for additional categories may be added in future updates.
|
31
|
+
|
32
|
+
---
|
33
|
+
|
34
|
+
## Features
|
35
|
+
|
36
|
+
- **Fetch Listings**: Retrieve listings by category (e.g., vehicles, real-estate, etc.).
|
37
|
+
- **Filter Results**: Apply filters such as price range, year range, and sorting order.
|
38
|
+
- **Dynamic URL Generation**: Generate URLs for specific categories and filters.
|
39
|
+
- **Type-Safe API**: Uses Python type hints (`Literal`, `Optional`, etc.) for better code clarity and safety.
|
40
|
+
- **Extensible**: Easily extendable to support additional categories and filters.
|
41
|
+
|
42
|
+
---
|
43
|
+
|
44
|
+
## Installation
|
45
|
+
|
46
|
+
Install the package using `pip`:
|
47
|
+
|
48
|
+
```bash
|
49
|
+
pip install yad2-scraper
|
50
|
+
```
|
51
|
+
|
52
|
+
## Usage
|
53
|
+
|
54
|
+
### Fetching Category Listings
|
55
|
+
|
56
|
+
To fetch any category, use the `fetch_category` function:
|
57
|
+
|
58
|
+
```python
|
59
|
+
from yad2_scraper import fetch_category, Yad2Category
|
60
|
+
|
61
|
+
# Fetch real estate category (returns a generic Yad2Category object)
|
62
|
+
real_estate_category_page1 = fetch_category("https://www.yad2.co.il/realestate/forsale", page=1)
|
63
|
+
...
|
64
|
+
real_estate_category_page2 = fetch_category("https://www.yad2.co.il/realestate/forsale", page=2)
|
65
|
+
...
|
66
|
+
```
|
67
|
+
|
68
|
+
### Fetching Vehicle Listings
|
69
|
+
|
70
|
+
To fetch vehicle listings for a specific category, use the `fetch_vehicle_category` function:
|
71
|
+
|
72
|
+
```python
|
73
|
+
from yad2_scraper import fetch_vehicle_category, OrderVehiclesBy, Field
|
74
|
+
|
75
|
+
# Fetch cars category
|
76
|
+
cars_category = fetch_vehicle_category("cars")
|
77
|
+
|
78
|
+
for car_data in cars_category.load_next_data().iterate_vehicles():
|
79
|
+
print(car_data.model(Field.ENGLISH_TEXT))
|
80
|
+
print(car_data.test_date)
|
81
|
+
print(car_data.price)
|
82
|
+
...
|
83
|
+
|
84
|
+
# Fetch motorcycles category
|
85
|
+
motorcycle_categories = fetch_vehicle_category(
|
86
|
+
"motorcycles",
|
87
|
+
price_range=(5000, 15000),
|
88
|
+
year_range=(2010, 2020),
|
89
|
+
order_by=OrderVehiclesBy.PRICE_LOWEST_TO_HIGHEST
|
90
|
+
)
|
91
|
+
|
92
|
+
for motorcycle_tag in motorcycle_categories.get_vehicle_tags():
|
93
|
+
print(motorcycle_tag.page_link)
|
94
|
+
print(motorcycle_tag.hand)
|
95
|
+
print(motorcycle_tag.price)
|
96
|
+
...
|
97
|
+
```
|
98
|
+
|
99
|
+
### The Scraper Object
|
100
|
+
|
101
|
+
The `Yad2Scraper` class is the core of the package.
|
102
|
+
It handles HTTP requests, parses responses, and provides methods to fetch and filter vehicle listings.
|
103
|
+
|
104
|
+
#### Creating a Scraper Instance
|
105
|
+
|
106
|
+
You can create a `Yad2Scraper` instance manually or use the default scraper provided by the package:
|
107
|
+
|
108
|
+
```python
|
109
|
+
from yad2_scraper import Yad2Scraper, get_default_scraper
|
110
|
+
|
111
|
+
# Create a custom scraper instance
|
112
|
+
scraper = Yad2Scraper()
|
113
|
+
|
114
|
+
# Use the default scraper
|
115
|
+
default_scraper = get_default_scraper()
|
116
|
+
```
|
117
|
+
|
118
|
+
#### Fetching Category Listings
|
119
|
+
|
120
|
+
The `fetch_category` method is used to fetch listings for a specific category.
|
121
|
+
It takes a URL, a `Category` type, and optionally query params as arguments:
|
122
|
+
|
123
|
+
```python
|
124
|
+
from yad2_scraper import Yad2Scraper, Yad2Category, QueryFilters, OrderBy
|
125
|
+
from yad2_scraper.vehicles import (
|
126
|
+
Yad2VehiclesCategory,
|
127
|
+
VehiclesQueryFilters,
|
128
|
+
OrderVehiclesBy,
|
129
|
+
get_vehicle_category_url
|
130
|
+
)
|
131
|
+
|
132
|
+
# Fetch businesses for sale category with filters
|
133
|
+
scraper = Yad2Scraper()
|
134
|
+
url = "https://www.yad2.co.il/products/businesses-for-sale"
|
135
|
+
query_filters = QueryFilters(price_range=(10000, 250000), order_by=OrderBy.PRICE_LOWEST_TO_HIGHEST)
|
136
|
+
businesses_category = scraper.fetch_category(url, Yad2Category, params=query_filters)
|
137
|
+
|
138
|
+
# Fetch watercraft (vehicle) category with filters
|
139
|
+
url = get_vehicle_category_url("watercraft")
|
140
|
+
query_filters = VehiclesQueryFilters(year_range=(2010, 2020), order_by=OrderVehiclesBy.DATE)
|
141
|
+
watercraft_category = scraper.fetch_category(url, Yad2VehiclesCategory, params=query_filters)
|
142
|
+
```
|
143
|
+
|
144
|
+
#### Attributes & Methods
|
145
|
+
|
146
|
+
The `Yad2Scraper` object exposes many additional attributes and methods.
|
147
|
+
Please check out the actual code documentation for more details.
|
148
|
+
|
149
|
+
## Contributing
|
150
|
+
|
151
|
+
Contributions are welcome! Here’s how you can get started:
|
152
|
+
|
153
|
+
1. Fork the repository.
|
154
|
+
2. Create a new branch for your feature or bugfix.
|
155
|
+
3. Write tests for your changes.
|
156
|
+
4. Submit a pull request.
|
157
|
+
|
158
|
+
## License
|
159
|
+
|
160
|
+
This project is licensed under the MIT License. See the LICENSE file for details.
|
161
|
+
|
162
|
+
## Support
|
163
|
+
|
164
|
+
For questions, issues, or feature requests, please open an issue on the GitHub repository.
|
@@ -0,0 +1,18 @@
|
|
1
|
+
yad2_scraper/__init__.py,sha256=oLANQo7jrtR5ex1tv4sM5ppaW9JpHS70Knsp0ZgVzm0,3708
|
2
|
+
yad2_scraper/category.py,sha256=SQ2eg0-fQ9hEaNryYpWVFaJqCx1d65t2_E_S3qpuw9g,1230
|
3
|
+
yad2_scraper/constants.py,sha256=8zXJ31fRqkDIOJp96BRK1PJofGXX8SG64YcfmJnVW8Q,910
|
4
|
+
yad2_scraper/exceptions.py,sha256=5yentEUBuEGItwRcjtZY89A19rvFErcTy4S4GUtY_WY,1526
|
5
|
+
yad2_scraper/next_data.py,sha256=OcZ7ingXSd6sLNkqQPz6NVTeEDbMkOai9QONFErc3FI,1977
|
6
|
+
yad2_scraper/query.py,sha256=HPBoLE6xFjsmvBFR2ULvPq96XXl-2zOqXt7LnHgetIk,1438
|
7
|
+
yad2_scraper/scraper.py,sha256=VA-P24Gvn1y5Pkn_n3hDdpVl1aeEnLoC82eBYteAbWQ,11816
|
8
|
+
yad2_scraper/utils.py,sha256=UDpFKel_TJa0dJv1FV-CVqA8-uaFo_hDcooiFAkSZI8,1578
|
9
|
+
yad2_scraper/vehicles/__init__.py,sha256=dxjZcNv3ExnN3fKW-m1oqKiX9YC7gj8lqpIa3uWo9iI,242
|
10
|
+
yad2_scraper/vehicles/category.py,sha256=HdUGCVpC1jw2V-2XvyAC4pPlVQR6cwHyVKDxS3pfQhc,744
|
11
|
+
yad2_scraper/vehicles/next_data.py,sha256=lEIWcTP7BOFDC3lL0FhBGp6u-7hsgGdbbrH0iw0Ux20,9203
|
12
|
+
yad2_scraper/vehicles/query.py,sha256=ieIJSGJELcgzqtJh6bQXalvDg743LnI2RYrAyHDIH80,912
|
13
|
+
yad2_scraper/vehicles/tag.py,sha256=Wj7v2c8IPQLYHVkfzP1UiulKKJE4yLqnbeh81nvWZhU,2052
|
14
|
+
yad2_scraper/vehicles/urls.py,sha256=zxipWjm0SXn2gGOBWw9VqKAJ59mhIGpzd_fTYitpW8c,715
|
15
|
+
yad2_scraper-0.5.1.dist-info/LICENSE,sha256=JCpnDxMx2kE40e0UQ1svSmifrLWg2Gni5VTkJR68thY,1065
|
16
|
+
yad2_scraper-0.5.1.dist-info/METADATA,sha256=SLeA6BPi1idJ20WWWbl7AW-hC_u1_vKPRmUTg4_VhVI,5225
|
17
|
+
yad2_scraper-0.5.1.dist-info/WHEEL,sha256=7dDg4QLnNKTvwIDR9Ac8jJaAmBC_owJrckbC0jjThyA,88
|
18
|
+
yad2_scraper-0.5.1.dist-info/RECORD,,
|
@@ -1,27 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.3
|
2
|
-
Name: yad2-scraper
|
3
|
-
Version: 0.4.0
|
4
|
-
Summary: Scrape Yad2 in Python.
|
5
|
-
License: LICENSE
|
6
|
-
Author: dav ost
|
7
|
-
Author-email: davidost2003@gmail.com
|
8
|
-
Requires-Python: >=3.7
|
9
|
-
Classifier: License :: Other/Proprietary License
|
10
|
-
Classifier: Programming Language :: Python :: 3
|
11
|
-
Classifier: Programming Language :: Python :: 3.7
|
12
|
-
Classifier: Programming Language :: Python :: 3.8
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
16
|
-
Classifier: Programming Language :: Python :: 3.12
|
17
|
-
Classifier: Programming Language :: Python :: 3.13
|
18
|
-
Requires-Dist: beautifulsoup4 (>=4.11.1,<5.0.0)
|
19
|
-
Requires-Dist: fake-useragent (>=0.1.11,<0.2.0)
|
20
|
-
Requires-Dist: httpcore (>=0.15.0)
|
21
|
-
Requires-Dist: httpx (>=0.24.0,<0.25.0)
|
22
|
-
Requires-Dist: pydantic (>=1.10.0,<2.0.0)
|
23
|
-
Description-Content-Type: text/markdown
|
24
|
-
|
25
|
-
# yad2-scraper
|
26
|
-
Scrape Yad2 in Python.
|
27
|
-
|
@@ -1,12 +0,0 @@
|
|
1
|
-
yad2_scraper/__init__.py,sha256=UUiIk6TAHTAP4IY86bIR4TcY3VVMTCyEF0Sq1MSneMM,141
|
2
|
-
yad2_scraper/category.py,sha256=KXLyjMOlPzu3xj08-uRmffAMD83DbqFVm-y1-T83Djw,910
|
3
|
-
yad2_scraper/constants.py,sha256=8zXJ31fRqkDIOJp96BRK1PJofGXX8SG64YcfmJnVW8Q,910
|
4
|
-
yad2_scraper/exceptions.py,sha256=_IcuDdJPKAznSUp_c3fLEuTnDdKf0NLJqpRPs0IzdXw,979
|
5
|
-
yad2_scraper/next_data.py,sha256=-vqvXJqugk-895_kOnwb7J8kUjugg28Aqrh4Z_ct11M,512
|
6
|
-
yad2_scraper/query.py,sha256=WaOWUlyNye9MNXv3hkiUaBFDeV9lbkvHiaDHWYKzgtY,1194
|
7
|
-
yad2_scraper/scraper.py,sha256=sgDpfnKlBSDIWEb2enpQ5O9E5fJvXz3cDOnGXHGCJL4,6653
|
8
|
-
yad2_scraper/utils.py,sha256=e6tqaN5Gw9BXunOQ1V919NkLrZREN7TdMsDuOvZgrcY,713
|
9
|
-
yad2_scraper-0.4.0.dist-info/LICENSE,sha256=JCpnDxMx2kE40e0UQ1svSmifrLWg2Gni5VTkJR68thY,1065
|
10
|
-
yad2_scraper-0.4.0.dist-info/METADATA,sha256=TDO8cS7t4aGZ-B-XikL4hSneizcb7TzLtBsV7dQGl1k,925
|
11
|
-
yad2_scraper-0.4.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
12
|
-
yad2_scraper-0.4.0.dist-info/RECORD,,
|
File without changes
|