yad2-scraper 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yad2_scraper/__init__.py +98 -3
- yad2_scraper/category.py +6 -0
- yad2_scraper/exceptions.py +11 -0
- yad2_scraper/next_data.py +14 -5
- yad2_scraper/query.py +5 -3
- yad2_scraper/scraper.py +130 -1
- yad2_scraper/utils.py +9 -8
- yad2_scraper/vehicles/__init__.py +1 -1
- yad2_scraper/vehicles/category.py +2 -0
- yad2_scraper/vehicles/next_data.py +2 -0
- yad2_scraper/vehicles/query.py +3 -2
- yad2_scraper/vehicles/tag.py +2 -0
- yad2_scraper/vehicles/urls.py +10 -6
- yad2_scraper-0.5.1.dist-info/METADATA +164 -0
- yad2_scraper-0.5.1.dist-info/RECORD +18 -0
- {yad2_scraper-0.5.0.dist-info → yad2_scraper-0.5.1.dist-info}/WHEEL +1 -1
- yad2_scraper-0.5.0.dist-info/METADATA +0 -26
- yad2_scraper-0.5.0.dist-info/RECORD +0 -18
- {yad2_scraper-0.5.0.dist-info → yad2_scraper-0.5.1.dist-info}/LICENSE +0 -0
yad2_scraper/__init__.py
CHANGED
@@ -1,4 +1,99 @@
|
|
1
|
-
from
|
2
|
-
|
1
|
+
from typing import Optional, Type
|
2
|
+
|
3
|
+
from .scraper import Yad2Scraper, Category
|
4
|
+
from .query import QueryFilters, OrderBy, NumberRange
|
3
5
|
from .category import Yad2Category
|
4
|
-
from .next_data import NextData
|
6
|
+
from .next_data import NextData, Field
|
7
|
+
from .utils import any_param_specified
|
8
|
+
from .vehicles import (
|
9
|
+
Yad2VehiclesCategory,
|
10
|
+
VehiclesQueryFilters,
|
11
|
+
OrderVehiclesBy,
|
12
|
+
VehicleCategory,
|
13
|
+
get_vehicle_category_url
|
14
|
+
)
|
15
|
+
|
16
|
+
_default_scraper = None
|
17
|
+
|
18
|
+
|
19
|
+
def get_default_scraper() -> Yad2Scraper:
|
20
|
+
"""
|
21
|
+
Retrieves the default instance of the Yad2Scraper. If an instance does not already exist, it will be created.
|
22
|
+
|
23
|
+
Returns:
|
24
|
+
Yad2Scraper: The default instance of the Yad2Scraper.
|
25
|
+
|
26
|
+
Notes:
|
27
|
+
The default scraper is a singleton instance that is reused across multiple calls.
|
28
|
+
"""
|
29
|
+
global _default_scraper
|
30
|
+
|
31
|
+
if not _default_scraper:
|
32
|
+
_default_scraper = Yad2Scraper()
|
33
|
+
|
34
|
+
return _default_scraper
|
35
|
+
|
36
|
+
|
37
|
+
def fetch_category(
|
38
|
+
url: str,
|
39
|
+
category_type: Type[Category] = Yad2Category,
|
40
|
+
page: Optional[int] = None,
|
41
|
+
order_by: Optional[OrderBy] = None,
|
42
|
+
price_range: [NumberRange] = None
|
43
|
+
) -> Category:
|
44
|
+
"""
|
45
|
+
Fetches a specific category from the given URL, while applying optional filters.
|
46
|
+
|
47
|
+
Args:
|
48
|
+
url (str): The URL of the category to fetch.
|
49
|
+
category_type (Type[Category], optional): The type of category to return (default is `Yad2Category`).
|
50
|
+
page (Optional[int], optional): The page number for pagination (default is None).
|
51
|
+
order_by (Optional[OrderBy], optional): The sorting order for the results (default is None).
|
52
|
+
price_range (Optional[NumberRange], optional): The price range filter for the results (default is None).
|
53
|
+
|
54
|
+
Returns:
|
55
|
+
Category: An instance of the specified `category_type`, populated with the fetched data.
|
56
|
+
|
57
|
+
Notes:
|
58
|
+
This method uses the default scraper to retrieve the category.
|
59
|
+
"""
|
60
|
+
if any_param_specified(page, order_by, price_range):
|
61
|
+
params = QueryFilters(page=page, order_by=order_by, price_range=price_range)
|
62
|
+
else:
|
63
|
+
params = None
|
64
|
+
|
65
|
+
default_scraper = get_default_scraper()
|
66
|
+
return default_scraper.fetch_category(url, category_type, params=params)
|
67
|
+
|
68
|
+
|
69
|
+
def fetch_vehicle_category(
|
70
|
+
vehicle_category: VehicleCategory,
|
71
|
+
page: Optional[int] = None,
|
72
|
+
order_by: Optional[OrderVehiclesBy] = None,
|
73
|
+
price_range: [NumberRange] = None,
|
74
|
+
year_range: [NumberRange] = None
|
75
|
+
) -> Yad2VehiclesCategory:
|
76
|
+
"""
|
77
|
+
Fetches a specific vehicle category, while applying optional filters.
|
78
|
+
|
79
|
+
Args:
|
80
|
+
vehicle_category (VehicleCategory): The vehicle category to fetch.
|
81
|
+
page (Optional[int], optional): The page number for pagination (default is None).
|
82
|
+
order_by (Optional[OrderVehiclesBy], optional): The sorting order for the results (default is None).
|
83
|
+
price_range (Optional[NumberRange], optional): The price range filter for the results (default is None).
|
84
|
+
year_range (Optional[NumberRange], optional): The year range filter for the results (default is None).
|
85
|
+
|
86
|
+
Returns:
|
87
|
+
Yad2VehiclesCategory: An instance of `Yad2VehiclesCategory`, populated with the fetched vehicle category data.
|
88
|
+
|
89
|
+
Notes:
|
90
|
+
This method uses the default scraper to fetch the vehicle category.
|
91
|
+
"""
|
92
|
+
if any_param_specified(page, order_by, price_range, year_range):
|
93
|
+
params = VehiclesQueryFilters(page=page, order_by=order_by, price_range=price_range, year_range=year_range)
|
94
|
+
else:
|
95
|
+
params = None
|
96
|
+
|
97
|
+
url = get_vehicle_category_url(vehicle_category)
|
98
|
+
default_scraper = get_default_scraper()
|
99
|
+
return default_scraper.fetch_category(url, Yad2VehiclesCategory, params=params)
|
yad2_scraper/category.py
CHANGED
@@ -8,18 +8,24 @@ from yad2_scraper.constants import NEXT_DATA_SCRIPT_ID
|
|
8
8
|
|
9
9
|
|
10
10
|
class Yad2Category:
|
11
|
+
"""Represents a Yad2 category parsed from an HTML page."""
|
12
|
+
|
11
13
|
def __init__(self, soup: BeautifulSoup):
|
14
|
+
"""Initialize with a BeautifulSoup object."""
|
12
15
|
self.soup = soup
|
13
16
|
|
14
17
|
@classmethod
|
15
18
|
def from_html_io(cls, html_io: Union[TextIO, BinaryIO]):
|
19
|
+
"""Create an instance from an HTML file-like object."""
|
16
20
|
html = html_io.read()
|
17
21
|
soup = BeautifulSoup(html, "html.parser")
|
18
22
|
return cls(soup)
|
19
23
|
|
20
24
|
def load_next_data(self) -> Optional[NextData]:
|
25
|
+
"""Extract and parse Next.js data from the page."""
|
21
26
|
tag = self.soup.find("script", id=NEXT_DATA_SCRIPT_ID)
|
22
27
|
return NextData(json.loads(tag.string)) if tag else None
|
23
28
|
|
24
29
|
def find_all_tags_by_class_substring(self, tag_name: str, substring: str) -> List[Tag]:
|
30
|
+
"""Find all HTML tags with a class containing the given substring."""
|
25
31
|
return find_all_html_tags_by_class_substring(self.soup, tag_name, substring)
|
yad2_scraper/exceptions.py
CHANGED
@@ -3,29 +3,40 @@ from typing import List, Union
|
|
3
3
|
|
4
4
|
|
5
5
|
class ResponseError(Exception):
|
6
|
+
"""Represents an error response from an HTTP request."""
|
7
|
+
|
6
8
|
def __init__(self, msg: str, request: httpx.Request, response: httpx.Response):
|
9
|
+
"""Initialize with an error message, request, and response objects."""
|
7
10
|
super().__init__(msg)
|
8
11
|
self.request = request
|
9
12
|
self.response = response
|
10
13
|
|
11
14
|
|
12
15
|
class AntiBotDetectedError(ResponseError):
|
16
|
+
"""Raised when an anti-bot mechanism is detected."""
|
13
17
|
pass
|
14
18
|
|
15
19
|
|
16
20
|
class UnexpectedContentError(ResponseError):
|
21
|
+
"""Raised when the response content is not as expected."""
|
17
22
|
pass
|
18
23
|
|
19
24
|
|
20
25
|
class MaxAttemptsExceededError(Exception):
|
26
|
+
"""Raised when the maximum number of attempts is exceeded."""
|
27
|
+
|
21
28
|
def __init__(self, msg: str, max_attempts: int, errors: List[BaseException] = None):
|
29
|
+
"""Initialize with an error message, max attempts, and optional errors."""
|
22
30
|
super().__init__(msg)
|
23
31
|
self.max_attempts = max_attempts
|
24
32
|
self.errors = errors
|
25
33
|
|
26
34
|
|
27
35
|
class MaxRequestAttemptsExceededError(MaxAttemptsExceededError):
|
36
|
+
"""Raised when all HTTP request attempts fail."""
|
37
|
+
|
28
38
|
def __init__(self, method: str, url: str, max_attempts: int, errors: List[Union[httpx.HTTPError, ResponseError]]):
|
39
|
+
"""Initialize with request method, URL, max attempts, and error list."""
|
29
40
|
msg = f"All {max_attempts} attempts for {method} request to '{url}' have failed"
|
30
41
|
super().__init__(msg, max_attempts, errors)
|
31
42
|
self.method = method
|
yad2_scraper/next_data.py
CHANGED
@@ -6,19 +6,21 @@ from yad2_scraper.utils import safe_access
|
|
6
6
|
|
7
7
|
FieldTypes = Union[str, int]
|
8
8
|
|
9
|
-
|
9
|
+
_safe_access_optional_keys = safe_access(exceptions=(KeyError, TypeError), default=None)
|
10
10
|
|
11
11
|
|
12
12
|
class SafeAccessOptionalKeysMeta(type):
|
13
|
+
"""Metaclass that wraps methods and properties with safe access handling."""
|
14
|
+
|
13
15
|
def __new__(cls, name, bases, dictionary):
|
14
16
|
for attr_name, attr_value in dictionary.items():
|
15
17
|
if callable(attr_value): # Wrap methods
|
16
|
-
dictionary[attr_name] =
|
18
|
+
dictionary[attr_name] = _safe_access_optional_keys(attr_value)
|
17
19
|
elif isinstance(attr_value, property): # Wrap properties
|
18
20
|
dictionary[attr_name] = property(
|
19
|
-
|
20
|
-
|
21
|
-
|
21
|
+
_safe_access_optional_keys(attr_value.fget) if attr_value.fget else None,
|
22
|
+
_safe_access_optional_keys(attr_value.fset) if attr_value.fset else None,
|
23
|
+
_safe_access_optional_keys(attr_value.fdel) if attr_value.fdel else None,
|
22
24
|
attr_value.__doc__,
|
23
25
|
)
|
24
26
|
return super().__new__(cls, name, bases, dictionary)
|
@@ -31,20 +33,27 @@ class Field(str, Enum):
|
|
31
33
|
|
32
34
|
|
33
35
|
def convert_string_date_to_datetime(date_string: str) -> datetime:
|
36
|
+
"""Convert an ISO format string to a datetime object."""
|
34
37
|
return datetime.fromisoformat(date_string)
|
35
38
|
|
36
39
|
|
37
40
|
class NextData:
|
41
|
+
"""Represents structured Next.js data."""
|
42
|
+
|
38
43
|
def __init__(self, data: dict):
|
44
|
+
"""Initialize with Next.js data dictionary."""
|
39
45
|
self.data = data
|
40
46
|
|
41
47
|
@property
|
42
48
|
def json(self) -> dict:
|
49
|
+
"""Return raw JSON data."""
|
43
50
|
return self.data
|
44
51
|
|
45
52
|
@property
|
46
53
|
def queries(self) -> List[dict]:
|
54
|
+
"""Extract query data from Next.js state."""
|
47
55
|
return self.data["props"]["pageProps"]["dehydratedState"]["queries"]
|
48
56
|
|
49
57
|
def __getitem__(self, item):
|
58
|
+
"""Allow dictionary-style access to data."""
|
50
59
|
return self.data[item]
|
yad2_scraper/query.py
CHANGED
@@ -13,6 +13,7 @@ class OrderBy(int, Enum):
|
|
13
13
|
|
14
14
|
|
15
15
|
def format_number_range(number_range: Optional[Tuple[int, int]]) -> Optional[str]:
|
16
|
+
"""Format a number range as 'min_value-max_value'."""
|
16
17
|
if number_range is None:
|
17
18
|
return None
|
18
19
|
|
@@ -25,12 +26,13 @@ def format_number_range(number_range: Optional[Tuple[int, int]]) -> Optional[str
|
|
25
26
|
|
26
27
|
|
27
28
|
class QueryFilters(BaseModel):
|
29
|
+
"""Pydantic model representing query filters for querying a resource."""
|
28
30
|
page: Optional[int] = None
|
29
31
|
order_by: Optional[OrderBy] = None
|
30
32
|
price_range: Optional[NumberRange] = None
|
31
|
-
...
|
32
33
|
|
33
34
|
def to_params(self) -> dict:
|
35
|
+
"""Convert filter fields to query parameters."""
|
34
36
|
return {
|
35
37
|
"page": self.page,
|
36
38
|
"Order": self.order_by,
|
@@ -38,9 +40,9 @@ class QueryFilters(BaseModel):
|
|
38
40
|
}
|
39
41
|
|
40
42
|
def to_clean_params(self):
|
43
|
+
"""Return query parameters excluding None values."""
|
41
44
|
return {key: value for key, value in self.to_params().items() if value is not None}
|
42
45
|
|
43
|
-
# TODO: add helper methods for managing the attribute values
|
44
|
-
|
45
46
|
def __iter__(self):
|
47
|
+
"""Allow iteration over the clean query parameters."""
|
46
48
|
yield from self.to_clean_params().items()
|
yad2_scraper/scraper.py
CHANGED
@@ -24,6 +24,8 @@ logger = logging.getLogger(__name__)
|
|
24
24
|
|
25
25
|
|
26
26
|
class Yad2Scraper:
|
27
|
+
"""A scraper for fetching data from the Yad2 website, with retry attempts, wait strategies and User-Agent randomization."""
|
28
|
+
|
27
29
|
def __init__(
|
28
30
|
self,
|
29
31
|
client: Optional[httpx.Client] = None,
|
@@ -32,6 +34,16 @@ class Yad2Scraper:
|
|
32
34
|
wait_strategy: Optional[WaitStrategy] = None,
|
33
35
|
max_request_attempts: int = 1
|
34
36
|
):
|
37
|
+
"""
|
38
|
+
Initializes the Yad2Scraper with provided parameters.
|
39
|
+
|
40
|
+
Args:
|
41
|
+
client (Optional[httpx.Client]): An optional custom HTTP client. If not provided, a default client is used.
|
42
|
+
request_defaults (Optional[Dict[str, Any]]): Default parameters for requests such as headers, params, etc.
|
43
|
+
randomize_user_agent (bool): If True, a random User-Agent will be set for each request. Defaults to True.
|
44
|
+
wait_strategy (Optional[WaitStrategy]): A function to determine the wait time between requests.
|
45
|
+
max_request_attempts (int): The maximum number of attempts made for each request, including the first one. Defaults to 1.
|
46
|
+
"""
|
35
47
|
self.client = client or httpx.Client(
|
36
48
|
headers=DEFAULT_REQUEST_HEADERS,
|
37
49
|
follow_redirects=ALLOW_REQUEST_REDIRECTS,
|
@@ -41,14 +53,32 @@ class Yad2Scraper:
|
|
41
53
|
self.randomize_user_agent = randomize_user_agent
|
42
54
|
self.wait_strategy = wait_strategy
|
43
55
|
self.max_request_attempts = max_request_attempts
|
56
|
+
self._request_count = 0
|
44
57
|
|
45
58
|
logger.debug(f"Scraper initialized with client: {self.client}")
|
46
59
|
|
60
|
+
@property
|
61
|
+
def request_count(self) -> int:
|
62
|
+
"""Returns the number of requests made by the scraper so far."""
|
63
|
+
return self._request_count
|
64
|
+
|
47
65
|
def set_user_agent(self, user_agent: str) -> None:
|
66
|
+
"""
|
67
|
+
Sets the User-Agent header for requests.
|
68
|
+
|
69
|
+
Args:
|
70
|
+
user_agent (str): The User-Agent string to be used in HTTP requests.
|
71
|
+
"""
|
48
72
|
self.client.headers["User-Agent"] = user_agent
|
49
73
|
logger.debug(f"User-Agent client header set to: '{user_agent}'")
|
50
74
|
|
51
75
|
def set_no_script(self, no_script: bool) -> None:
|
76
|
+
"""
|
77
|
+
Sets the "noscript" cookie in the client's cookies to control JavaScript content.
|
78
|
+
|
79
|
+
Args:
|
80
|
+
no_script (bool): If True, the "noscript" cookie is set to "1". If False, it's set to "0".
|
81
|
+
"""
|
52
82
|
value = "1" if no_script else "0"
|
53
83
|
self.client.cookies.set("noscript", value)
|
54
84
|
logger.debug(f"NoScript (noscript) client cookie set to: '{value}'")
|
@@ -56,18 +86,44 @@ class Yad2Scraper:
|
|
56
86
|
def fetch_category(
|
57
87
|
self,
|
58
88
|
url: str,
|
59
|
-
category_type: Type[Category]
|
89
|
+
category_type: Type[Category],
|
60
90
|
params: Optional[QueryParamTypes] = None
|
61
91
|
) -> Category:
|
92
|
+
"""
|
93
|
+
Fetches and returns a category page from a given URL.
|
94
|
+
|
95
|
+
Args:
|
96
|
+
url (str): The URL of the category page.
|
97
|
+
category_type (Type[Category]): The class type of the category to be fetched.
|
98
|
+
params (Optional[QueryParamTypes]): Query parameters to be included in the request.
|
99
|
+
|
100
|
+
Returns:
|
101
|
+
Category: The fetched category, parsed from HTML.
|
102
|
+
"""
|
62
103
|
logger.debug(f"Fetching category from URL: '{url}'")
|
63
104
|
response = self.get(url, params)
|
64
105
|
logger.debug(f"Category fetched successfully from URL: '{url}'")
|
65
106
|
return category_type.from_html_io(response)
|
66
107
|
|
67
108
|
def get(self, url: str, params: Optional[QueryParamTypes] = None) -> httpx.Response:
|
109
|
+
"""Sends a GET request to the specified URL."""
|
68
110
|
return self.request("GET", url, params=params)
|
69
111
|
|
70
112
|
def request(self, method: str, url: str, params: Optional[QueryParamTypes] = None) -> httpx.Response:
|
113
|
+
"""
|
114
|
+
Sends an HTTP request with multiple attempts logic.
|
115
|
+
|
116
|
+
Args:
|
117
|
+
method (str): The HTTP method (e.g., "GET", "POST").
|
118
|
+
url (str): The URL to send the request to.
|
119
|
+
params (Optional[QueryParamTypes]): Query parameters to be included in the request.
|
120
|
+
|
121
|
+
Returns:
|
122
|
+
httpx.Response: The HTTP response object.
|
123
|
+
|
124
|
+
Raises:
|
125
|
+
MaxRequestAttemptsExceededError: If the request exceeds the maximum number of attempts.
|
126
|
+
"""
|
71
127
|
if not isinstance(self.max_request_attempts, int):
|
72
128
|
raise TypeError(f"max_request_attempts must be of type 'int', but got {type(self.max_request_attempts)}")
|
73
129
|
|
@@ -92,11 +148,28 @@ class Yad2Scraper:
|
|
92
148
|
raise max_attempts_error from error_list[-1] # multiple errors exist, raise from the last one
|
93
149
|
|
94
150
|
def close(self) -> None:
|
151
|
+
"""Closes the HTTP client and logs the closure."""
|
95
152
|
logger.debug("Closing scraper client")
|
96
153
|
self.client.close()
|
97
154
|
logger.info("Scraper client closed")
|
98
155
|
|
99
156
|
def _send_request(self, method: str, url: str, request_options: Dict[str, Any], attempt: int) -> httpx.Response:
|
157
|
+
"""
|
158
|
+
Sends an HTTP request with the specified method to the given URL, applying all necessary actions.
|
159
|
+
|
160
|
+
Args:
|
161
|
+
method (str): The HTTP method (e.g., 'GET', 'POST').
|
162
|
+
url (str): The target URL for the request.
|
163
|
+
request_options (Dict[str, Any]): Additional request options, including headers and parameters.
|
164
|
+
attempt (int): The current attempt number for the request.
|
165
|
+
|
166
|
+
Returns:
|
167
|
+
httpx.Response: The HTTP response object received from the server.
|
168
|
+
|
169
|
+
Raises:
|
170
|
+
AntiBotDetectedError: If the response contains Anti-Bot content.
|
171
|
+
UnexpectedContentError: If a GET response does not contain expected content.
|
172
|
+
"""
|
100
173
|
if self.randomize_user_agent:
|
101
174
|
self._set_random_user_agent(request_options)
|
102
175
|
|
@@ -105,12 +178,22 @@ class Yad2Scraper:
|
|
105
178
|
|
106
179
|
logger.info(f"Sending {method} request to URL: '{url}' {self._format_attempt_info(attempt)}")
|
107
180
|
response = self.client.request(method, url, **request_options)
|
181
|
+
self._request_count += 1
|
108
182
|
logger.debug(f"Received response {response.status_code} from '{url}' {self._format_attempt_info(attempt)}")
|
109
183
|
self._validate_response(response)
|
110
184
|
|
111
185
|
return response
|
112
186
|
|
113
187
|
def _prepare_request_options(self, params: Optional[QueryParamTypes] = None) -> Dict[str, Any]:
|
188
|
+
"""
|
189
|
+
Prepares the request options to be passed to the HTTP client's request method, based on the default options.
|
190
|
+
|
191
|
+
Args:
|
192
|
+
params (Optional[QueryParamTypes]): Optional query parameters to include in the request.
|
193
|
+
|
194
|
+
Returns:
|
195
|
+
Dict[str, Any]: A dictionary of the request options, including headers and query parameters.
|
196
|
+
"""
|
114
197
|
logger.debug("Preparing request options from defaults")
|
115
198
|
request_options = self.request_defaults.copy()
|
116
199
|
|
@@ -122,11 +205,23 @@ class Yad2Scraper:
|
|
122
205
|
|
123
206
|
@staticmethod
|
124
207
|
def _set_random_user_agent(request_options: Dict[str, str]):
|
208
|
+
"""
|
209
|
+
Sets a random User-Agent header in the request options.
|
210
|
+
|
211
|
+
Args:
|
212
|
+
request_options (Dict[str, str]): The request options to update with the random User-Agent.
|
213
|
+
"""
|
125
214
|
user_agent = fua.random
|
126
215
|
request_options.setdefault("headers", {})["User-Agent"] = user_agent
|
127
216
|
logger.debug(f"Updated request options with random User-Agent header: '{user_agent}'")
|
128
217
|
|
129
218
|
def _apply_wait_strategy(self, attempt: int):
|
219
|
+
"""
|
220
|
+
Applies a wait time before making a request based on the wait strategy for the given attempt.
|
221
|
+
|
222
|
+
Args:
|
223
|
+
attempt (int): The current attempt number to calculate the wait time.
|
224
|
+
"""
|
130
225
|
wait_time = self.wait_strategy(attempt)
|
131
226
|
if not wait_time:
|
132
227
|
return
|
@@ -136,6 +231,17 @@ class Yad2Scraper:
|
|
136
231
|
|
137
232
|
@staticmethod
|
138
233
|
def _validate_response(response: httpx.Response):
|
234
|
+
"""
|
235
|
+
Validates the response to ensure it is successful.
|
236
|
+
|
237
|
+
Args:
|
238
|
+
response (httpx.Response): The HTTP response object to validate.
|
239
|
+
|
240
|
+
Raises:
|
241
|
+
httpx.HTTPStatusError: If a status error occurred.
|
242
|
+
AntiBotDetectedError: If the response contains Anti-Bot content.
|
243
|
+
UnexpectedContentError: If a GET response does not contain expected content.
|
244
|
+
"""
|
139
245
|
response.raise_for_status()
|
140
246
|
|
141
247
|
if ANTIBOT_CONTENT_IDENTIFIER in response.content:
|
@@ -154,12 +260,35 @@ class Yad2Scraper:
|
|
154
260
|
logger.debug("Response validation succeeded")
|
155
261
|
|
156
262
|
def _format_attempt_info(self, attempt: int) -> str:
|
263
|
+
"""
|
264
|
+
Formats a string representing the current attempt number and total attempt count.
|
265
|
+
|
266
|
+
Args:
|
267
|
+
attempt (int): The current attempt number.
|
268
|
+
|
269
|
+
Returns:
|
270
|
+
str: A formatted string representing the attempt info, e.g., "(attempt 1/5)".
|
271
|
+
"""
|
157
272
|
return f"(attempt {attempt}/{self.max_request_attempts})"
|
158
273
|
|
159
274
|
def __enter__(self):
|
275
|
+
"""
|
276
|
+
Prepares the scraper to be used in a `with` statement, allowing for resource management.
|
277
|
+
|
278
|
+
Returns:
|
279
|
+
Yad2Scraper: The scraper instance to be used within the `with` block.
|
280
|
+
"""
|
160
281
|
logger.debug("Entering scraper context")
|
161
282
|
return self
|
162
283
|
|
163
284
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
285
|
+
"""
|
286
|
+
Cleans up resources and closes the scraper client when exiting the `with` statement.
|
287
|
+
|
288
|
+
Args:
|
289
|
+
exc_type: The exception type (if any).
|
290
|
+
exc_val: The exception value (if any).
|
291
|
+
exc_tb: The traceback object (if any).
|
292
|
+
"""
|
164
293
|
logger.debug("Exiting scraper context")
|
165
294
|
self.close()
|
yad2_scraper/utils.py
CHANGED
@@ -2,27 +2,30 @@ import functools
|
|
2
2
|
from bs4 import BeautifulSoup, Tag
|
3
3
|
from typing import Union, List, Tuple, Any
|
4
4
|
|
5
|
+
def any_param_specified(*params: Any) -> bool:
|
6
|
+
"""Check if any parameter is not None."""
|
7
|
+
return any(param is not None for param in params)
|
5
8
|
|
6
9
|
def join_url(url: str, path: str) -> str:
|
10
|
+
"""Join a base URL with a path, ensuring proper slashes."""
|
7
11
|
return url.rstrip("/") + "/" + path.lstrip("/")
|
8
12
|
|
9
|
-
|
10
13
|
def get_parent_url(url: str) -> str:
|
14
|
+
"""Return the parent URL by removing the last segment."""
|
11
15
|
if url.count("/") <= 2:
|
12
16
|
return url
|
13
|
-
|
14
17
|
return url.rstrip("/").rsplit("/", 1)[0]
|
15
18
|
|
16
|
-
|
17
19
|
def find_html_tag_by_class_substring(e: Union[BeautifulSoup, Tag], tag_name: str, substring: str) -> Tag:
|
20
|
+
"""Find the first HTML tag with a class containing the given substring."""
|
18
21
|
return e.find(tag_name, class_=lambda class_name: class_name and substring in class_name)
|
19
22
|
|
20
|
-
|
21
23
|
def find_all_html_tags_by_class_substring(e: Union[BeautifulSoup, Tag], tag_name: str, substring: str) -> List[Tag]:
|
24
|
+
"""Find all HTML tags with a class containing the given substring."""
|
22
25
|
return e.find_all(tag_name, class_=lambda class_name: class_name and substring in class_name)
|
23
26
|
|
24
|
-
|
25
27
|
def safe_access(exceptions: Tuple = (), default: Any = None):
|
28
|
+
"""Decorator to safely execute a function, returning a default value on exception."""
|
26
29
|
def decorator(func):
|
27
30
|
@functools.wraps(func)
|
28
31
|
def wrapper(*args, **kwargs):
|
@@ -30,7 +33,5 @@ def safe_access(exceptions: Tuple = (), default: Any = None):
|
|
30
33
|
return func(*args, **kwargs)
|
31
34
|
except exceptions:
|
32
35
|
return default
|
33
|
-
|
34
36
|
return wrapper
|
35
|
-
|
36
|
-
return decorator
|
37
|
+
return decorator
|
@@ -7,9 +7,11 @@ from yad2_scraper.vehicles.next_data import VehiclesNextData
|
|
7
7
|
|
8
8
|
class Yad2VehiclesCategory(Yad2Category):
|
9
9
|
def get_vehicle_tags(self) -> List[VehicleTag]:
|
10
|
+
"""Retrieve and return a list of vehicle tags from the current category."""
|
10
11
|
tags = self.find_all_tags_by_class_substring("div", "feedItemBox")
|
11
12
|
return [VehicleTag(tag) for tag in tags]
|
12
13
|
|
13
14
|
def load_next_data(self) -> Optional[VehiclesNextData]:
|
15
|
+
"""Extract and parse Next.js data from the current vehicle page."""
|
14
16
|
next_data = super().load_next_data()
|
15
17
|
return VehiclesNextData(next_data) if next_data else None
|
yad2_scraper/vehicles/query.py
CHANGED
@@ -13,13 +13,14 @@ class OrderVehiclesBy(int, Enum):
|
|
13
13
|
|
14
14
|
|
15
15
|
class VehiclesQueryFilters(QueryFilters):
|
16
|
+
"""Pydantic model representing query filters for querying a vehicle resource."""
|
16
17
|
year_range: Optional[NumberRange] = None
|
17
|
-
...
|
18
18
|
|
19
19
|
def to_params(self) -> dict:
|
20
|
+
"""Convert filter fields to query parameters, including 'year'."""
|
20
21
|
return {
|
21
22
|
**super().to_params(),
|
22
23
|
"year": format_number_range(self.year_range)
|
23
24
|
}
|
24
25
|
|
25
|
-
# TODO: add QueryParams class for each vehicle
|
26
|
+
# TODO: add QueryParams class for each vehicle category (some share the same attributes, sometimes with different enums)
|
yad2_scraper/vehicles/tag.py
CHANGED
yad2_scraper/vehicles/urls.py
CHANGED
@@ -5,12 +5,16 @@ from yad2_scraper.constants import BASE_URL
|
|
5
5
|
|
6
6
|
VEHICLES_URL = join_url(BASE_URL, "vehicles")
|
7
7
|
|
8
|
-
|
8
|
+
VehicleCategory = Literal["cars", "motorcycles", "scooters", "trucks", "watercraft", "others"]
|
9
9
|
|
10
|
-
|
10
|
+
_VALID_VEHICLE_CATEGORIES = get_args(VehicleCategory)
|
11
11
|
|
12
12
|
|
13
|
-
def
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
def get_vehicle_category_url(vehicle_category: VehicleCategory) -> str:
|
14
|
+
"""Generate the URL for the specified vehicle category."""
|
15
|
+
if vehicle_category not in _VALID_VEHICLE_CATEGORIES:
|
16
|
+
raise ValueError(
|
17
|
+
f"Invalid vehicle category: {repr(vehicle_category)}. Expected one of {_VALID_VEHICLE_CATEGORIES}"
|
18
|
+
)
|
19
|
+
|
20
|
+
return join_url(VEHICLES_URL, vehicle_category)
|
@@ -0,0 +1,164 @@
|
|
1
|
+
Metadata-Version: 2.3
|
2
|
+
Name: yad2-scraper
|
3
|
+
Version: 0.5.1
|
4
|
+
Summary: Scrape Yad2 in Python.
|
5
|
+
License: LICENSE
|
6
|
+
Author: dav ost
|
7
|
+
Author-email: davidost2003@gmail.com
|
8
|
+
Requires-Python: >=3.8
|
9
|
+
Classifier: License :: Other/Proprietary License
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
17
|
+
Requires-Dist: beautifulsoup4 (>=4.11.1,<5.0.0)
|
18
|
+
Requires-Dist: fake-useragent (>=0.1.11,<0.2.0)
|
19
|
+
Requires-Dist: httpcore (>=0.15.0)
|
20
|
+
Requires-Dist: httpx (>=0.24.0,<0.25.0)
|
21
|
+
Requires-Dist: pydantic (>=1.10.0,<2.0.0)
|
22
|
+
Description-Content-Type: text/markdown
|
23
|
+
|
24
|
+
# Yad2 Scraper
|
25
|
+
|
26
|
+
A Python package for scraping listings from [Yad2](https://www.yad2.co.il/), Israel's leading classifieds platform.
|
27
|
+
This package provides a simple and flexible interface to fetch data, filter results, and extract relevant information.
|
28
|
+
|
29
|
+
__NOTE__: Currently, the package primarily supports the **vehicles category**.
|
30
|
+
Support for additional categories may be added in future updates.
|
31
|
+
|
32
|
+
---
|
33
|
+
|
34
|
+
## Features
|
35
|
+
|
36
|
+
- **Fetch Listings**: Retrieve listings by category (e.g., vehicles, real-estate, etc.).
|
37
|
+
- **Filter Results**: Apply filters such as price range, year range, and sorting order.
|
38
|
+
- **Dynamic URL Generation**: Generate URLs for specific categories and filters.
|
39
|
+
- **Type-Safe API**: Uses Python type hints (`Literal`, `Optional`, etc.) for better code clarity and safety.
|
40
|
+
- **Extensible**: Easily extendable to support additional categories and filters.
|
41
|
+
|
42
|
+
---
|
43
|
+
|
44
|
+
## Installation
|
45
|
+
|
46
|
+
Install the package using `pip`:
|
47
|
+
|
48
|
+
```bash
|
49
|
+
pip install yad2-scraper
|
50
|
+
```
|
51
|
+
|
52
|
+
## Usage
|
53
|
+
|
54
|
+
### Fetching Category Listings
|
55
|
+
|
56
|
+
To fetch any category, use the `fetch_category` function:
|
57
|
+
|
58
|
+
```python
|
59
|
+
from yad2_scraper import fetch_category, Yad2Category
|
60
|
+
|
61
|
+
# Fetch real estate category (returns a generic Yad2Category object)
|
62
|
+
real_estate_category_page1 = fetch_category("https://www.yad2.co.il/realestate/forsale", page=1)
|
63
|
+
...
|
64
|
+
real_estate_category_page2 = fetch_category("https://www.yad2.co.il/realestate/forsale", page=2)
|
65
|
+
...
|
66
|
+
```
|
67
|
+
|
68
|
+
### Fetching Vehicle Listings
|
69
|
+
|
70
|
+
To fetch vehicle listings for a specific category, use the `fetch_vehicle_category` function:
|
71
|
+
|
72
|
+
```python
|
73
|
+
from yad2_scraper import fetch_vehicle_category, OrderVehiclesBy, Field
|
74
|
+
|
75
|
+
# Fetch cars category
|
76
|
+
cars_category = fetch_vehicle_category("cars")
|
77
|
+
|
78
|
+
for car_data in cars_category.load_next_data().iterate_vehicles():
|
79
|
+
print(car_data.model(Field.ENGLISH_TEXT))
|
80
|
+
print(car_data.test_date)
|
81
|
+
print(car_data.price)
|
82
|
+
...
|
83
|
+
|
84
|
+
# Fetch motorcycles category
|
85
|
+
motorcycle_categories = fetch_vehicle_category(
|
86
|
+
"motorcycles",
|
87
|
+
price_range=(5000, 15000),
|
88
|
+
year_range=(2010, 2020),
|
89
|
+
order_by=OrderVehiclesBy.PRICE_LOWEST_TO_HIGHEST
|
90
|
+
)
|
91
|
+
|
92
|
+
for motorcycle_tag in motorcycle_categories.get_vehicle_tags():
|
93
|
+
print(motorcycle_tag.page_link)
|
94
|
+
print(motorcycle_tag.hand)
|
95
|
+
print(motorcycle_tag.price)
|
96
|
+
...
|
97
|
+
```
|
98
|
+
|
99
|
+
### The Scraper Object
|
100
|
+
|
101
|
+
The `Yad2Scraper` class is the core of the package.
|
102
|
+
It handles HTTP requests, parses responses, and provides methods to fetch and filter vehicle listings.
|
103
|
+
|
104
|
+
#### Creating a Scraper Instance
|
105
|
+
|
106
|
+
You can create a `Yad2Scraper` instance manually or use the default scraper provided by the package:
|
107
|
+
|
108
|
+
```python
|
109
|
+
from yad2_scraper import Yad2Scraper, get_default_scraper
|
110
|
+
|
111
|
+
# Create a custom scraper instance
|
112
|
+
scraper = Yad2Scraper()
|
113
|
+
|
114
|
+
# Use the default scraper
|
115
|
+
default_scraper = get_default_scraper()
|
116
|
+
```
|
117
|
+
|
118
|
+
#### Fetching Category Listings
|
119
|
+
|
120
|
+
The `fetch_category` method is used to fetch listings for a specific category.
|
121
|
+
It takes a URL, a `Category` type, and optionally query params as arguments:
|
122
|
+
|
123
|
+
```python
|
124
|
+
from yad2_scraper import Yad2Scraper, Yad2Category, QueryFilters, OrderBy
|
125
|
+
from yad2_scraper.vehicles import (
|
126
|
+
Yad2VehiclesCategory,
|
127
|
+
VehiclesQueryFilters,
|
128
|
+
OrderVehiclesBy,
|
129
|
+
get_vehicle_category_url
|
130
|
+
)
|
131
|
+
|
132
|
+
# Fetch businesses for sale category with filters
|
133
|
+
scraper = Yad2Scraper()
|
134
|
+
url = "https://www.yad2.co.il/products/businesses-for-sale"
|
135
|
+
query_filters = QueryFilters(price_range=(10000, 250000), order_by=OrderBy.PRICE_LOWEST_TO_HIGHEST)
|
136
|
+
businesses_category = scraper.fetch_category(url, Yad2Category, params=query_filters)
|
137
|
+
|
138
|
+
# Fetch watercraft (vehicle) category with filters
|
139
|
+
url = get_vehicle_category_url("watercraft")
|
140
|
+
query_filters = VehiclesQueryFilters(year_range=(2010, 2020), order_by=OrderVehiclesBy.DATE)
|
141
|
+
watercraft_category = scraper.fetch_category(url, Yad2VehiclesCategory, params=query_filters)
|
142
|
+
```
|
143
|
+
|
144
|
+
#### Attributes & Methods
|
145
|
+
|
146
|
+
The `Yad2Scraper` object exposes many additional attributes and methods beyond those shown here.
|
147
|
+
Please check out the actual code documentation for more details.
|
148
|
+
|
149
|
+
## Contributing
|
150
|
+
|
151
|
+
Contributions are welcome! Here’s how you can get started:
|
152
|
+
|
153
|
+
1. Fork the repository.
|
154
|
+
2. Create a new branch for your feature or bugfix.
|
155
|
+
3. Write tests for your changes.
|
156
|
+
4. Submit a pull request.
|
157
|
+
|
158
|
+
## License
|
159
|
+
|
160
|
+
This project is licensed under the MIT License. See the LICENSE file for details.
|
161
|
+
|
162
|
+
## Support
|
163
|
+
|
164
|
+
For questions, issues, or feature requests, please open an issue on the GitHub repository.
|
@@ -0,0 +1,18 @@
|
|
1
|
+
yad2_scraper/__init__.py,sha256=oLANQo7jrtR5ex1tv4sM5ppaW9JpHS70Knsp0ZgVzm0,3708
|
2
|
+
yad2_scraper/category.py,sha256=SQ2eg0-fQ9hEaNryYpWVFaJqCx1d65t2_E_S3qpuw9g,1230
|
3
|
+
yad2_scraper/constants.py,sha256=8zXJ31fRqkDIOJp96BRK1PJofGXX8SG64YcfmJnVW8Q,910
|
4
|
+
yad2_scraper/exceptions.py,sha256=5yentEUBuEGItwRcjtZY89A19rvFErcTy4S4GUtY_WY,1526
|
5
|
+
yad2_scraper/next_data.py,sha256=OcZ7ingXSd6sLNkqQPz6NVTeEDbMkOai9QONFErc3FI,1977
|
6
|
+
yad2_scraper/query.py,sha256=HPBoLE6xFjsmvBFR2ULvPq96XXl-2zOqXt7LnHgetIk,1438
|
7
|
+
yad2_scraper/scraper.py,sha256=VA-P24Gvn1y5Pkn_n3hDdpVl1aeEnLoC82eBYteAbWQ,11816
|
8
|
+
yad2_scraper/utils.py,sha256=UDpFKel_TJa0dJv1FV-CVqA8-uaFo_hDcooiFAkSZI8,1578
|
9
|
+
yad2_scraper/vehicles/__init__.py,sha256=dxjZcNv3ExnN3fKW-m1oqKiX9YC7gj8lqpIa3uWo9iI,242
|
10
|
+
yad2_scraper/vehicles/category.py,sha256=HdUGCVpC1jw2V-2XvyAC4pPlVQR6cwHyVKDxS3pfQhc,744
|
11
|
+
yad2_scraper/vehicles/next_data.py,sha256=lEIWcTP7BOFDC3lL0FhBGp6u-7hsgGdbbrH0iw0Ux20,9203
|
12
|
+
yad2_scraper/vehicles/query.py,sha256=ieIJSGJELcgzqtJh6bQXalvDg743LnI2RYrAyHDIH80,912
|
13
|
+
yad2_scraper/vehicles/tag.py,sha256=Wj7v2c8IPQLYHVkfzP1UiulKKJE4yLqnbeh81nvWZhU,2052
|
14
|
+
yad2_scraper/vehicles/urls.py,sha256=zxipWjm0SXn2gGOBWw9VqKAJ59mhIGpzd_fTYitpW8c,715
|
15
|
+
yad2_scraper-0.5.1.dist-info/LICENSE,sha256=JCpnDxMx2kE40e0UQ1svSmifrLWg2Gni5VTkJR68thY,1065
|
16
|
+
yad2_scraper-0.5.1.dist-info/METADATA,sha256=SLeA6BPi1idJ20WWWbl7AW-hC_u1_vKPRmUTg4_VhVI,5225
|
17
|
+
yad2_scraper-0.5.1.dist-info/WHEEL,sha256=7dDg4QLnNKTvwIDR9Ac8jJaAmBC_owJrckbC0jjThyA,88
|
18
|
+
yad2_scraper-0.5.1.dist-info/RECORD,,
|
@@ -1,26 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.3
|
2
|
-
Name: yad2-scraper
|
3
|
-
Version: 0.5.0
|
4
|
-
Summary: Scrape Yad2 in Python.
|
5
|
-
License: LICENSE
|
6
|
-
Author: dav ost
|
7
|
-
Author-email: davidost2003@gmail.com
|
8
|
-
Requires-Python: >=3.8
|
9
|
-
Classifier: License :: Other/Proprietary License
|
10
|
-
Classifier: Programming Language :: Python :: 3
|
11
|
-
Classifier: Programming Language :: Python :: 3.8
|
12
|
-
Classifier: Programming Language :: Python :: 3.9
|
13
|
-
Classifier: Programming Language :: Python :: 3.10
|
14
|
-
Classifier: Programming Language :: Python :: 3.11
|
15
|
-
Classifier: Programming Language :: Python :: 3.12
|
16
|
-
Classifier: Programming Language :: Python :: 3.13
|
17
|
-
Requires-Dist: beautifulsoup4 (>=4.11.1,<5.0.0)
|
18
|
-
Requires-Dist: fake-useragent (>=0.1.11,<0.2.0)
|
19
|
-
Requires-Dist: httpcore (>=0.15.0)
|
20
|
-
Requires-Dist: httpx (>=0.24.0,<0.25.0)
|
21
|
-
Requires-Dist: pydantic (>=1.10.0,<2.0.0)
|
22
|
-
Description-Content-Type: text/markdown
|
23
|
-
|
24
|
-
# yad2-scraper
|
25
|
-
Scrape Yad2 in Python.
|
26
|
-
|
@@ -1,18 +0,0 @@
|
|
1
|
-
yad2_scraper/__init__.py,sha256=UUiIk6TAHTAP4IY86bIR4TcY3VVMTCyEF0Sq1MSneMM,141
|
2
|
-
yad2_scraper/category.py,sha256=KXLyjMOlPzu3xj08-uRmffAMD83DbqFVm-y1-T83Djw,910
|
3
|
-
yad2_scraper/constants.py,sha256=8zXJ31fRqkDIOJp96BRK1PJofGXX8SG64YcfmJnVW8Q,910
|
4
|
-
yad2_scraper/exceptions.py,sha256=_IcuDdJPKAznSUp_c3fLEuTnDdKf0NLJqpRPs0IzdXw,979
|
5
|
-
yad2_scraper/next_data.py,sha256=fOatioaBxR7LZgRnXp35CoOkR7-Adv6rW_YKBQpSYj8,1585
|
6
|
-
yad2_scraper/query.py,sha256=nURdupTnMbxgglJz7tdWSqnp4UG61nwWM1vjWQaylWE,1196
|
7
|
-
yad2_scraper/scraper.py,sha256=sgDpfnKlBSDIWEb2enpQ5O9E5fJvXz3cDOnGXHGCJL4,6653
|
8
|
-
yad2_scraper/utils.py,sha256=y6ErH2HcoCJn7OreNj4lvW--iOA7dv1LUIPa537GVjg,1070
|
9
|
-
yad2_scraper/vehicles/__init__.py,sha256=4-4vVFu836nLzaTf1KTlddrjSk7dX3Nu9hm3cj1EKIU,229
|
10
|
-
yad2_scraper/vehicles/category.py,sha256=BrH-aZY6hNlHtSqBmleifb7yY5R-76J2GAj9Bfd0Ulw,584
|
11
|
-
yad2_scraper/vehicles/next_data.py,sha256=0xUbEwmj8CsWc0uqoW9hbM4FW26e4IWBiv-UcraSwrw,9125
|
12
|
-
yad2_scraper/vehicles/query.py,sha256=VhL-E-sgpLxenZVvNgdCNWY15hMtoP0Oyv6SH_N3e04,757
|
13
|
-
yad2_scraper/vehicles/tag.py,sha256=YTeCfVnaPnHz9CYRnfcQljEbNqynBDdlbX0HNPiB-XY,1960
|
14
|
-
yad2_scraper/vehicles/urls.py,sha256=-aEtV_1elqHFdLIxBZglY0e0-UHGqQab5Rh5qKUyBtg,573
|
15
|
-
yad2_scraper-0.5.0.dist-info/LICENSE,sha256=JCpnDxMx2kE40e0UQ1svSmifrLWg2Gni5VTkJR68thY,1065
|
16
|
-
yad2_scraper-0.5.0.dist-info/METADATA,sha256=00MRqHUY9r2qiSRFgIZPCwli-aZgc_FJS2c0lshmKdY,875
|
17
|
-
yad2_scraper-0.5.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
18
|
-
yad2_scraper-0.5.0.dist-info/RECORD,,
|
File without changes
|