yad2-scraper 0.4.0__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yad2_scraper-0.5.1/PKG-INFO +164 -0
- yad2_scraper-0.5.1/README.md +141 -0
- {yad2_scraper-0.4.0 → yad2_scraper-0.5.1}/pyproject.toml +2 -2
- yad2_scraper-0.5.1/yad2_scraper/__init__.py +99 -0
- {yad2_scraper-0.4.0 → yad2_scraper-0.5.1}/yad2_scraper/category.py +6 -0
- {yad2_scraper-0.4.0 → yad2_scraper-0.5.1}/yad2_scraper/exceptions.py +11 -0
- yad2_scraper-0.5.1/yad2_scraper/next_data.py +59 -0
- {yad2_scraper-0.4.0 → yad2_scraper-0.5.1}/yad2_scraper/query.py +7 -5
- {yad2_scraper-0.4.0 → yad2_scraper-0.5.1}/yad2_scraper/scraper.py +130 -1
- yad2_scraper-0.5.1/yad2_scraper/utils.py +37 -0
- yad2_scraper-0.5.1/yad2_scraper/vehicles/__init__.py +5 -0
- yad2_scraper-0.5.1/yad2_scraper/vehicles/category.py +17 -0
- yad2_scraper-0.5.1/yad2_scraper/vehicles/next_data.py +320 -0
- yad2_scraper-0.5.1/yad2_scraper/vehicles/query.py +26 -0
- yad2_scraper-0.5.1/yad2_scraper/vehicles/tag.py +65 -0
- yad2_scraper-0.5.1/yad2_scraper/vehicles/urls.py +20 -0
- yad2_scraper-0.4.0/PKG-INFO +0 -27
- yad2_scraper-0.4.0/README.md +0 -2
- yad2_scraper-0.4.0/yad2_scraper/__init__.py +0 -4
- yad2_scraper-0.4.0/yad2_scraper/next_data.py +0 -27
- yad2_scraper-0.4.0/yad2_scraper/utils.py +0 -21
- {yad2_scraper-0.4.0 → yad2_scraper-0.5.1}/LICENSE +0 -0
- {yad2_scraper-0.4.0 → yad2_scraper-0.5.1}/yad2_scraper/constants.py +0 -0
@@ -0,0 +1,164 @@
|
|
1
|
+
Metadata-Version: 2.3
|
2
|
+
Name: yad2-scraper
|
3
|
+
Version: 0.5.1
|
4
|
+
Summary: Scrape Yad2 in Python.
|
5
|
+
License: LICENSE
|
6
|
+
Author: dav ost
|
7
|
+
Author-email: davidost2003@gmail.com
|
8
|
+
Requires-Python: >=3.8
|
9
|
+
Classifier: License :: Other/Proprietary License
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
17
|
+
Requires-Dist: beautifulsoup4 (>=4.11.1,<5.0.0)
|
18
|
+
Requires-Dist: fake-useragent (>=0.1.11,<0.2.0)
|
19
|
+
Requires-Dist: httpcore (>=0.15.0)
|
20
|
+
Requires-Dist: httpx (>=0.24.0,<0.25.0)
|
21
|
+
Requires-Dist: pydantic (>=1.10.0,<2.0.0)
|
22
|
+
Description-Content-Type: text/markdown
|
23
|
+
|
24
|
+
# Yad2 Scraper
|
25
|
+
|
26
|
+
A Python package for scraping listings from [Yad2](https://www.yad2.co.il/), Israel's leading classifieds platform.
|
27
|
+
This package provides a simple and flexible interface to fetch data, filter results, and extract relevant information.
|
28
|
+
|
29
|
+
__NOTE__: Currently, the package primarily supports the **vehicles category**.
|
30
|
+
Support for additional categories may be added in future updates.
|
31
|
+
|
32
|
+
---
|
33
|
+
|
34
|
+
## Features
|
35
|
+
|
36
|
+
- **Fetch Listings**: Retrieve listings by category (e.g., vehicles, real-estate, etc.).
|
37
|
+
- **Filter Results**: Apply filters such as price range, year range, and sorting order.
|
38
|
+
- **Dynamic URL Generation**: Generate URLs for specific categories and filters.
|
39
|
+
- **Type-Safe API**: Uses Python type hints (`Literal`, `Optional`, etc.) for better code clarity and safety.
|
40
|
+
- **Extensible**: Easily extendable to support additional categories and filters.
|
41
|
+
|
42
|
+
---
|
43
|
+
|
44
|
+
## Installation
|
45
|
+
|
46
|
+
Install the package using `pip`:
|
47
|
+
|
48
|
+
```bash
|
49
|
+
pip install yad2-scraper
|
50
|
+
```
|
51
|
+
|
52
|
+
## Usage
|
53
|
+
|
54
|
+
### Fetching Category Listings
|
55
|
+
|
56
|
+
To fetch any category, use the `fetch_category` function:
|
57
|
+
|
58
|
+
```python
|
59
|
+
from yad2_scraper import fetch_category, Yad2Category
|
60
|
+
|
61
|
+
# Fetch real estate category (returns a generic Yad2Category object)
|
62
|
+
real_estate_category_page1 = fetch_category("https://www.yad2.co.il/realestate/forsale", page=1)
|
63
|
+
...
|
64
|
+
real_estate_category_page2 = fetch_category("https://www.yad2.co.il/realestate/forsale", page=2)
|
65
|
+
...
|
66
|
+
```
|
67
|
+
|
68
|
+
### Fetching Vehicle Listings
|
69
|
+
|
70
|
+
To fetch vehicle listings for a specific category, use the `fetch_vehicle_category` function:
|
71
|
+
|
72
|
+
```python
|
73
|
+
from yad2_scraper import fetch_vehicle_category, OrderVehiclesBy, Field
|
74
|
+
|
75
|
+
# Fetch cars category
|
76
|
+
cars_category = fetch_vehicle_category("cars")
|
77
|
+
|
78
|
+
for car_data in cars_category.load_next_data().iterate_vehicles():
|
79
|
+
print(car_data.model(Field.ENGLISH_TEXT))
|
80
|
+
print(car_data.test_date)
|
81
|
+
print(car_data.price)
|
82
|
+
...
|
83
|
+
|
84
|
+
# Fetch motorcycles category
|
85
|
+
motorcycle_categories = fetch_vehicle_category(
|
86
|
+
"motorcycles",
|
87
|
+
price_range=(5000, 15000),
|
88
|
+
year_range=(2010, 2020),
|
89
|
+
order_by=OrderVehiclesBy.PRICE_LOWEST_TO_HIGHEST
|
90
|
+
)
|
91
|
+
|
92
|
+
for motorcycle_tag in motorcycle_categories.get_vehicle_tags():
|
93
|
+
print(motorcycle_tag.page_link)
|
94
|
+
print(motorcycle_tag.hand)
|
95
|
+
print(motorcycle_tag.price)
|
96
|
+
...
|
97
|
+
```
|
98
|
+
|
99
|
+
### The Scraper Object
|
100
|
+
|
101
|
+
The `Yad2Scraper` class is the core of the package.
|
102
|
+
It handles HTTP requests, parses responses, and provides methods to fetch and filter vehicle listings.
|
103
|
+
|
104
|
+
#### Creating a Scraper Instance
|
105
|
+
|
106
|
+
You can create a `Yad2Scraper` instance manually or use the default scraper provided by the package:
|
107
|
+
|
108
|
+
```python
|
109
|
+
from yad2_scraper import Yad2Scraper, get_default_scraper
|
110
|
+
|
111
|
+
# Create a custom scraper instance
|
112
|
+
scraper = Yad2Scraper()
|
113
|
+
|
114
|
+
# Use the default scraper
|
115
|
+
default_scraper = get_default_scraper()
|
116
|
+
```
|
117
|
+
|
118
|
+
#### Fetching Category Listings
|
119
|
+
|
120
|
+
The `fetch_category` method is used to fetch listings for a specific category.
|
121
|
+
It takes a URL, a `Category` type, and optionally query params as arguments:
|
122
|
+
|
123
|
+
```python
|
124
|
+
from yad2_scraper import Yad2Scraper, Yad2Category, QueryFilters, OrderBy
|
125
|
+
from yad2_scraper.vehicles import (
|
126
|
+
Yad2VehiclesCategory,
|
127
|
+
VehiclesQueryFilters,
|
128
|
+
OrderVehiclesBy,
|
129
|
+
get_vehicle_category_url
|
130
|
+
)
|
131
|
+
|
132
|
+
# Fetch businesses for sale category with filters
|
133
|
+
scraper = Yad2Scraper()
|
134
|
+
url = "https://www.yad2.co.il/products/businesses-for-sale"
|
135
|
+
query_filters = QueryFilters(price_range=(10000, 250000), order_by=OrderBy.PRICE_LOWEST_TO_HIGHEST)
|
136
|
+
businesses_category = scraper.fetch_category(url, Yad2Category, params=query_filters)
|
137
|
+
|
138
|
+
# Fetch watercraft (vehicle) category with filters
|
139
|
+
url = get_vehicle_category_url("watercraft")
|
140
|
+
query_filters = VehiclesQueryFilters(year_range=(2010, 2020), order_by=OrderVehiclesBy.DATE)
|
141
|
+
watercraft_category = scraper.fetch_category(url, Yad2VehiclesCategory, params=query_filters)
|
142
|
+
```
|
143
|
+
|
144
|
+
#### Attributes & Methods
|
145
|
+
|
146
|
+
The `Yad2Scraper` object contains a lot of additional attributes & methods which you can use.
|
147
|
+
Please check out the actual code documentation for more details.
|
148
|
+
|
149
|
+
## Contributing
|
150
|
+
|
151
|
+
Contributions are welcome! Here’s how you can get started:
|
152
|
+
|
153
|
+
1. Fork the repository.
|
154
|
+
2. Create a new branch for your feature or bugfix.
|
155
|
+
3. Write tests for your changes.
|
156
|
+
4. Submit a pull request.
|
157
|
+
|
158
|
+
## License
|
159
|
+
|
160
|
+
This project is licensed under the MIT License. See the LICENSE file for details.
|
161
|
+
|
162
|
+
## Support
|
163
|
+
|
164
|
+
For questions, issues, or feature requests, please open an issue on the GitHub repository.
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# Yad2 Scraper
|
2
|
+
|
3
|
+
A Python package for scraping listings from [Yad2](https://www.yad2.co.il/), Israel's leading classifieds platform.
|
4
|
+
This package provides a simple and flexible interface to fetch data, filter results, and extract relevant information.
|
5
|
+
|
6
|
+
__NOTE__: Currently, the package primarily supports the **vehicles category**.
|
7
|
+
Support for additional categories may be added in future updates.
|
8
|
+
|
9
|
+
---
|
10
|
+
|
11
|
+
## Features
|
12
|
+
|
13
|
+
- **Fetch Listings**: Retrieve listings by category (e.g., vehicles, real-estate, etc.).
|
14
|
+
- **Filter Results**: Apply filters such as price range, year range, and sorting order.
|
15
|
+
- **Dynamic URL Generation**: Generate URLs for specific categories and filters.
|
16
|
+
- **Type-Safe API**: Uses Python type hints (`Literal`, `Optional`, etc.) for better code clarity and safety.
|
17
|
+
- **Extensible**: Easily extendable to support additional categories and filters.
|
18
|
+
|
19
|
+
---
|
20
|
+
|
21
|
+
## Installation
|
22
|
+
|
23
|
+
Install the package using `pip`:
|
24
|
+
|
25
|
+
```bash
|
26
|
+
pip install yad2-scraper
|
27
|
+
```
|
28
|
+
|
29
|
+
## Usage
|
30
|
+
|
31
|
+
### Fetching Category Listings
|
32
|
+
|
33
|
+
To fetch any category, use the `fetch_category` function:
|
34
|
+
|
35
|
+
```python
|
36
|
+
from yad2_scraper import fetch_category, Yad2Category
|
37
|
+
|
38
|
+
# Fetch real estate category (returns a generic Yad2Category object)
|
39
|
+
real_estate_category_page1 = fetch_category("https://www.yad2.co.il/realestate/forsale", page=1)
|
40
|
+
...
|
41
|
+
real_estate_category_page2 = fetch_category("https://www.yad2.co.il/realestate/forsale", page=2)
|
42
|
+
...
|
43
|
+
```
|
44
|
+
|
45
|
+
### Fetching Vehicle Listings
|
46
|
+
|
47
|
+
To fetch vehicle listings for a specific category, use the `fetch_vehicle_category` function:
|
48
|
+
|
49
|
+
```python
|
50
|
+
from yad2_scraper import fetch_vehicle_category, OrderVehiclesBy, Field
|
51
|
+
|
52
|
+
# Fetch cars category
|
53
|
+
cars_category = fetch_vehicle_category("cars")
|
54
|
+
|
55
|
+
for car_data in cars_category.load_next_data().iterate_vehicles():
|
56
|
+
print(car_data.model(Field.ENGLISH_TEXT))
|
57
|
+
print(car_data.test_date)
|
58
|
+
print(car_data.price)
|
59
|
+
...
|
60
|
+
|
61
|
+
# Fetch motorcycles category
|
62
|
+
motorcycle_categories = fetch_vehicle_category(
|
63
|
+
"motorcycles",
|
64
|
+
price_range=(5000, 15000),
|
65
|
+
year_range=(2010, 2020),
|
66
|
+
order_by=OrderVehiclesBy.PRICE_LOWEST_TO_HIGHEST
|
67
|
+
)
|
68
|
+
|
69
|
+
for motorcycle_tag in motorcycle_categories.get_vehicle_tags():
|
70
|
+
print(motorcycle_tag.page_link)
|
71
|
+
print(motorcycle_tag.hand)
|
72
|
+
print(motorcycle_tag.price)
|
73
|
+
...
|
74
|
+
```
|
75
|
+
|
76
|
+
### The Scraper Object
|
77
|
+
|
78
|
+
The `Yad2Scraper` class is the core of the package.
|
79
|
+
It handles HTTP requests, parses responses, and provides methods to fetch and filter vehicle listings.
|
80
|
+
|
81
|
+
#### Creating a Scraper Instance
|
82
|
+
|
83
|
+
You can create a `Yad2Scraper` instance manually or use the default scraper provided by the package:
|
84
|
+
|
85
|
+
```python
|
86
|
+
from yad2_scraper import Yad2Scraper, get_default_scraper
|
87
|
+
|
88
|
+
# Create a custom scraper instance
|
89
|
+
scraper = Yad2Scraper()
|
90
|
+
|
91
|
+
# Use the default scraper
|
92
|
+
default_scraper = get_default_scraper()
|
93
|
+
```
|
94
|
+
|
95
|
+
#### Fetching Category Listings
|
96
|
+
|
97
|
+
The `fetch_category` method is used to fetch listings for a specific category.
|
98
|
+
It takes a URL, a `Category` type, and optionally query params as arguments:
|
99
|
+
|
100
|
+
```python
|
101
|
+
from yad2_scraper import Yad2Scraper, Yad2Category, QueryFilters, OrderBy
|
102
|
+
from yad2_scraper.vehicles import (
|
103
|
+
Yad2VehiclesCategory,
|
104
|
+
VehiclesQueryFilters,
|
105
|
+
OrderVehiclesBy,
|
106
|
+
get_vehicle_category_url
|
107
|
+
)
|
108
|
+
|
109
|
+
# Fetch businesses for sale category with filters
|
110
|
+
scraper = Yad2Scraper()
|
111
|
+
url = "https://www.yad2.co.il/products/businesses-for-sale"
|
112
|
+
query_filters = QueryFilters(price_range=(10000, 250000), order_by=OrderBy.PRICE_LOWEST_TO_HIGHEST)
|
113
|
+
businesses_category = scraper.fetch_category(url, Yad2Category, params=query_filters)
|
114
|
+
|
115
|
+
# Fetch watercraft (vehicle) category with filters
|
116
|
+
url = get_vehicle_category_url("watercraft")
|
117
|
+
query_filters = VehiclesQueryFilters(year_range=(2010, 2020), order_by=OrderVehiclesBy.DATE)
|
118
|
+
watercraft_category = scraper.fetch_category(url, Yad2VehiclesCategory, params=query_filters)
|
119
|
+
```
|
120
|
+
|
121
|
+
#### Attributes & Methods
|
122
|
+
|
123
|
+
The `Yad2Scraper` object contains a lot of additional attributes & methods which you can use.
|
124
|
+
Please check out the actual code documentation for more details.
|
125
|
+
|
126
|
+
## Contributing
|
127
|
+
|
128
|
+
Contributions are welcome! Here’s how you can get started:
|
129
|
+
|
130
|
+
1. Fork the repository.
|
131
|
+
2. Create a new branch for your feature or bugfix.
|
132
|
+
3. Write tests for your changes.
|
133
|
+
4. Submit a pull request.
|
134
|
+
|
135
|
+
## License
|
136
|
+
|
137
|
+
This project is licensed under the MIT License. See the LICENSE file for details.
|
138
|
+
|
139
|
+
## Support
|
140
|
+
|
141
|
+
For questions, issues, or feature requests, please open an issue on the GitHub repository.
|
@@ -1,13 +1,13 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "yad2-scraper"
|
3
|
-
version = "0.4.0"
|
3
|
+
version = "0.5.1"
|
4
4
|
description = "Scrape Yad2 in Python."
|
5
5
|
authors = ["dav ost <davidost2003@gmail.com>"]
|
6
6
|
license = "LICENSE"
|
7
7
|
readme = "README.md"
|
8
8
|
|
9
9
|
[tool.poetry.dependencies]
|
10
|
-
python = ">=3.
|
10
|
+
python = ">=3.8"
|
11
11
|
httpx = "^0.24.0"
|
12
12
|
httpcore = ">=0.15.0"
|
13
13
|
fake-useragent = "^0.1.11"
|
@@ -0,0 +1,99 @@
|
|
1
|
+
from typing import Optional, Type
|
2
|
+
|
3
|
+
from .scraper import Yad2Scraper, Category
|
4
|
+
from .query import QueryFilters, OrderBy, NumberRange
|
5
|
+
from .category import Yad2Category
|
6
|
+
from .next_data import NextData, Field
|
7
|
+
from .utils import any_param_specified
|
8
|
+
from .vehicles import (
|
9
|
+
Yad2VehiclesCategory,
|
10
|
+
VehiclesQueryFilters,
|
11
|
+
OrderVehiclesBy,
|
12
|
+
VehicleCategory,
|
13
|
+
get_vehicle_category_url
|
14
|
+
)
|
15
|
+
|
16
|
+
# Module-level slot for the shared scraper; stays None until get_default_scraper() fills it.
_default_scraper = None
|
17
|
+
|
18
|
+
|
19
|
+
def get_default_scraper() -> Yad2Scraper:
    """
    Retrieves the default instance of the Yad2Scraper. If an instance does not already exist, it will be created.

    Returns:
        Yad2Scraper: The default instance of the Yad2Scraper.

    Notes:
        The default scraper is a singleton instance that is reused across multiple calls.
    """
    global _default_scraper

    # Compare against None explicitly: a truthiness test would rebuild the
    # scraper on every call if an instance ever evaluated as falsy.
    if _default_scraper is None:
        _default_scraper = Yad2Scraper()

    return _default_scraper
|
35
|
+
|
36
|
+
|
37
|
+
def fetch_category(
        url: str,
        category_type: Type[Category] = Yad2Category,
        page: Optional[int] = None,
        order_by: Optional[OrderBy] = None,
        price_range: Optional[NumberRange] = None
) -> Category:
    """
    Fetches a specific category from the given URL, while applying optional filters.

    Args:
        url (str): The URL of the category to fetch.
        category_type (Type[Category], optional): The type of category to return (default is `Yad2Category`).
        page (Optional[int], optional): The page number for pagination (default is None).
        order_by (Optional[OrderBy], optional): The sorting order for the results (default is None).
        price_range (Optional[NumberRange], optional): The price range filter for the results,
            given as a single `NumberRange` value (default is None).

    Returns:
        Category: Whatever the default scraper's `fetch_category` returns for `category_type`.

    Notes:
        This method uses the default scraper to retrieve the category.
    """
    # Build a filters object only when `any_param_specified` reports that a
    # filter was supplied; otherwise the scraper is called with params=None.
    if any_param_specified(page, order_by, price_range):
        params = QueryFilters(page=page, order_by=order_by, price_range=price_range)
    else:
        params = None

    default_scraper = get_default_scraper()
    return default_scraper.fetch_category(url, category_type, params=params)
|
67
|
+
|
68
|
+
|
69
|
+
def fetch_vehicle_category(
        vehicle_category: VehicleCategory,
        page: Optional[int] = None,
        order_by: Optional[OrderVehiclesBy] = None,
        price_range: Optional[NumberRange] = None,
        year_range: Optional[NumberRange] = None
) -> Yad2VehiclesCategory:
    """
    Fetches a specific vehicle category, while applying optional filters.

    Args:
        vehicle_category (VehicleCategory): The vehicle category to fetch.
        page (Optional[int], optional): The page number for pagination (default is None).
        order_by (Optional[OrderVehiclesBy], optional): The sorting order for the results (default is None).
        price_range (Optional[NumberRange], optional): The price range filter for the results,
            given as a single `NumberRange` value (default is None).
        year_range (Optional[NumberRange], optional): The year range filter for the results,
            given as a single `NumberRange` value (default is None).

    Returns:
        Yad2VehiclesCategory: Whatever the default scraper's `fetch_category` returns when asked
            for a `Yad2VehiclesCategory` at the category's URL.

    Notes:
        This method uses the default scraper to fetch the vehicle category.
    """
    # Build a filters object only when `any_param_specified` reports that a
    # filter was supplied; otherwise the scraper is called with params=None.
    if any_param_specified(page, order_by, price_range, year_range):
        params = VehiclesQueryFilters(page=page, order_by=order_by, price_range=price_range, year_range=year_range)
    else:
        params = None

    # The category name is translated to its URL by get_vehicle_category_url.
    url = get_vehicle_category_url(vehicle_category)
    default_scraper = get_default_scraper()
    return default_scraper.fetch_category(url, Yad2VehiclesCategory, params=params)
|
@@ -8,18 +8,24 @@ from yad2_scraper.constants import NEXT_DATA_SCRIPT_ID
|
|
8
8
|
|
9
9
|
|
10
10
|
class Yad2Category:
    """Wrapper around the parsed HTML (a BeautifulSoup tree) of a Yad2 category page."""

    def __init__(self, soup: BeautifulSoup):
        """Keep a reference to the parsed page."""
        self.soup = soup

    @classmethod
    def from_html_io(cls, html_io: Union[TextIO, BinaryIO]):
        """Read all of ``html_io``, parse it with ``html.parser`` and build an instance."""
        return cls(BeautifulSoup(html_io.read(), "html.parser"))

    def load_next_data(self) -> Optional[NextData]:
        """Return the page's Next.js payload as ``NextData``, or ``None`` when the script lookup is falsy."""
        script_tag = self.soup.find("script", id=NEXT_DATA_SCRIPT_ID)
        if not script_tag:
            return None
        payload = json.loads(script_tag.string)
        return NextData(payload)

    def find_all_tags_by_class_substring(self, tag_name: str, substring: str) -> List[Tag]:
        """Delegate to ``find_all_html_tags_by_class_substring`` using this page's soup."""
        return find_all_html_tags_by_class_substring(self.soup, tag_name, substring)
|
@@ -3,29 +3,40 @@ from typing import List, Union
|
|
3
3
|
|
4
4
|
|
5
5
|
class ResponseError(Exception):
    """Exception that carries the HTTP request and response it was raised for."""

    def __init__(self, msg: str, request: httpx.Request, response: httpx.Response):
        """Store the request/response pair and hand ``msg`` to ``Exception``."""
        self.request, self.response = request, response
        super().__init__(msg)
|
10
13
|
|
11
14
|
|
12
15
|
class AntiBotDetectedError(ResponseError):
    """Response error signalling that an anti-bot mechanism was detected."""
|
14
18
|
|
15
19
|
|
16
20
|
class UnexpectedContentError(ResponseError):
    """Response error signalling that the response content was not as expected."""
|
18
23
|
|
19
24
|
|
20
25
|
class MaxAttemptsExceededError(Exception):
    """Raised when the maximum number of attempts is exceeded."""

    # `Union[..., None]` spells out that `errors` may be omitted; the previous
    # `List[BaseException] = None` relied on implicit Optional.
    def __init__(self, msg: str, max_attempts: int, errors: Union[List[BaseException], None] = None):
        """
        Initialize with an error message, max attempts, and optional errors.

        Args:
            msg: Message passed to ``Exception``.
            max_attempts: Stored as ``self.max_attempts``.
            errors: Stored unchanged as ``self.errors``; remains ``None`` when not supplied.
        """
        super().__init__(msg)
        self.max_attempts = max_attempts
        self.errors = errors
|
25
33
|
|
26
34
|
|
27
35
|
class MaxRequestAttemptsExceededError(MaxAttemptsExceededError):
|
36
|
+
"""Raised when all HTTP request attempts fail."""
|
37
|
+
|
28
38
|
def __init__(self, method: str, url: str, max_attempts: int, errors: List[Union[httpx.HTTPError, ResponseError]]):
|
39
|
+
"""Initialize with request method, URL, max attempts, and error list."""
|
29
40
|
msg = f"All {max_attempts} attempts for {method} request to '{url}' have failed"
|
30
41
|
super().__init__(msg, max_attempts, errors)
|
31
42
|
self.method = method
|
@@ -0,0 +1,59 @@
|
|
1
|
+
from datetime import datetime
|
2
|
+
from enum import Enum
|
3
|
+
from typing import List, Union
|
4
|
+
|
5
|
+
from yad2_scraper.utils import safe_access
|
6
|
+
|
7
|
+
# Alias for "str or int".
# NOTE(review): presumably the value types returned by field lookups — confirm against users of this alias.
FieldTypes = Union[str, int]

# Wrapper produced by `safe_access`, configured with (KeyError, TypeError) and default=None.
# NOTE(review): presumably makes a wrapped callable return None instead of raising
# those exceptions — confirm against yad2_scraper.utils.safe_access.
_safe_access_optional_keys = safe_access(exceptions=(KeyError, TypeError), default=None)
|
10
|
+
|
11
|
+
|
12
|
+
class SafeAccessOptionalKeysMeta(type):
    """Metaclass that passes every callable and property accessor through ``_safe_access_optional_keys``."""

    def __new__(cls, name, bases, dictionary):
        # Iterate over a snapshot so entries can be reassigned in place.
        for key, member in list(dictionary.items()):
            if callable(member):
                # Plain methods (and any other callable attribute) are wrapped directly.
                dictionary[key] = _safe_access_optional_keys(member)
                continue
            if not isinstance(member, property):
                continue
            # Rebuild the property with each accessor that exists wrapped; absent ones stay None.
            accessors = (member.fget, member.fset, member.fdel)
            guarded = [_safe_access_optional_keys(fn) if fn else None for fn in accessors]
            dictionary[key] = property(*guarded, member.__doc__)
        return super().__new__(cls, name, bases, dictionary)
|
27
|
+
|
28
|
+
|
29
|
+
class Field(str, Enum):
    """String-valued enum of field keys (``id``, ``text``, ``textEng``).

    NOTE(review): presumably these are keys inside per-attribute entries of the
    Next.js data (the README passes ``Field.ENGLISH_TEXT`` to ``model(...)``) —
    confirm against the callers.
    """
    ID = "id"
    TEXT = "text"
    ENGLISH_TEXT = "textEng"
|
33
|
+
|
34
|
+
|
35
|
+
def convert_string_date_to_datetime(date_string: str) -> datetime:
    """Parse ``date_string`` with ``datetime.fromisoformat`` and return the resulting datetime."""
    parsed = datetime.fromisoformat(date_string)
    return parsed
|
38
|
+
|
39
|
+
|
40
|
+
class NextData:
    """Thin wrapper around the dictionary decoded from a page's Next.js data."""

    def __init__(self, data: dict):
        """Keep the decoded dictionary as ``self.data``."""
        self.data = data

    @property
    def json(self) -> dict:
        """The wrapped dictionary itself (no copy is made)."""
        return self.data

    @property
    def queries(self) -> List[dict]:
        """The value at ``props -> pageProps -> dehydratedState -> queries``."""
        page_props = self.data["props"]["pageProps"]
        dehydrated_state = page_props["dehydratedState"]
        return dehydrated_state["queries"]

    def __getitem__(self, item):
        """Forward ``obj[item]`` to the wrapped dictionary."""
        return self.data[item]
|
@@ -2,7 +2,7 @@ from pydantic import BaseModel
|
|
2
2
|
from enum import Enum
|
3
3
|
from typing import Optional, Tuple
|
4
4
|
|
5
|
-
|
5
|
+
# Pair of ints used for range filters such as price_range (formatted as 'min_value-max_value').
NumberRange = Tuple[int, int]
|
6
6
|
|
7
7
|
|
8
8
|
class OrderBy(int, Enum):
|
@@ -13,6 +13,7 @@ class OrderBy(int, Enum):
|
|
13
13
|
|
14
14
|
|
15
15
|
def format_number_range(number_range: Optional[Tuple[int, int]]) -> Optional[str]:
|
16
|
+
"""Format a number range as 'min_value-max_value'."""
|
16
17
|
if number_range is None:
|
17
18
|
return None
|
18
19
|
|
@@ -25,12 +26,13 @@ def format_number_range(number_range: Optional[Tuple[int, int]]) -> Optional[str
|
|
25
26
|
|
26
27
|
|
27
28
|
class QueryFilters(BaseModel):
|
29
|
+
"""Pydantic model representing query filters for querying a resource."""
|
28
30
|
page: Optional[int] = None
|
29
31
|
order_by: Optional[OrderBy] = None
|
30
|
-
price_range: Optional[
|
31
|
-
...
|
32
|
+
price_range: Optional[NumberRange] = None
|
32
33
|
|
33
34
|
def to_params(self) -> dict:
|
35
|
+
"""Convert filter fields to query parameters."""
|
34
36
|
return {
|
35
37
|
"page": self.page,
|
36
38
|
"Order": self.order_by,
|
@@ -38,9 +40,9 @@ class QueryFilters(BaseModel):
|
|
38
40
|
}
|
39
41
|
|
40
42
|
def to_clean_params(self):
|
43
|
+
"""Return query parameters excluding None values."""
|
41
44
|
return {key: value for key, value in self.to_params().items() if value is not None}
|
42
45
|
|
43
|
-
# TODO: add helper methods for managing the attribute values
|
44
|
-
|
45
46
|
def __iter__(self):
|
47
|
+
"""Allow iteration over the clean query parameters."""
|
46
48
|
yield from self.to_clean_params().items()
|