pypararius 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pypararius/__init__.py +20 -0
- pypararius/listing.py +138 -0
- pypararius/pararius.py +238 -0
- pypararius/parser.py +328 -0
- pypararius-2.0.0.dist-info/METADATA +9 -0
- pypararius-2.0.0.dist-info/RECORD +7 -0
- pypararius-2.0.0.dist-info/WHEEL +4 -0
pypararius/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pararius - Python API for Pararius.com real estate listings.
|
|
3
|
+
|
|
4
|
+
Example usage:
|
|
5
|
+
>>> from pypararius import Pararius
|
|
6
|
+
>>> p = Pararius()
|
|
7
|
+
>>> listing = p.get_listing('amsterdam/abc123/street')
|
|
8
|
+
>>> print(listing['title'], listing['price'])
|
|
9
|
+
Ridderspoorweg 10 1850
|
|
10
|
+
|
|
11
|
+
>>> results = p.search_listing('amsterdam', price_max=2000)
|
|
12
|
+
>>> for r in results[:3]:
|
|
13
|
+
... print(r['title'], r['city'])
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from pypararius.pararius import Pararius, ParariusAPI
|
|
17
|
+
from pypararius.listing import Listing
|
|
18
|
+
|
|
19
|
+
__version__ = "2.0.0"
|
|
20
|
+
__all__ = ["Pararius", "ParariusAPI", "Listing"]
|
pypararius/listing.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Listing class - represents a Pararius property listing."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Listing:
|
|
7
|
+
"""A Pararius property listing.
|
|
8
|
+
|
|
9
|
+
Data can be accessed as listing['key'] or listing.get('key').
|
|
10
|
+
|
|
11
|
+
Example:
|
|
12
|
+
>>> listing = pararius.get_listing('amsterdam/abc123/street')
|
|
13
|
+
>>> listing['title']
|
|
14
|
+
'Ridderspoorweg 10'
|
|
15
|
+
>>> listing['price']
|
|
16
|
+
1850
|
|
17
|
+
>>> listing['city']
|
|
18
|
+
'Amsterdam'
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
keys_alias = {
|
|
22
|
+
'name': 'title',
|
|
23
|
+
'address': 'title',
|
|
24
|
+
'location': 'city',
|
|
25
|
+
'locality': 'city',
|
|
26
|
+
'area': 'living_area',
|
|
27
|
+
'size': 'living_area',
|
|
28
|
+
'area_m2': 'living_area',
|
|
29
|
+
'coords': 'coordinates',
|
|
30
|
+
'lat': 'latitude',
|
|
31
|
+
'lng': 'longitude',
|
|
32
|
+
'lon': 'longitude',
|
|
33
|
+
'zip': 'postcode',
|
|
34
|
+
'zipcode': 'postcode',
|
|
35
|
+
'postal_code': 'postcode',
|
|
36
|
+
'type': 'object_type',
|
|
37
|
+
'property_type': 'object_type',
|
|
38
|
+
'images': 'photos',
|
|
39
|
+
'pictures': 'photos',
|
|
40
|
+
'media': 'photos',
|
|
41
|
+
'desc': 'description',
|
|
42
|
+
'text': 'description',
|
|
43
|
+
'agent': 'broker',
|
|
44
|
+
'realtor': 'broker',
|
|
45
|
+
'makelaar': 'broker',
|
|
46
|
+
'energy_rating': 'energy_label',
|
|
47
|
+
'street': 'title',
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
def __init__(self, listing_id: str | int | None = None, data: dict | None = None):
|
|
51
|
+
self.listing_id = str(listing_id) if listing_id else None
|
|
52
|
+
self.data: dict[str, Any] = data or {}
|
|
53
|
+
|
|
54
|
+
def __repr__(self) -> str:
|
|
55
|
+
title = self.data.get('title', 'Unknown')
|
|
56
|
+
city = self.data.get('city', '')
|
|
57
|
+
return f"<Listing id:{self.listing_id} [{title}, {city}]>"
|
|
58
|
+
|
|
59
|
+
def __str__(self) -> str:
|
|
60
|
+
return self.__repr__()
|
|
61
|
+
|
|
62
|
+
def __contains__(self, key: str) -> bool:
|
|
63
|
+
return self._normalize_key(key) in self.data
|
|
64
|
+
|
|
65
|
+
def __getitem__(self, key: str) -> Any:
|
|
66
|
+
normalized = self._normalize_key(key)
|
|
67
|
+
if normalized not in self.data:
|
|
68
|
+
raise KeyError(key)
|
|
69
|
+
return self.data[normalized]
|
|
70
|
+
|
|
71
|
+
def __setitem__(self, key: str, value: Any) -> None:
|
|
72
|
+
self.data[self._normalize_key(key)] = value
|
|
73
|
+
|
|
74
|
+
def __bool__(self) -> bool:
|
|
75
|
+
return bool(self.listing_id or self.data.get('title'))
|
|
76
|
+
|
|
77
|
+
def _normalize_key(self, key: str) -> str:
|
|
78
|
+
"""Normalize key using aliases."""
|
|
79
|
+
key = key.lower().replace('-', '_').replace(' ', '_')
|
|
80
|
+
return self.keys_alias.get(key, key)
|
|
81
|
+
|
|
82
|
+
def get(self, key: str, default: Any = None) -> Any:
|
|
83
|
+
"""Get a value with optional default."""
|
|
84
|
+
try:
|
|
85
|
+
return self[key]
|
|
86
|
+
except KeyError:
|
|
87
|
+
return default
|
|
88
|
+
|
|
89
|
+
def keys(self) -> list[str]:
|
|
90
|
+
"""Return all available keys."""
|
|
91
|
+
return list(self.data.keys())
|
|
92
|
+
|
|
93
|
+
def items(self) -> list[tuple[str, Any]]:
|
|
94
|
+
"""Return all key-value pairs."""
|
|
95
|
+
return list(self.data.items())
|
|
96
|
+
|
|
97
|
+
def values(self) -> list[Any]:
|
|
98
|
+
"""Return all values."""
|
|
99
|
+
return list(self.data.values())
|
|
100
|
+
|
|
101
|
+
def to_dict(self) -> dict[str, Any]:
|
|
102
|
+
"""Return data as a plain dictionary."""
|
|
103
|
+
return self.data.copy()
|
|
104
|
+
|
|
105
|
+
def summary(self) -> str:
|
|
106
|
+
"""Return a text summary of the listing."""
|
|
107
|
+
lines = []
|
|
108
|
+
title = self.data.get('title', 'Unknown')
|
|
109
|
+
city = self.data.get('city', '')
|
|
110
|
+
lines.append(f"Listing: {title}, {city}")
|
|
111
|
+
|
|
112
|
+
if price := self.data.get('price_formatted'):
|
|
113
|
+
lines.append(f"Price: {price}")
|
|
114
|
+
elif price := self.data.get('price'):
|
|
115
|
+
lines.append(f"Price: €{price:,}")
|
|
116
|
+
|
|
117
|
+
if area := self.data.get('living_area'):
|
|
118
|
+
lines.append(f"Living area: {area} m²")
|
|
119
|
+
|
|
120
|
+
if bedrooms := self.data.get('bedrooms'):
|
|
121
|
+
lines.append(f"Bedrooms: {bedrooms}")
|
|
122
|
+
|
|
123
|
+
if energy := self.data.get('energy_label'):
|
|
124
|
+
lines.append(f"Energy label: {energy}")
|
|
125
|
+
|
|
126
|
+
if url := self.data.get('url'):
|
|
127
|
+
lines.append(f"URL: {url}")
|
|
128
|
+
|
|
129
|
+
return '\n'.join(lines)
|
|
130
|
+
|
|
131
|
+
def getID(self) -> str | None:
|
|
132
|
+
"""Return the listing ID."""
|
|
133
|
+
return self.listing_id
|
|
134
|
+
|
|
135
|
+
@property
|
|
136
|
+
def id(self) -> str | None:
|
|
137
|
+
"""Alias for listing_id."""
|
|
138
|
+
return self.listing_id
|
pypararius/pararius.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""Main Pararius API class."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from urllib.parse import urljoin
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
from pypararius.listing import Listing
|
|
9
|
+
from pypararius.parser import parse_listing_details, parse_search_response
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Base URL
|
|
13
|
+
BASE_URL = "https://www.pararius.com"
|
|
14
|
+
|
|
15
|
+
# Headers
|
|
16
|
+
HEADERS = {
|
|
17
|
+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
|
|
18
|
+
"Accept": "application/json, text/html",
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Pararius:
    """Main interface to Pararius API.

    One ``httpx.Client`` is created on first use and reused afterwards. Use
    the object as a context manager, or call ``close()``, to release it.

    Example:
        >>> from pypararius import Pararius
        >>> p = Pararius()
        >>> listing = p.get_listing('amsterdam/abc123/street')
        >>> print(listing['title'], listing['city'])
        Ridderspoorweg 10 Amsterdam
        >>> results = p.search_listing('amsterdam', price_max=2000)
        >>> for r in results[:3]:
        ...     print(r['title'], r['price'])
    """

    ID_PATTERN = re.compile(r"/([a-f0-9]{8})/")

    # Accepted ``interior`` values -> URL path segment.
    _INTERIOR_SEGMENTS = {
        "furnished": "furnished",
        "upholstered": "upholstered",
        "shell": "shell",
    }

    # Accepted ``sort`` values -> URL path segment ("" = site default order).
    _SORT_SEGMENTS = {
        "newest": "",  # Default
        "price_asc": "sort-price-low",
        "price_desc": "sort-price-high",
        "area_asc": "sort-floor-low",
        "area_desc": "sort-floor-high",
    }

    def __init__(self, timeout: int = 30):
        """Initialize Pararius API client.

        Args:
            timeout: Request timeout in seconds
        """
        self.timeout = timeout
        self._client: httpx.Client | None = None

    @property
    def client(self) -> httpx.Client:
        """Lazily create HTTP client."""
        if self._client is None:
            self._client = httpx.Client(
                timeout=self.timeout,
                headers=HEADERS,
                follow_redirects=True,
            )
        return self._client

    def close(self) -> None:
        """Close the HTTP client."""
        if self._client:
            self._client.close()
            self._client = None

    def __enter__(self) -> "Pararius":
        return self

    def __exit__(self, *args) -> None:
        self.close()

    # -------------------------------------------------------------------------
    # Listing methods
    # -------------------------------------------------------------------------

    def get_listing(self, listing_id: str) -> Listing:
        """Get a listing by URL or path.

        Args:
            listing_id: Full URL, absolute path ('/apartment-for-rent/...'),
                or relative path. A relative path may start with the city
                ('amsterdam/eecd88d9/street') or with the
                '<type>-for-rent' segment. A bare ID is not enough.

        Returns:
            Listing object with property data

        Raises:
            ValueError: If ``listing_id`` contains no '/' (bare ID).
            LookupError: If the site answers 404.
            httpx.HTTPStatusError: For any other error status.

        Example:
            >>> p.get_listing('amsterdam/eecd88d9/ridderspoorweg')
            >>> p.get_listing('apartment-for-rent/amsterdam/eecd88d9/ridderspoorweg')
            >>> p.get_listing('https://www.pararius.com/apartment-for-rent/amsterdam/eecd88d9/ridderspoorweg')
        """
        if listing_id.startswith("http"):
            # Full URL: use it directly.
            url = listing_id
        elif "/" in listing_id:
            if listing_id.startswith("/"):
                url = urljoin(BASE_URL, listing_id)
            elif listing_id.split("/", 1)[0].endswith("-for-rent"):
                # The path already carries its '<type>-for-rent' segment;
                # prefixing it again would build a URL that cannot exist.
                url = f"{BASE_URL}/{listing_id}"
            else:
                # 'city/id/street' form.
                url = f"{BASE_URL}/apartment-for-rent/{listing_id}"
        else:
            # Just an ID - the URL also needs the city and street segments.
            raise ValueError(
                f"Cannot fetch listing by ID alone. Please provide a URL or path like 'amsterdam/{listing_id}/street'"
            )

        response = self.client.get(url)

        if response.status_code == 404:
            raise LookupError(f"Listing {listing_id} not found")

        response.raise_for_status()
        # Pass the final URL (after redirects) so the parser sees the canonical path.
        return parse_listing_details(response.text, str(response.url))

    def search_listing(
        self,
        location: str | list[str] | None = None,
        price_min: int | None = None,
        price_max: int | None = None,
        area_min: int | None = None,
        bedrooms: int | None = None,
        interior: str | None = None,
        sort: str | None = None,
        page: int = 0,
    ) -> list[Listing]:
        """Search for listings.

        Args:
            location: City name to search in (e.g., 'amsterdam'). When a list
                is given only its first element is used. Defaults to
                'amsterdam'.
            price_min: Minimum rent price
            price_max: Maximum rent price
            area_min: Minimum living area in m²
            bedrooms: Minimum number of bedrooms
            interior: Interior type ('furnished', 'upholstered', 'shell');
                other values are ignored
            sort: Sort order - 'newest', 'price_asc', 'price_desc',
                  'area_asc', 'area_desc', or None; other values are ignored
            page: Page number (0-indexed, ~30 results per page)

        Returns:
            List of Listing objects

        Raises:
            RuntimeError: If the response status is not 200.

        Example:
            >>> p.search_listing('amsterdam', price_max=2000)
            >>> p.search_listing('rotterdam', bedrooms=2, interior='furnished')
        """
        # Normalize location
        if isinstance(location, list):
            city = location[0] if location else "amsterdam"
        else:
            city = location or "amsterdam"

        city = city.lower().replace(" ", "-")

        url = self._build_search_url(
            city=city,
            price_min=price_min,
            price_max=price_max,
            area_min=area_min,
            bedrooms=bedrooms,
            interior=interior,
            sort=sort,
            page=page + 1,  # Pararius uses 1-indexed pages
        )

        # Add XHR header to get JSON response
        response = self.client.get(url, headers={"X-Requested-With": "XMLHttpRequest"})

        if response.status_code != 200:
            raise RuntimeError(f"Search failed (status {response.status_code})")

        return parse_search_response(response.json(), city)

    # -------------------------------------------------------------------------
    # URL building
    # -------------------------------------------------------------------------

    def _build_search_url(
        self,
        city: str,
        price_min: int | None = None,
        price_max: int | None = None,
        area_min: int | None = None,
        bedrooms: int | None = None,
        interior: str | None = None,
        sort: str | None = None,
        page: int = 1,
    ) -> str:
        """Build the search URL with filters.

        Each active filter adds one path segment, in this fixed order:
        price, bedrooms, area, interior, page, sort.
        """
        parts = [f"{BASE_URL}/apartments/{city}"]

        # Price filter. A missing bound is written as 0.
        # NOTE(review): a minimum without a maximum yields '<min>-0';
        # confirm the site reads 0 as "no upper bound".
        p_min = price_min or 0
        p_max = price_max or 0
        if p_min > 0 or p_max > 0:
            parts.append(f"{p_min}-{p_max}")

        # Bedrooms filter
        if bedrooms is not None and bedrooms > 0:
            parts.append(f"{bedrooms}-bedrooms")

        # Area filter
        if area_min is not None and area_min > 0:
            parts.append(f"{area_min}m2")

        # Interior filter (unknown values add nothing)
        if interior is not None:
            interior_segment = self._INTERIOR_SEGMENTS.get(interior.lower())
            if interior_segment:
                parts.append(interior_segment)

        # Page (must come before sort in URL); page 1 is implicit
        if page > 1:
            parts.append(f"page-{page}")

        # Sort order (unknown values and 'newest' add nothing)
        if sort is not None:
            sort_segment = self._SORT_SEGMENTS.get(sort, "")
            if sort_segment:
                parts.append(sort_segment)

        return "/".join(parts)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
# Convenience alias
|
|
238
|
+
ParariusAPI = Pararius
|
pypararius/parser.py
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
"""Parser utilities for Pararius HTML responses."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from .listing import Listing
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def parse_search_response(data: dict, city: str) -> list[Listing]:
    """Parse the JSON response from search endpoint into list of Listings.

    The result markup is read from ``data["components"]["results"]``.

    Args:
        data: Decoded JSON body of the search response.
        city: City the search was run for; copied onto every listing.

    Returns:
        The parsed listings. Empty when the payload has no usable
        ``components.results`` string (missing, ``null`` or wrong type).
    """
    components = data.get("components") if isinstance(data, dict) else None
    results_html = components.get("results") if isinstance(components, dict) else None
    if not isinstance(results_html, str):
        # A null or missing section means "no results", not a parser crash.
        return []
    return _parse_listings_from_html(results_html, city)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _parse_listings_from_html(html: str, city: str) -> list[Listing]:
    """Collect one Listing per result ``<section>`` found in the search HTML.

    The markup is cut at every opening ``listing-search-item`` section tag.
    Each piece is trimmed at its first ``</section>`` and handed to
    ``_parse_listing_block``. Pieces without a closing tag, and blocks that
    parse to nothing (or to a falsy Listing), are dropped.
    """
    pieces = re.split(r'<section\s+class="listing-search-item[^>]*>', html)
    found = []

    # pieces[0] is whatever precedes the first result section.
    for piece in pieces[1:]:
        close_at = piece.find("</section>")
        if close_at > 0:
            parsed = _parse_listing_block(piece[:close_at], city)
            if parsed:
                found.append(parsed)

    return found
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _parse_listing_block(block: str, city: str) -> Optional[Listing]:
    """Parse a single listing block from search HTML.

    Args:
        block: Inner HTML of one search-result ``<section>``.
        city: City the search was run for; stored title-cased.

    Returns:
        A Listing holding the fields found in the block, or ``None`` when
        the block has no ``/apartment-for-rent/<city>/<id>/<slug>`` link.
    """
    from html import unescape

    # URL and ID
    url_match = re.search(r'href="(/apartment-for-rent/[^/]+/([^/]+)/([^"]+))"', block)
    if not url_match:
        return None

    path, listing_id, slug = url_match.groups()
    street = slug.replace("-", " ").title()
    url = f"https://www.pararius.com{path}"

    # Title from analytics data (more accurate street name). The JSON sits
    # inside an HTML attribute, so its quotes may appear literally or as
    # &quot;. The capture stops at the first quote or entity.
    title_match = re.search(r'element_text(?:&quot;|"):(?:&quot;|")([^&"]+)', block)
    if title_match:
        # Titles look like "Flat Ridderspoorweg": drop the leading type word.
        # A one-word title keeps the street derived from the URL slug.
        words = title_match.group(1).split()
        if len(words) > 1:
            street = " ".join(words[1:])

    # Price
    price = None
    price_formatted = None
    price_match = re.search(r'listing-search-item__price-main">([^<]+)</span>', block)
    if price_match:
        # Decode entities first: the digits of a numeric entity such as
        # "&#8364;" would otherwise end up in the parsed amount.
        price_formatted = unescape(price_match.group(1)).replace("\xa0", " ").strip()
        price_nums = re.sub(r'[^\d]', '', price_formatted)
        if price_nums:
            price = int(price_nums)

    # Neighborhood
    neighbourhood = None
    sub_match = re.search(r'listing-search-item__sub-title"[^>]*>\s*([^<]+)<', block)
    if sub_match:
        neighbourhood = unescape(sub_match.group(1)).replace("\xa0", " ").strip()

    # Area
    living_area = None
    area_match = re.search(r'title="(\d+)\s*m[²2]"', block)
    if area_match:
        living_area = int(area_match.group(1))

    # Rooms
    rooms = None
    rooms_match = re.search(r'title="(\d+)\s*room', block)
    if rooms_match:
        rooms = int(rooms_match.group(1))

    # Image (attribute values carry "&amp;" for "&" in query strings)
    photo_url = None
    img_match = re.search(r'data-src="([^"]+)"', block)
    if img_match:
        photo_url = unescape(img_match.group(1))

    listing_data = {
        "title": street,
        "city": city.title(),
        "neighbourhood": neighbourhood,
        "price": price,
        "price_formatted": price_formatted,
        "living_area": living_area,
        "rooms": rooms,
        "url": url,
        "photos": [photo_url] if photo_url else [],
        "photo_urls": [photo_url] if photo_url else [],
    }

    return Listing(listing_id=listing_id, data=listing_data)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _first_mapping(value) -> dict:
    """Return ``value`` if it is a dict, its first dict element if it is a list, else ``{}``."""
    if isinstance(value, dict):
        return value
    if isinstance(value, list):
        return next((item for item in value if isinstance(item, dict)), {})
    return {}


def parse_listing_details(html: str, url: str) -> Listing:
    """Parse full listing details from detail page HTML.

    Structured fields come from the page's JSON-LD block. Features, images,
    broker and coordinates are scraped from the markup by the ``_extract_*``
    helpers.

    Args:
        html: Detail page markup.
        url: URL of the page; stored on the listing and used to derive the
            listing id (second-to-last path segment).

    Returns:
        A Listing with every recognised field. Fields that cannot be found
        are ``None`` (or empty collections).
    """
    # .../<city>/<id>/<street-slug>: ignore query, fragment and a trailing
    # slash so the id stays the second-to-last segment.
    path = url.split("?", 1)[0].split("#", 1)[0].rstrip("/")
    listing_id = path.split("/")[-2] if "/" in path else ""

    # Extract JSON-LD
    jsonld = _extract_jsonld(html)

    # Basic info from JSON-LD
    name = jsonld.get("name", "")
    description = jsonld.get("description")
    main_image = jsonld.get("image")

    # Address (JSON-LD allows plain text here; only an object has parts)
    addr_data = _first_mapping(jsonld.get("address"))
    street = addr_data.get("streetAddress", "")
    city = addr_data.get("addressLocality", "")
    postcode = addr_data.get("postalCode")
    neighbourhood = addr_data.get("addressRegion")

    # Rooms and area from JSON-LD. numberOfRooms may be a list of value
    # objects, a single value object, or a bare number.
    rooms = None
    rooms_data = jsonld.get("numberOfRooms")
    if isinstance(rooms_data, (list, dict)):
        rooms = _first_mapping(rooms_data).get("value")
    elif rooms_data:
        rooms = rooms_data

    living_area = _first_mapping(jsonld.get("floorSize")).get("value")

    # Price
    price = None
    offer = _first_mapping(jsonld.get("offers"))
    price_str = offer.get("price")
    if price_str:
        try:
            price = int(float(price_str))
        except (TypeError, ValueError):
            # Non-numeric text (e.g. a "on request" label) means no price.
            price = None
    currency = offer.get("priceCurrency", "EUR")

    # Features from HTML
    features = _extract_features(html)

    # All images; the JSON-LD lead image goes first when not already there.
    images = _extract_images(html)
    if isinstance(main_image, list):
        main_image = next((m for m in main_image if isinstance(m, (str, dict))), None)
    if isinstance(main_image, dict):
        main_image = main_image.get("url")
    if isinstance(main_image, str) and main_image and main_image not in images:
        images.insert(0, main_image)

    # Agent/Broker
    broker = _extract_agent(html)

    # Coordinates
    coords = _extract_coordinates(html)

    # Extract specific features
    deposit = features.get("Deposit")
    interior = features.get("Interior")
    available = features.get("Available")
    offered_since = features.get("Offered since")
    rental_agreement = features.get("Rental agreement")
    energy_label = features.get("Energy rating")

    # Boolean features (None = not stated on the page)
    smoking_allowed = None
    pets_allowed = None
    if "Smoking allowed" in features:
        smoking_allowed = features["Smoking allowed"].lower() in ("yes", "ja", "allowed")
    if "Pets allowed" in features:
        pets_allowed = features["Pets allowed"].lower() in ("yes", "ja", "allowed", "in consultation")

    # Bedrooms
    bedrooms = None
    if "Number of bedrooms" in features:
        try:
            bedrooms = int(features["Number of bedrooms"])
        except ValueError:
            pass

    # Price formatted
    price_formatted = f"€{price:,} per month" if price else None

    listing_data = {
        "title": name or street,
        "city": city,
        "postcode": postcode,
        "neighbourhood": neighbourhood,
        "price": price,
        "price_formatted": price_formatted,
        "currency": currency,
        "living_area": living_area,
        "rooms": rooms,
        "bedrooms": bedrooms,
        "description": description,
        "url": url,
        "photos": images,
        # Separate list object so editing one key does not change the other.
        "photo_urls": list(images),
        "photo_count": len(images),
        "energy_label": energy_label,
        "offered_since": offered_since,
        "characteristics": features,
        # Rental-specific
        "deposit": deposit,
        "interior": interior,
        "available": available,
        "rental_agreement": rental_agreement,
        "smoking_allowed": smoking_allowed,
        "pets_allowed": pets_allowed,
        "offering_type": "rent",
        "object_type": "apartment",
    }

    # Coordinates
    if coords:
        listing_data["latitude"] = coords[0]
        listing_data["longitude"] = coords[1]
        listing_data["coordinates"] = coords

    # Broker
    if broker:
        listing_data["broker"] = broker.get("name")
        listing_data["broker_url"] = broker.get("url")
        listing_data["broker_phone"] = broker.get("phone")

    return Listing(listing_id=listing_id, data=listing_data)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _extract_jsonld(html: str) -> dict:
|
|
233
|
+
"""Extract JSON-LD structured data from HTML."""
|
|
234
|
+
matches = re.findall(
|
|
235
|
+
r'<script type="application/ld\+json">(.*?)</script>',
|
|
236
|
+
html,
|
|
237
|
+
re.DOTALL,
|
|
238
|
+
)
|
|
239
|
+
for match in matches:
|
|
240
|
+
try:
|
|
241
|
+
data = json.loads(match)
|
|
242
|
+
type_val = data.get("@type", "")
|
|
243
|
+
if "House" in str(type_val) or "Product" in str(type_val):
|
|
244
|
+
return data
|
|
245
|
+
except json.JSONDecodeError:
|
|
246
|
+
continue
|
|
247
|
+
return {}
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def _extract_features(html: str) -> dict[str, str]:
|
|
251
|
+
"""Extract features from listing HTML."""
|
|
252
|
+
features = {}
|
|
253
|
+
|
|
254
|
+
# Pattern 1: <dd class="listing-features__term">Term</dd> <dd ...><span>Value</span>
|
|
255
|
+
pattern1 = (
|
|
256
|
+
r'<dd class="listing-features__term">([^<]+)</dd>\s*'
|
|
257
|
+
r'<dd class="listing-features__description[^"]*">\s*'
|
|
258
|
+
r'(?:<span class="listing-features__main-description">)?([^<]+)'
|
|
259
|
+
)
|
|
260
|
+
for term, value in re.findall(pattern1, html):
|
|
261
|
+
features[term.strip()] = value.strip().replace(" ", " ")
|
|
262
|
+
|
|
263
|
+
# Pattern 2: <dt ...>Term</dt> <dd ...><span>Value</span> (for some features)
|
|
264
|
+
pattern2 = (
|
|
265
|
+
r'<dt class="listing-features__term[^"]*">([^<]+)</dt>\s*'
|
|
266
|
+
r'<dd class="listing-features__description[^"]*">\s*'
|
|
267
|
+
r'(?:\s*<span class="listing-features__main-description">)?([^<]+)'
|
|
268
|
+
)
|
|
269
|
+
for term, value in re.findall(pattern2, html):
|
|
270
|
+
features[term.strip()] = value.strip().replace(" ", " ")
|
|
271
|
+
|
|
272
|
+
return features
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _extract_images(html: str) -> list[str]:
|
|
276
|
+
"""Extract all listing images from HTML."""
|
|
277
|
+
images = set()
|
|
278
|
+
pattern = r'(https://casco-media-prod[^"&\s]+\.(?:jpg|png|webp))'
|
|
279
|
+
for img in re.findall(pattern, html):
|
|
280
|
+
# Prefer full-size images
|
|
281
|
+
if "width=600" in img or "width=" not in img:
|
|
282
|
+
clean_url = img.replace("&", "&")
|
|
283
|
+
images.add(clean_url)
|
|
284
|
+
return list(images)[:20] # Limit to 20 images
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _extract_agent(html: str) -> Optional[dict]:
|
|
288
|
+
"""Extract agent information from HTML."""
|
|
289
|
+
agent_url = None
|
|
290
|
+
agent_name = None
|
|
291
|
+
agent_phone = None
|
|
292
|
+
|
|
293
|
+
url_match = re.search(r'href="(/real-estate-agent[^"]+)"', html)
|
|
294
|
+
if url_match:
|
|
295
|
+
agent_url = f"https://www.pararius.com{url_match.group(1)}"
|
|
296
|
+
|
|
297
|
+
# Agent name is inside: <a class="agent-summary__title-link" ...>Name</a>
|
|
298
|
+
name_match = re.search(r'agent-summary__title-link"[^>]*>([^<]+)', html)
|
|
299
|
+
if name_match:
|
|
300
|
+
agent_name = name_match.group(1).strip()
|
|
301
|
+
|
|
302
|
+
phone_match = re.search(r'tel:([^"]+)', html)
|
|
303
|
+
if phone_match:
|
|
304
|
+
agent_phone = phone_match.group(1)
|
|
305
|
+
|
|
306
|
+
if agent_url or agent_name:
|
|
307
|
+
return {"name": agent_name, "url": agent_url, "phone": agent_phone}
|
|
308
|
+
return None
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _extract_coordinates(html: str) -> Optional[tuple[float, float]]:
|
|
312
|
+
"""Extract map coordinates from HTML."""
|
|
313
|
+
# Try data-latitude/data-longitude attributes
|
|
314
|
+
match = re.search(r'data-latitude="([^"]+)"[^>]*data-longitude="([^"]+)"', html)
|
|
315
|
+
if match:
|
|
316
|
+
return (float(match.group(1)), float(match.group(2)))
|
|
317
|
+
|
|
318
|
+
# Try data-lat/data-lon attributes (fallback)
|
|
319
|
+
match = re.search(r'data-lat="([^"]+)"[^>]*data-lon="([^"]+)"', html)
|
|
320
|
+
if match:
|
|
321
|
+
return (float(match.group(1)), float(match.group(2)))
|
|
322
|
+
|
|
323
|
+
# Try JSON in script
|
|
324
|
+
match = re.search(r'"lat":\s*([\d.]+).*?"lon":\s*([\d.]+)', html)
|
|
325
|
+
if match:
|
|
326
|
+
return (float(match.group(1)), float(match.group(2)))
|
|
327
|
+
|
|
328
|
+
return None
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pypararius
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Python API wrapper for Pararius.com real estate listings
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: httpx>=0.27.0
|
|
7
|
+
Provides-Extra: dev
|
|
8
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
|
|
9
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
pypararius/__init__.py,sha256=q68u5npo3QIW3BV84YCIPrRyAqcydlJ4TrILELdlB3o,586
|
|
2
|
+
pypararius/listing.py,sha256=sxm7jl6Sod0ynW5-hcTFdZpvgIKgDXdjmQS60CnSO90,4166
|
|
3
|
+
pypararius/pararius.py,sha256=1673qk6Hd8nU4OKS3pbGcdczuN5BpyIbBWYnEvEHdIU,7493
|
|
4
|
+
pypararius/parser.py,sha256=mVtSkPCExxQMLLG5PzZXli_GUJeqmUKPpPXE4P8-DlI,10519
|
|
5
|
+
pypararius-2.0.0.dist-info/METADATA,sha256=QsYdxD0TbJB10bU9vH0kofGI1n8Uv9rs5w76XQsr17o,292
|
|
6
|
+
pypararius-2.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
7
|
+
pypararius-2.0.0.dist-info/RECORD,,
|