bachtrackapi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- api/__init__.py +1 -0
- api/main.py +35 -0
- bachtrackapi-0.1.0.dist-info/METADATA +163 -0
- bachtrackapi-0.1.0.dist-info/RECORD +9 -0
- bachtrackapi-0.1.0.dist-info/WHEEL +5 -0
- bachtrackapi-0.1.0.dist-info/licenses/LICENSE +21 -0
- bachtrackapi-0.1.0.dist-info/top_level.txt +2 -0
- scraper/__init__.py +3 -0
- scraper/scraper.py +297 -0
api/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""FastAPI backend for Bachtrack opera events API."""
|
api/main.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""FastAPI application factory."""
|
|
2
|
+
from fastapi import FastAPI
|
|
3
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
4
|
+
from api.routes.events import router as events_router
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def create_app() -> FastAPI:
|
|
8
|
+
"""Create and configure FastAPI application."""
|
|
9
|
+
app = FastAPI(
|
|
10
|
+
title="BachtrackAPI",
|
|
11
|
+
description="API to search and retrieve opera events from Bachtrack.com",
|
|
12
|
+
version="0.1.0",
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Add CORS middleware
|
|
16
|
+
app.add_middleware(
|
|
17
|
+
CORSMiddleware,
|
|
18
|
+
allow_origins=["*"],
|
|
19
|
+
allow_credentials=True,
|
|
20
|
+
allow_methods=["*"],
|
|
21
|
+
allow_headers=["*"],
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
# Include routers
|
|
25
|
+
app.include_router(events_router)
|
|
26
|
+
|
|
27
|
+
# Health check endpoint
|
|
28
|
+
@app.get("/health")
|
|
29
|
+
async def health_check():
|
|
30
|
+
return {"status": "ok"}
|
|
31
|
+
|
|
32
|
+
return app
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
app = create_app()
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bachtrackapi
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python web scraper and REST API for extracting classical music opera events from Bachtrack.com
|
|
5
|
+
Author-email: Clark Maio <clark@example.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/clarkmaio/bachtrackapi
|
|
8
|
+
Project-URL: Repository, https://github.com/clarkmaio/bachtrackapi.git
|
|
9
|
+
Project-URL: Documentation, https://github.com/clarkmaio/bachtrackapi#readme
|
|
10
|
+
Project-URL: Issues, https://github.com/clarkmaio/bachtrackapi/issues
|
|
11
|
+
Keywords: scraper,api,opera,classical-music,fastapi,bachtrack
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: fastapi==0.104.1
|
|
27
|
+
Requires-Dist: uvicorn[standard]==0.24.0
|
|
28
|
+
Requires-Dist: pydantic==2.5.0
|
|
29
|
+
Requires-Dist: pydantic-settings==2.1.0
|
|
30
|
+
Requires-Dist: requests==2.31.0
|
|
31
|
+
Requires-Dist: beautifulsoup4==4.12.2
|
|
32
|
+
Requires-Dist: selenium==4.15.2
|
|
33
|
+
Requires-Dist: webdriver-manager==4.0.1
|
|
34
|
+
Requires-Dist: python-dotenv==1.0.0
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: pytest==7.4.3; extra == "dev"
|
|
37
|
+
Requires-Dist: pytest-asyncio==0.21.1; extra == "dev"
|
|
38
|
+
Requires-Dist: black==23.12.0; extra == "dev"
|
|
39
|
+
Requires-Dist: flake8==6.1.0; extra == "dev"
|
|
40
|
+
Requires-Dist: mypy==1.7.0; extra == "dev"
|
|
41
|
+
Dynamic: license-file
|
|
42
|
+
Dynamic: requires-python
|
|
43
|
+
|
|
44
|
+
# BachtrackAPI
|
|
45
|
+
|
|
46
|
+
A Python web scraper and REST API for extracting classical music opera events from [Bachtrack.com](https://bachtrack.com/).
|
|
47
|
+
|
|
48
|
+
## Overview
|
|
49
|
+
|
|
50
|
+
BachtrackAPI provides two complementary ways to access opera event data:
|
|
51
|
+
|
|
52
|
+
1. **Scraper Module** - Direct web scraping of Bachtrack opera listings
|
|
53
|
+
2. **FastAPI Backend** - RESTful API endpoints for searching and filtering events
|
|
54
|
+
|
|
55
|
+
Search by work ID (e.g., `12285` for Gianni Schicchi) or freetext (e.g., `"La Traviata"`).
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install -r requirements.txt
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Quick Start
|
|
64
|
+
|
|
65
|
+
### 1. Using the Scraper Directly
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from scraper.scraper import BachtrackScraper
|
|
69
|
+
|
|
70
|
+
scraper = BachtrackScraper()
|
|
71
|
+
|
|
72
|
+
# Search by work ID
|
|
73
|
+
events = scraper.search_operas(12285) # Gianni Schicchi
|
|
74
|
+
print(f"Found {len(events)} events")
|
|
75
|
+
|
|
76
|
+
# Search by freetext
|
|
77
|
+
events = scraper.search_operas("La Traviata")
|
|
78
|
+
for event in events:
|
|
79
|
+
print(f"{event['title']} - {event['city']} @ {event['venue']}")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
**Output:**
|
|
83
|
+
```
|
|
84
|
+
Found 28 events
|
|
85
|
+
Gianni Schicchi - Berlin @ Deutsche Oper
|
|
86
|
+
Gianni Schicchi - Winterthur @ Stadttheater Winterthur
|
|
87
|
+
...
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### 2. Using the FastAPI Backend
|
|
91
|
+
|
|
92
|
+
Start the server:
|
|
93
|
+
```bash
|
|
94
|
+
uvicorn api.main:app --reload
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**Example API Requests:**
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
# Freetext search
|
|
101
|
+
curl "http://localhost:8000/api/v1/events/get_operas?q=gianni%20schicchi"
|
|
102
|
+
|
|
103
|
+
# Work ID search
|
|
104
|
+
curl "http://localhost:8000/api/v1/events/get_operas?q=12285"
|
|
105
|
+
|
|
106
|
+
# POST search
|
|
107
|
+
curl -X POST "http://localhost:8000/api/v1/events/search" \
|
|
108
|
+
-H "Content-Type: application/json" \
|
|
109
|
+
-d '{"work_id": 12285}'
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
**Response:**
|
|
113
|
+
```json
|
|
114
|
+
{
|
|
115
|
+
"query": "12285",
|
|
116
|
+
"total_results": 28,
|
|
117
|
+
"results": [
|
|
118
|
+
{
|
|
119
|
+
"title": "Gianni Schicchi",
|
|
120
|
+
"city": "Berlin",
|
|
121
|
+
"date": "2026-04-05T00:00:00",
|
|
122
|
+
"venue": "Deutsche Oper",
|
|
123
|
+
"detail_url": "https://bachtrack.com/opera-event/..."
|
|
124
|
+
}
|
|
125
|
+
]
|
|
126
|
+
}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Available Endpoints
|
|
130
|
+
|
|
131
|
+
- `GET /api/v1/events/get_operas?q=<search>` - Raw scraper output
|
|
132
|
+
- `GET /api/v1/events/search?work_id=<id>` - Search by work ID
|
|
133
|
+
- `GET /api/v1/events/search?q=<term>` - Freetext search
|
|
134
|
+
- `POST /api/v1/events/search` - JSON body search
|
|
135
|
+
- `GET /docs` - Interactive API documentation
|
|
136
|
+
- `GET /health` - Health check
|
|
137
|
+
|
|
138
|
+
## Testing
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
# Run scraper tests
|
|
142
|
+
python tests/test_scraper.py
|
|
143
|
+
|
|
144
|
+
# Run full API integration tests
|
|
145
|
+
pytest tests/test_api.py -v -s
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## Project Structure
|
|
149
|
+
|
|
150
|
+
```
|
|
151
|
+
scraper/scraper.py # Core scraping logic
|
|
152
|
+
api/
|
|
153
|
+
├── main.py # FastAPI app factory
|
|
154
|
+
├── routes/events.py # API endpoints
|
|
155
|
+
├── models/event.py # Pydantic models
|
|
156
|
+
└── services/opera_service.py # Service layer
|
|
157
|
+
tests/ # Unit and integration tests
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## License
|
|
161
|
+
|
|
162
|
+
MIT License - see [LICENSE](LICENSE) file for details
|
|
163
|
+
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
api/__init__.py,sha256=JXZBmzqjHGf7CG-X_KWvZ7GTm3EHP7KQd4R9kdPE3t4,54
|
|
2
|
+
api/main.py,sha256=ix_DYJjwo6AjBQKggpI6jYtHQx5ybXuSu4Gu4X8QzS0,842
|
|
3
|
+
bachtrackapi-0.1.0.dist-info/licenses/LICENSE,sha256=PWqPgTDR0XxvjIt0t6OPvOVmJRV3TPYA8EQ6e01Zf1Y,1067
|
|
4
|
+
scraper/__init__.py,sha256=w5P9XC4ieEl9eIwm0OSqKETDCeNoBK9KCsqVh8QkJFE,39
|
|
5
|
+
scraper/scraper.py,sha256=2Aj4GoBFsie-LpNMzekiWuNEpyMQuzhoJ0arKOGBIMo,10762
|
|
6
|
+
bachtrackapi-0.1.0.dist-info/METADATA,sha256=iaI9qk4VFc5PBfc-s5uQaeTxnscPgJGZLSSuVuKxlrQ,4572
|
|
7
|
+
bachtrackapi-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
8
|
+
bachtrackapi-0.1.0.dist-info/top_level.txt,sha256=mepoNsBT0riUPSxOkrXxheORLHj-ayrWU2hFcrweb40,12
|
|
9
|
+
bachtrackapi-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Clark Maio
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
scraper/__init__.py
ADDED
scraper/scraper.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
"""Bachtrack.com scraper for opera events."""
|
|
2
|
+
from typing import Dict, List, Optional, Union
from datetime import datetime
import re
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BachtrackScraper:
|
|
11
|
+
"""Scraper for Bachtrack opera events."""
|
|
12
|
+
|
|
13
|
+
BASE_URL = "https://bachtrack.com"
|
|
14
|
+
|
|
15
|
+
def __init__(self):
|
|
16
|
+
self.headers = {
|
|
17
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
def search_operas(self, search_input: Union[int, str]) -> List[Dict]:
|
|
21
|
+
"""
|
|
22
|
+
Search for opera events by work ID or freetext search.
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
search_input: Either an integer work ID (e.g., 12285) or a string search term (e.g., "Il barbiere di Siviglia")
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
List of opera event dictionaries with city, date, venue, title
|
|
29
|
+
"""
|
|
30
|
+
if isinstance(search_input, int):
|
|
31
|
+
# Search by work ID
|
|
32
|
+
search_url = f"{self.BASE_URL}/search-opera/work={search_input}"
|
|
33
|
+
else:
|
|
34
|
+
# Search by freetext
|
|
35
|
+
encoded_search = quote(search_input)
|
|
36
|
+
search_url = f"{self.BASE_URL}/search-opera/freetext={encoded_search}"
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
response = requests.get(search_url, headers=self.headers, timeout=10)
|
|
40
|
+
response.raise_for_status()
|
|
41
|
+
except requests.RequestException as e:
|
|
42
|
+
raise RuntimeError(f"Failed to fetch search results: {e}")
|
|
43
|
+
|
|
44
|
+
soup = BeautifulSoup(response.content, 'html.parser')
|
|
45
|
+
|
|
46
|
+
# Extract opera events from listing
|
|
47
|
+
events = []
|
|
48
|
+
li_elements = soup.find_all('li', {'data-type': 'nothing'})
|
|
49
|
+
|
|
50
|
+
for element in li_elements:
|
|
51
|
+
try:
|
|
52
|
+
event_list = self._parse_event_element(element)
|
|
53
|
+
events.extend(event_list)
|
|
54
|
+
except (AttributeError, ValueError) as e:
|
|
55
|
+
# Skip malformed elements
|
|
56
|
+
continue
|
|
57
|
+
|
|
58
|
+
return events
|
|
59
|
+
|
|
60
|
+
def _parse_event_element(self, element) -> List[Dict]:
|
|
61
|
+
"""
|
|
62
|
+
Parse individual event element and expand to multiple events for each date.
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
element: BeautifulSoup element representing an event listing
|
|
66
|
+
|
|
67
|
+
Returns:
|
|
68
|
+
List of dictionaries with event details, one per date
|
|
69
|
+
"""
|
|
70
|
+
try:
|
|
71
|
+
city = element.find('div', {'class': 'listing-ms-city'}).text.strip()
|
|
72
|
+
date_str = element.find('div', {'class': 'listing-ms-dates'}).text.strip()
|
|
73
|
+
venue = element.find('div', {'class': 'listing-ms-venue'}).text.strip()
|
|
74
|
+
|
|
75
|
+
# Extract title from listing-ms-main, removing Wish list button
|
|
76
|
+
main_div = element.find('div', {'class': 'listing-ms-main'})
|
|
77
|
+
title = main_div.text.strip()
|
|
78
|
+
# Remove the wish list placeholder text
|
|
79
|
+
title = title.replace('Wish list', '').strip()
|
|
80
|
+
|
|
81
|
+
# Get detail page URL
|
|
82
|
+
detail_link = element.find('a', {'class': 'listing-ms-right'})
|
|
83
|
+
detail_url = None
|
|
84
|
+
if detail_link and detail_link.get('href'):
|
|
85
|
+
detail_url = f"{self.BASE_URL}{detail_link['href']}"
|
|
86
|
+
|
|
87
|
+
# Parse dates and create one event per date
|
|
88
|
+
parsed_dates = self._parse_dates_list(date_str)
|
|
89
|
+
events = []
|
|
90
|
+
|
|
91
|
+
for parsed_date in parsed_dates:
|
|
92
|
+
events.append({
|
|
93
|
+
'title': title,
|
|
94
|
+
'city': city,
|
|
95
|
+
'date': parsed_date,
|
|
96
|
+
'venue': venue,
|
|
97
|
+
'detail_url': detail_url,
|
|
98
|
+
})
|
|
99
|
+
|
|
100
|
+
return events
|
|
101
|
+
except (AttributeError, TypeError):
|
|
102
|
+
return []
|
|
103
|
+
|
|
104
|
+
def _parse_dates_list(self, date_str: str) -> List[datetime]:
|
|
105
|
+
"""
|
|
106
|
+
Parse date string and return list of datetime objects.
|
|
107
|
+
|
|
108
|
+
Handles formats like:
|
|
109
|
+
- "Apr 05, 10, 15, 17" (same month, same year - inferred as current/next year)
|
|
110
|
+
- "Sun 3 May at 14:00" (full date with time)
|
|
111
|
+
- "Feb 05, 07, 11, 13, 15 mat, 17, 19, 21" (with qualifiers like 'mat')
|
|
112
|
+
|
|
113
|
+
Args:
|
|
114
|
+
date_str: Date string with comma-separated dates
|
|
115
|
+
|
|
116
|
+
Returns:
|
|
117
|
+
List of datetime objects
|
|
118
|
+
"""
|
|
119
|
+
# Clean up the date string
|
|
120
|
+
date_str = ' '.join(date_str.split())
|
|
121
|
+
date_parts = [d.strip() for d in date_str.split(',')]
|
|
122
|
+
|
|
123
|
+
parsed_dates = []
|
|
124
|
+
month = None
|
|
125
|
+
year = None
|
|
126
|
+
|
|
127
|
+
for part in date_parts:
|
|
128
|
+
try:
|
|
129
|
+
# Try to parse full date format: "Sun 3 May at 14:00"
|
|
130
|
+
if 'at' in part:
|
|
131
|
+
dt = self._parse_full_date(part)
|
|
132
|
+
else:
|
|
133
|
+
# Try abbreviated format like "May 03" or just "03"
|
|
134
|
+
dt = self._parse_abbreviated_date(part, month, year)
|
|
135
|
+
|
|
136
|
+
if dt:
|
|
137
|
+
parsed_dates.append(dt)
|
|
138
|
+
# Remember the month and year for subsequent dates
|
|
139
|
+
month = dt.month
|
|
140
|
+
year = dt.year
|
|
141
|
+
except (ValueError, AttributeError):
|
|
142
|
+
# Skip dates that can't be parsed
|
|
143
|
+
continue
|
|
144
|
+
|
|
145
|
+
return parsed_dates
|
|
146
|
+
|
|
147
|
+
def _parse_full_date(self, date_str: str) -> datetime:
|
|
148
|
+
"""
|
|
149
|
+
Parse full date format: "Sun 3 May at 14:00" or "Sunday 03 November 2024"
|
|
150
|
+
|
|
151
|
+
Args:
|
|
152
|
+
date_str: Full date string
|
|
153
|
+
|
|
154
|
+
Returns:
|
|
155
|
+
Parsed datetime object
|
|
156
|
+
"""
|
|
157
|
+
# Remove qualifiers and extra whitespace
|
|
158
|
+
date_str = date_str.replace(' mat', '').replace(' at ', ' ').strip()
|
|
159
|
+
|
|
160
|
+
# Try format with time: "Sun 3 May 14:00"
|
|
161
|
+
try:
|
|
162
|
+
dt = datetime.strptime(date_str, '%a %d %b %H:%M')
|
|
163
|
+
# Add current year
|
|
164
|
+
return dt.replace(year=datetime.now().year)
|
|
165
|
+
except ValueError:
|
|
166
|
+
pass
|
|
167
|
+
|
|
168
|
+
# Try format with weekday and time but no leading zero on day: "Sun 3 May 14:00"
|
|
169
|
+
try:
|
|
170
|
+
# Handle single digit day
|
|
171
|
+
parts = date_str.split()
|
|
172
|
+
if len(parts) >= 3:
|
|
173
|
+
day = int(parts[1])
|
|
174
|
+
month_str = parts[2]
|
|
175
|
+
time_str = parts[3] if len(parts) > 3 else "00:00"
|
|
176
|
+
dt = datetime.strptime(f"{day} {month_str} {time_str}", '%d %b %H:%M')
|
|
177
|
+
return dt.replace(year=datetime.now().year)
|
|
178
|
+
except (ValueError, IndexError):
|
|
179
|
+
pass
|
|
180
|
+
|
|
181
|
+
# Try format: "Sunday 03 November 2024"
|
|
182
|
+
try:
|
|
183
|
+
return datetime.strptime(date_str, '%A %d %B %Y')
|
|
184
|
+
except ValueError:
|
|
185
|
+
pass
|
|
186
|
+
|
|
187
|
+
# Try format: "Sun 3 May" (no year, no time)
|
|
188
|
+
try:
|
|
189
|
+
dt = datetime.strptime(date_str, '%a %d %b')
|
|
190
|
+
# Use current year
|
|
191
|
+
return dt.replace(year=datetime.now().year)
|
|
192
|
+
except ValueError:
|
|
193
|
+
pass
|
|
194
|
+
|
|
195
|
+
# Try format: "3 May" (no weekday, no year, no time)
|
|
196
|
+
try:
|
|
197
|
+
dt = datetime.strptime(date_str, '%d %b')
|
|
198
|
+
# Use current year
|
|
199
|
+
return dt.replace(year=datetime.now().year)
|
|
200
|
+
except ValueError:
|
|
201
|
+
pass
|
|
202
|
+
|
|
203
|
+
raise ValueError(f"Could not parse date: {date_str}")
|
|
204
|
+
|
|
205
|
+
def _parse_abbreviated_date(self, date_str: str, prev_month: int = None, prev_year: int = None) -> datetime:
|
|
206
|
+
"""
|
|
207
|
+
Parse abbreviated date format like "05" (day only) or "May 03" (month and day)
|
|
208
|
+
|
|
209
|
+
Args:
|
|
210
|
+
date_str: Abbreviated date string
|
|
211
|
+
prev_month: Month from previous date (for inferring month of day-only dates)
|
|
212
|
+
prev_year: Year from previous date (for inferring year)
|
|
213
|
+
|
|
214
|
+
Returns:
|
|
215
|
+
Parsed datetime object
|
|
216
|
+
"""
|
|
217
|
+
date_str = date_str.replace(' mat', '').strip()
|
|
218
|
+
|
|
219
|
+
# Try "May 05" format
|
|
220
|
+
try:
|
|
221
|
+
dt = datetime.strptime(date_str, '%b %d')
|
|
222
|
+
return dt.replace(year=prev_year or datetime.now().year)
|
|
223
|
+
except ValueError:
|
|
224
|
+
pass
|
|
225
|
+
|
|
226
|
+
# Try "May 5" format (no leading zero)
|
|
227
|
+
try:
|
|
228
|
+
dt = datetime.strptime(date_str, '%b %e').replace(day=int(date_str.split()[-1]))
|
|
229
|
+
return dt.replace(year=prev_year or datetime.now().year)
|
|
230
|
+
except (ValueError, IndexError):
|
|
231
|
+
pass
|
|
232
|
+
|
|
233
|
+
# Try "05" or "5" format (day only) - use previous month/year
|
|
234
|
+
try:
|
|
235
|
+
day = int(date_str)
|
|
236
|
+
if prev_month and prev_year:
|
|
237
|
+
return datetime(prev_year, prev_month, day)
|
|
238
|
+
else:
|
|
239
|
+
# Fallback to current month/year
|
|
240
|
+
today = datetime.now()
|
|
241
|
+
return datetime(today.year, today.month, day)
|
|
242
|
+
except ValueError:
|
|
243
|
+
pass
|
|
244
|
+
|
|
245
|
+
raise ValueError(f"Could not parse abbreviated date: {date_str}")
|
|
246
|
+
|
|
247
|
+
def _parse_date(self, date_str: str) -> datetime:
|
|
248
|
+
"""
|
|
249
|
+
Parse date string format: "Sunday 03 November 2024" -> datetime object
|
|
250
|
+
|
|
251
|
+
Args:
|
|
252
|
+
date_str: Date string in format "Day DD Month YYYY"
|
|
253
|
+
|
|
254
|
+
Returns:
|
|
255
|
+
Parsed datetime object
|
|
256
|
+
"""
|
|
257
|
+
# Handle multi-line date strings (strip extra whitespace)
|
|
258
|
+
date_str = ' '.join(date_str.split())
|
|
259
|
+
return datetime.strptime(date_str, '%A %d %B %Y')
|
|
260
|
+
|
|
261
|
+
def get_event_details(self, detail_url: str) -> Dict:
|
|
262
|
+
"""
|
|
263
|
+
Fetch additional event details from event detail page.
|
|
264
|
+
|
|
265
|
+
Args:
|
|
266
|
+
detail_url: URL of event detail page
|
|
267
|
+
|
|
268
|
+
Returns:
|
|
269
|
+
Dictionary with address and additional metadata
|
|
270
|
+
"""
|
|
271
|
+
try:
|
|
272
|
+
response = requests.get(detail_url, headers=self.headers, timeout=10)
|
|
273
|
+
response.raise_for_status()
|
|
274
|
+
except requests.RequestException as e:
|
|
275
|
+
raise RuntimeError(f"Failed to fetch event details: {e}")
|
|
276
|
+
|
|
277
|
+
soup = BeautifulSoup(response.content, 'html.parser')
|
|
278
|
+
|
|
279
|
+
details = {}
|
|
280
|
+
|
|
281
|
+
# Extract address
|
|
282
|
+
address_span = soup.find('span', {'class': 'listing-address'})
|
|
283
|
+
if address_span:
|
|
284
|
+
details['address'] = address_span.text.strip()
|
|
285
|
+
|
|
286
|
+
# Extract table data if present
|
|
287
|
+
table_tbody = soup.find('tbody', {'class': 'plassmap_table'})
|
|
288
|
+
if table_tbody:
|
|
289
|
+
rows = table_tbody.find_all('tr')
|
|
290
|
+
for row in rows:
|
|
291
|
+
cells = row.find_all('td')
|
|
292
|
+
if len(cells) >= 2:
|
|
293
|
+
key = cells[0].text.strip()
|
|
294
|
+
value = cells[1].text.strip()
|
|
295
|
+
details[key.lower()] = value
|
|
296
|
+
|
|
297
|
+
return details
|