bachtrackapi-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
api/__init__.py ADDED
@@ -0,0 +1 @@
+ """FastAPI backend for Bachtrack opera events API."""
api/main.py ADDED
@@ -0,0 +1,35 @@
+ """FastAPI application factory."""
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ from api.routes.events import router as events_router
+
+
+ def create_app() -> FastAPI:
+     """Create and configure FastAPI application."""
+     app = FastAPI(
+         title="BachtrackAPI",
+         description="API to search and retrieve opera events from Bachtrack.com",
+         version="0.1.0",
+     )
+
+     # Add CORS middleware
+     app.add_middleware(
+         CORSMiddleware,
+         allow_origins=["*"],
+         allow_credentials=True,
+         allow_methods=["*"],
+         allow_headers=["*"],
+     )
+
+     # Include routers
+     app.include_router(events_router)
+
+     # Health check endpoint
+     @app.get("/health")
+     async def health_check():
+         return {"status": "ok"}
+
+     return app
+
+
+ app = create_app()
bachtrackapi-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,163 @@
+ Metadata-Version: 2.4
+ Name: bachtrackapi
+ Version: 0.1.0
+ Summary: A Python web scraper and REST API for extracting classical music opera events from Bachtrack.com
+ Author-email: Clark Maio <clark@example.com>
+ License: MIT
+ Project-URL: Homepage, https://github.com/clarkmaio/bachtrackapi
+ Project-URL: Repository, https://github.com/clarkmaio/bachtrackapi.git
+ Project-URL: Documentation, https://github.com/clarkmaio/bachtrackapi#readme
+ Project-URL: Issues, https://github.com/clarkmaio/bachtrackapi/issues
+ Keywords: scraper,api,opera,classical-music,fastapi,bachtrack
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Topic :: Internet :: WWW/HTTP
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: fastapi==0.104.1
+ Requires-Dist: uvicorn[standard]==0.24.0
+ Requires-Dist: pydantic==2.5.0
+ Requires-Dist: pydantic-settings==2.1.0
+ Requires-Dist: requests==2.31.0
+ Requires-Dist: beautifulsoup4==4.12.2
+ Requires-Dist: selenium==4.15.2
+ Requires-Dist: webdriver-manager==4.0.1
+ Requires-Dist: python-dotenv==1.0.0
+ Provides-Extra: dev
+ Requires-Dist: pytest==7.4.3; extra == "dev"
+ Requires-Dist: pytest-asyncio==0.21.1; extra == "dev"
+ Requires-Dist: black==23.12.0; extra == "dev"
+ Requires-Dist: flake8==6.1.0; extra == "dev"
+ Requires-Dist: mypy==1.7.0; extra == "dev"
+ Dynamic: license-file
+ Dynamic: requires-python
+
+ # BachtrackAPI
+
+ A Python web scraper and REST API for extracting classical music opera events from [Bachtrack.com](https://bachtrack.com/).
+
+ ## Overview
+
+ BachtrackAPI provides two complementary ways to access opera event data:
+
+ 1. **Scraper Module** - Direct web scraping of Bachtrack opera listings
+ 2. **FastAPI Backend** - RESTful API endpoints for searching and filtering events
+
+ Search by work ID (e.g., `12285` for Gianni Schicchi) or freetext (e.g., `"La Traviata"`).
+
+ ## Installation
+
+ ```bash
+ pip install -r requirements.txt
+ ```
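+
+ Alternatively, the published wheel can be installed directly from the package registry:
+
+ ```bash
+ pip install bachtrackapi
+ ```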
+
+ ## Quick Start
+
+ ### 1. Using the Scraper Directly
+
+ ```python
+ from scraper.scraper import BachtrackScraper
+
+ scraper = BachtrackScraper()
+
+ # Search by work ID
+ events = scraper.search_operas(12285)  # Gianni Schicchi
+ print(f"Found {len(events)} events")
+
+ # Search by freetext
+ events = scraper.search_operas("La Traviata")
+ for event in events:
+     print(f"{event['title']} - {event['city']} @ {event['venue']}")
+ ```
+
+ **Output:**
+ ```
+ Found 28 events
+ Gianni Schicchi - Berlin @ Deutsche Oper
+ Gianni Schicchi - Winterthur @ Stadttheater Winterthur
+ ...
+ ```
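+
+ Each event also carries a `detail_url`, and `get_event_details` fetches extra metadata (such as the venue address) from that page. A minimal sketch, assuming the search above returned at least one event:
+
+ ```python
+ # Look up additional metadata for the first event found
+ first = events[0]
+ if first["detail_url"]:
+     details = scraper.get_event_details(first["detail_url"])
+     print(details.get("address"))
+ ```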
+
+ ### 2. Using the FastAPI Backend
+
+ Start the server:
+ ```bash
+ uvicorn api.main:app --reload
+ ```
+
+ **Example API Requests:**
+
+ ```bash
+ # Freetext search
+ curl "http://localhost:8000/api/v1/events/get_operas?q=gianni%20schicchi"
+
+ # Work ID search
+ curl "http://localhost:8000/api/v1/events/get_operas?q=12285"
+
+ # POST search
+ curl -X POST "http://localhost:8000/api/v1/events/search" \
+      -H "Content-Type: application/json" \
+      -d '{"work_id": 12285}'
+ ```
+
+ **Response:**
+ ```json
+ {
+   "query": "12285",
+   "total_results": 28,
+   "results": [
+     {
+       "title": "Gianni Schicchi",
+       "city": "Berlin",
+       "date": "2026-04-05T00:00:00",
+       "venue": "Deutsche Oper",
+       "detail_url": "https://bachtrack.com/opera-event/..."
+     }
+   ]
+ }
+ ```
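+
+ The same POST search can be issued from Python; a minimal sketch using `requests`, assuming the server is running locally on port 8000:
+
+ ```python
+ import requests
+
+ resp = requests.post(
+     "http://localhost:8000/api/v1/events/search",
+     json={"work_id": 12285},
+     timeout=10,
+ )
+ resp.raise_for_status()
+ print(resp.json()["total_results"], "events found")
+ ```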
+
+ ## Available Endpoints
+
+ - `GET /api/v1/events/get_operas?q=<search>` - Raw scraper output
+ - `GET /api/v1/events/search?work_id=<id>` - Search by work ID
+ - `GET /api/v1/events/search?q=<term>` - Freetext search
+ - `POST /api/v1/events/search` - JSON body search
+ - `GET /docs` - Interactive API documentation
+ - `GET /health` - Health check
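+
+ The GET variants of `/search` take the same parameters as query strings; for example (same local-server assumption as above):
+
+ ```bash
+ curl "http://localhost:8000/api/v1/events/search?work_id=12285"
+ curl "http://localhost:8000/api/v1/events/search?q=la%20traviata"
+ ```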
+
+ ## Testing
+
+ ```bash
+ # Run scraper tests
+ python tests/test_scraper.py
+
+ # Run full API integration tests
+ pytest tests/test_api.py -v -s
+ ```
+
+ ## Project Structure
+
+ ```
+ scraper/scraper.py                # Core scraping logic
+ api/
+ ├── main.py                       # FastAPI app factory
+ ├── routes/events.py              # API endpoints
+ ├── models/event.py               # Pydantic models
+ └── services/opera_service.py     # Service layer
+ tests/                            # Unit and integration tests
+ ```
+
+ ## License
+
+ MIT License - see [LICENSE](LICENSE) file for details
+
bachtrackapi-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ api/__init__.py,sha256=JXZBmzqjHGf7CG-X_KWvZ7GTm3EHP7KQd4R9kdPE3t4,54
+ api/main.py,sha256=ix_DYJjwo6AjBQKggpI6jYtHQx5ybXuSu4Gu4X8QzS0,842
+ bachtrackapi-0.1.0.dist-info/licenses/LICENSE,sha256=PWqPgTDR0XxvjIt0t6OPvOVmJRV3TPYA8EQ6e01Zf1Y,1067
+ scraper/__init__.py,sha256=w5P9XC4ieEl9eIwm0OSqKETDCeNoBK9KCsqVh8QkJFE,39
+ scraper/scraper.py,sha256=2Aj4GoBFsie-LpNMzekiWuNEpyMQuzhoJ0arKOGBIMo,10762
+ bachtrackapi-0.1.0.dist-info/METADATA,sha256=iaI9qk4VFc5PBfc-s5uQaeTxnscPgJGZLSSuVuKxlrQ,4572
+ bachtrackapi-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ bachtrackapi-0.1.0.dist-info/top_level.txt,sha256=mepoNsBT0riUPSxOkrXxheORLHj-ayrWU2hFcrweb40,12
+ bachtrackapi-0.1.0.dist-info/RECORD,,
bachtrackapi-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.10.2)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
bachtrackapi-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Clark Maio
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
bachtrackapi-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
+ api
+ scraper
scraper/__init__.py ADDED
@@ -0,0 +1,3 @@
+
+
+ from .scraper import BachtrackScraper
scraper/scraper.py ADDED
@@ -0,0 +1,297 @@
+ """Bachtrack.com scraper for opera events."""
+ from typing import List, Dict, Optional, Union
+ from datetime import datetime
+ import requests
+ from bs4 import BeautifulSoup
+ from urllib.parse import quote
+
+
+ class BachtrackScraper:
+     """Scraper for Bachtrack opera events."""
+
+     BASE_URL = "https://bachtrack.com"
+
+     def __init__(self):
+         self.headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+         }
+
+     def search_operas(self, search_input: Union[int, str]) -> List[Dict]:
+         """
+         Search for opera events by work ID or freetext search.
+
+         Args:
+             search_input: Either an integer work ID (e.g., 12285) or a string search term (e.g., "Il barbiere di Siviglia")
+
+         Returns:
+             List of opera event dictionaries with city, date, venue, title
+         """
+         if isinstance(search_input, int):
+             # Search by work ID
+             search_url = f"{self.BASE_URL}/search-opera/work={search_input}"
+         else:
+             # Search by freetext
+             encoded_search = quote(search_input)
+             search_url = f"{self.BASE_URL}/search-opera/freetext={encoded_search}"
+
+         try:
+             response = requests.get(search_url, headers=self.headers, timeout=10)
+             response.raise_for_status()
+         except requests.RequestException as e:
+             raise RuntimeError(f"Failed to fetch search results: {e}")
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Extract opera events from listing
+         events = []
+         li_elements = soup.find_all('li', {'data-type': 'nothing'})
+
+         for element in li_elements:
+             try:
+                 event_list = self._parse_event_element(element)
+                 events.extend(event_list)
+             except (AttributeError, ValueError):
+                 # Skip malformed elements
+                 continue
+
+         return events
+
+     def _parse_event_element(self, element) -> List[Dict]:
+         """
+         Parse an individual event element and expand it into one event per date.
+
+         Args:
+             element: BeautifulSoup element representing an event listing
+
+         Returns:
+             List of dictionaries with event details, one per date
+         """
+         try:
+             city = element.find('div', {'class': 'listing-ms-city'}).text.strip()
+             date_str = element.find('div', {'class': 'listing-ms-dates'}).text.strip()
+             venue = element.find('div', {'class': 'listing-ms-venue'}).text.strip()
+
+             # Extract title from listing-ms-main, removing the Wish list button
+             main_div = element.find('div', {'class': 'listing-ms-main'})
+             title = main_div.text.strip()
+             # Remove the wish list placeholder text
+             title = title.replace('Wish list', '').strip()
+
+             # Get detail page URL
+             detail_link = element.find('a', {'class': 'listing-ms-right'})
+             detail_url = None
+             if detail_link and detail_link.get('href'):
+                 detail_url = f"{self.BASE_URL}{detail_link['href']}"
+
+             # Parse dates and create one event per date
+             parsed_dates = self._parse_dates_list(date_str)
+             events = []
+
+             for parsed_date in parsed_dates:
+                 events.append({
+                     'title': title,
+                     'city': city,
+                     'date': parsed_date,
+                     'venue': venue,
+                     'detail_url': detail_url,
+                 })
+
+             return events
+         except (AttributeError, TypeError):
+             return []
+
+     def _parse_dates_list(self, date_str: str) -> List[datetime]:
+         """
+         Parse date string and return list of datetime objects.
+
+         Handles formats like:
+         - "Apr 05, 10, 15, 17" (same month; the year is inferred as the current year)
+         - "Sun 3 May at 14:00" (full date with time)
+         - "Feb 05, 07, 11, 13, 15 mat, 17, 19, 21" (with qualifiers like 'mat')
+
+         Args:
+             date_str: Date string with comma-separated dates
+
+         Returns:
+             List of datetime objects
+         """
+         # Clean up the date string
+         date_str = ' '.join(date_str.split())
+         date_parts = [d.strip() for d in date_str.split(',')]
+
+         parsed_dates = []
+         month = None
+         year = None
+
+         for part in date_parts:
+             try:
+                 # Try to parse full date format: "Sun 3 May at 14:00"
+                 if 'at' in part:
+                     dt = self._parse_full_date(part)
+                 else:
+                     # Try abbreviated format like "May 03" or just "03"
+                     dt = self._parse_abbreviated_date(part, month, year)
+
+                 if dt:
+                     parsed_dates.append(dt)
+                     # Remember the month and year for subsequent dates
+                     month = dt.month
+                     year = dt.year
+             except (ValueError, AttributeError):
+                 # Skip dates that can't be parsed
+                 continue
+
+         return parsed_dates
+
+     def _parse_full_date(self, date_str: str) -> datetime:
+         """
+         Parse full date format: "Sun 3 May at 14:00" or "Sunday 03 November 2024"
+
+         Args:
+             date_str: Full date string
+
+         Returns:
+             Parsed datetime object
+         """
+         # Remove qualifiers and extra whitespace
+         date_str = date_str.replace(' mat', '').replace(' at ', ' ').strip()
+
+         # Try format with abbreviated weekday and time: "Sun 3 May 14:00"
+         try:
+             dt = datetime.strptime(date_str, '%a %d %b %H:%M')
+             # Add current year
+             return dt.replace(year=datetime.now().year)
+         except ValueError:
+             pass
+
+         # Fall back to ignoring the weekday token; this also covers full
+         # weekday names and a missing time (defaults to 00:00)
+         try:
+             parts = date_str.split()
+             if len(parts) >= 3:
+                 day = int(parts[1])
+                 month_str = parts[2]
+                 time_str = parts[3] if len(parts) > 3 else "00:00"
+                 dt = datetime.strptime(f"{day} {month_str} {time_str}", '%d %b %H:%M')
+                 return dt.replace(year=datetime.now().year)
+         except (ValueError, IndexError):
+             pass
+
+         # Try format: "Sunday 03 November 2024"
+         try:
+             return datetime.strptime(date_str, '%A %d %B %Y')
+         except ValueError:
+             pass
+
+         # Try format: "Sun 3 May" (no year, no time)
+         try:
+             dt = datetime.strptime(date_str, '%a %d %b')
+             # Use current year
+             return dt.replace(year=datetime.now().year)
+         except ValueError:
+             pass
+
+         # Try format: "3 May" (no weekday, no year, no time)
+         try:
+             dt = datetime.strptime(date_str, '%d %b')
+             # Use current year
+             return dt.replace(year=datetime.now().year)
+         except ValueError:
+             pass
+
+         raise ValueError(f"Could not parse date: {date_str}")
+
+     def _parse_abbreviated_date(self, date_str: str, prev_month: Optional[int] = None, prev_year: Optional[int] = None) -> datetime:
+         """
+         Parse abbreviated date format like "05" (day only) or "May 03" (month and day)
+
+         Args:
+             date_str: Abbreviated date string
+             prev_month: Month from previous date (for inferring month of day-only dates)
+             prev_year: Year from previous date (for inferring year)
+
+         Returns:
+             Parsed datetime object
+         """
+         date_str = date_str.replace(' mat', '').strip()
+
+         # Try "May 05" / "May 5" format (%d accepts days without a leading zero)
+         try:
+             dt = datetime.strptime(date_str, '%b %d')
+             return dt.replace(year=prev_year or datetime.now().year)
+         except ValueError:
+             pass
+
+         # Try "05" or "5" format (day only) - use previous month/year
+         try:
+             day = int(date_str)
+             if prev_month and prev_year:
+                 return datetime(prev_year, prev_month, day)
+             else:
+                 # Fallback to current month/year
+                 today = datetime.now()
+                 return datetime(today.year, today.month, day)
+         except ValueError:
+             pass
+
+         raise ValueError(f"Could not parse abbreviated date: {date_str}")
+
+     def _parse_date(self, date_str: str) -> datetime:
+         """
+         Parse date string format: "Sunday 03 November 2024" -> datetime object
+
+         Args:
+             date_str: Date string in format "Day DD Month YYYY"
+
+         Returns:
+             Parsed datetime object
+         """
+         # Handle multi-line date strings (strip extra whitespace)
+         date_str = ' '.join(date_str.split())
+         return datetime.strptime(date_str, '%A %d %B %Y')
+
+     def get_event_details(self, detail_url: str) -> Dict:
+         """
+         Fetch additional event details from the event detail page.
+
+         Args:
+             detail_url: URL of event detail page
+
+         Returns:
+             Dictionary with address and additional metadata
+         """
+         try:
+             response = requests.get(detail_url, headers=self.headers, timeout=10)
+             response.raise_for_status()
+         except requests.RequestException as e:
+             raise RuntimeError(f"Failed to fetch event details: {e}")
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         details = {}
+
+         # Extract address
+         address_span = soup.find('span', {'class': 'listing-address'})
+         if address_span:
+             details['address'] = address_span.text.strip()
+
+         # Extract table data if present
+         table_tbody = soup.find('tbody', {'class': 'plassmap_table'})
+         if table_tbody:
+             rows = table_tbody.find_all('tr')
+             for row in rows:
+                 cells = row.find_all('td')
+                 if len(cells) >= 2:
+                     key = cells[0].text.strip()
+                     value = cells[1].text.strip()
+                     details[key.lower()] = value
+
+         return details