lyrics-transcriber 0.30.0__py3-none-any.whl → 0.30.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
@@ -1,151 +1,216 @@
+ from dataclasses import dataclass
  import requests
  import time
  import os
- import json
- from .base import BaseTranscriber
+ from typing import Dict, Optional, Any, Union
+ from pathlib import Path
+ from .base_transcriber import BaseTranscriber, TranscriptionData, LyricsSegment, Word, TranscriptionError


- class AudioShakeTranscriber(BaseTranscriber):
-     """Transcription service using AudioShake's API."""
-
-     def __init__(self, api_token=None, logger=None, output_prefix=None):
-         super().__init__(logger)
-         self.api_token = api_token or os.getenv("AUDIOSHAKE_API_TOKEN")
-         self.base_url = "https://groovy.audioshake.ai"
-         self.output_prefix = output_prefix
-
-         if not self.api_token:
-             raise ValueError("AudioShake API token must be provided either directly or via AUDIOSHAKE_API_TOKEN env var")
-
-     def get_name(self) -> str:
-         return "AudioShake"
-
-     def transcribe(self, audio_filepath: str) -> dict:
-         """
-         Transcribe an audio file using AudioShake API.
-
-         Args:
-             audio_filepath: Path to the audio file to transcribe
-
-         Returns:
-             Dict containing:
-             - segments: List of segments with start/end times and word-level data
-             - text: Full text transcription
-             - metadata: Dict of additional info
-         """
-         self.logger.info(f"Starting transcription for {audio_filepath} using AudioShake API")
-
-         # Start job and get results
-         job_id = self.start_transcription(audio_filepath)
-         result = self.get_transcription_result(job_id)
-
-         # Add metadata to the result
-         result["metadata"] = {
-             "service": self.get_name(),
-             "language": "en",  # AudioShake currently only supports English
-         }
-
-         return result
+ @dataclass
+ class AudioShakeConfig:
+     """Configuration for AudioShake transcription service."""

-     def start_transcription(self, audio_filepath: str) -> str:
-         """Starts the transcription job and returns the job ID."""
-         # Step 1: Upload the audio file
-         asset_id = self._upload_file(audio_filepath)
-         self.logger.info(f"File uploaded successfully. Asset ID: {asset_id}")
+     api_token: Optional[str] = None
+     base_url: str = "https://groovy.audioshake.ai"
+     output_prefix: Optional[str] = None
+     timeout_minutes: int = 10  # Added timeout configuration

-         # Step 2: Create a job for transcription and alignment
-         job_id = self._create_job(asset_id)
-         self.logger.info(f"Job created successfully. Job ID: {job_id}")

-         return job_id
+ class AudioShakeAPI:
+     """Handles direct API interactions with AudioShake."""

-     def get_transcription_result(self, job_id: str) -> dict:
-         """Gets the results for a previously started job."""
-         self.logger.info(f"Getting results for job ID: {job_id}")
+     def __init__(self, config: AudioShakeConfig, logger):
+         self.config = config
+         self.logger = logger

-         # Wait for job completion and get results
-         result = self._get_job_result(job_id)
-         self.logger.info(f"Job completed. Processing results...")
+     def _validate_config(self) -> None:
+         """Validate API configuration."""
+         if not self.config.api_token:
+             raise ValueError("AudioShake API token must be provided")

-         # Process and return in standard format
-         return self._process_result(result)
+     def _get_headers(self) -> Dict[str, str]:
+         """Get headers for API requests."""
+         self._validate_config()  # Validate before making any API calls
+         return {"Authorization": f"Bearer {self.config.api_token}", "Content-Type": "application/json"}

-     def _upload_file(self, filepath):
+     def upload_file(self, filepath: str) -> str:
+         """Upload audio file and return asset ID."""
          self.logger.info(f"Uploading {filepath} to AudioShake")
-         url = f"{self.base_url}/upload"
-         headers = {"Authorization": f"Bearer {self.api_token}"}
+         self._validate_config()  # Validate before making API call
+
+         url = f"{self.config.base_url}/upload"
          with open(filepath, "rb") as file:
              files = {"file": (os.path.basename(filepath), file)}
-             response = requests.post(url, headers=headers, files=files)
-
-         self.logger.info(f"Upload response status code: {response.status_code}")
-         self.logger.info(f"Upload response content: {response.text}")
+             response = requests.post(url, headers={"Authorization": self._get_headers()["Authorization"]}, files=files)

+         self.logger.debug(f"Upload response: {response.status_code} - {response.text}")
          response.raise_for_status()
          return response.json()["id"]

-     def _create_job(self, asset_id):
+     def create_job(self, asset_id: str) -> str:
+         """Create transcription job and return job ID."""
          self.logger.info(f"Creating job for asset {asset_id}")
-         url = f"{self.base_url}/job/"
-         headers = {"Authorization": f"Bearer {self.api_token}", "Content-Type": "application/json"}
+
+         url = f"{self.config.base_url}/job/"
          data = {
              "metadata": {"format": "json", "name": "alignment", "language": "en"},
              "callbackUrl": "https://example.com/webhook/alignment",
              "assetId": asset_id,
          }
-         response = requests.post(url, headers=headers, json=data)
+         response = requests.post(url, headers=self._get_headers(), json=data)
          response.raise_for_status()
          return response.json()["job"]["id"]

-     def _get_job_result(self, job_id):
+     def wait_for_job_result(self, job_id: str) -> Dict[str, Any]:
+         """Poll for job completion and return results."""
          self.logger.info(f"Getting job result for job {job_id}")
-         url = f"{self.base_url}/job/{job_id}"
-         headers = {"Authorization": f"Bearer {self.api_token}", "Content-Type": "application/json"}
+
+         url = f"{self.config.base_url}/job/{job_id}"
+         start_time = time.time()
+         last_status_log = start_time
+         timeout_seconds = self.config.timeout_minutes * 60
+
          while True:
-             response = requests.get(url, headers=headers)
+             current_time = time.time()
+             elapsed_time = current_time - start_time
+
+             # Check for timeout
+             if elapsed_time > timeout_seconds:
+                 raise TranscriptionError(f"Transcription timed out after {self.config.timeout_minutes} minutes")
+
+             # Log status every minute
+             if current_time - last_status_log >= 60:
+                 self.logger.info(f"Still waiting for transcription... " f"Elapsed time: {int(elapsed_time/60)} minutes")
+                 last_status_log = current_time
+
+             response = requests.get(url, headers=self._get_headers())
              response.raise_for_status()
              job_data = response.json()["job"]
+
             if job_data["status"] == "completed":
                  return job_data
             elif job_data["status"] == "failed":
-                 raise Exception("Job failed")
-             time.sleep(5)  # Wait 5 seconds before checking again
+                 raise TranscriptionError(f"Job failed: {job_data.get('error', 'Unknown error')}")

-     def _process_result(self, job_data):
-         self.logger.debug(f"Processing result for job {job_data['id']}")
-         self.logger.debug(f"Job data: {json.dumps(job_data, indent=2)}")
+             time.sleep(5)  # Wait before next poll

-         output_assets = job_data.get("outputAssets", [])
-         self.logger.debug(f"Output assets: {output_assets}")

-         output_asset = next((asset for asset in output_assets if asset["name"] == "alignment.json"), None)
+ class AudioShakeTranscriber(BaseTranscriber):
+     """Transcription service using AudioShake's API."""

-         if not output_asset:
-             self.logger.error("'alignment.json' found in job results")
-             self.logger.error(f"Available output assets: {[asset['name'] for asset in output_assets]}")
-             raise Exception("Required output not found in job results")
+     def __init__(
+         self,
+         cache_dir: Union[str, Path],
+         config: Optional[AudioShakeConfig] = None,
+         logger: Optional[Any] = None,
+         api_client: Optional[AudioShakeAPI] = None,
+     ):
+         """Initialize AudioShake transcriber."""
+         super().__init__(cache_dir=cache_dir, logger=logger)
+         self.config = config or AudioShakeConfig(api_token=os.getenv("AUDIOSHAKE_API_TOKEN"))
+         self.api = api_client or AudioShakeAPI(self.config, self.logger)

-         transcription_url = output_asset["link"]
-         self.logger.debug(f"Output URL: {transcription_url}")
+     def get_name(self) -> str:
+         return "AudioShake"

-         response = requests.get(transcription_url)
-         response.raise_for_status()
-         transcription_data = response.json()
-         self.logger.debug(f"Output data: {json.dumps(transcription_data, indent=2)}")
+     def _perform_transcription(self, audio_filepath: str) -> TranscriptionData:
+         """Actually perform the transcription using AudioShake API."""
+         self.logger.debug(f"Entering _perform_transcription() for {audio_filepath}")
+         self.logger.info(f"Starting transcription for {audio_filepath}")
+
+         try:
+             # Start job and get results
+             self.logger.debug("Calling start_transcription()")
+             job_id = self.start_transcription(audio_filepath)
+             self.logger.debug(f"Got job_id: {job_id}")
+
+             self.logger.debug("Calling get_transcription_result()")
+             result = self.get_transcription_result(job_id)
+             self.logger.debug("Got transcription result")

-         transcription_data = {"segments": transcription_data.get("lines", []), "text": transcription_data.get("text", "")}
+             return result
+         except Exception as e:
+             self.logger.error(f"Error in _perform_transcription: {str(e)}")
+             raise

-         # Ensure each segment has the required fields
-         for segment in transcription_data["segments"]:
-             if "words" not in segment:
-                 segment["words"] = []
-             if "text" not in segment:
-                 segment["text"] = " ".join(word["text"] for word in segment["words"])
+     def start_transcription(self, audio_filepath: str) -> str:
+         """Starts the transcription job and returns the job ID."""
+         self.logger.debug(f"Entering start_transcription() for {audio_filepath}")
+
+         # Upload file and create job
+         asset_id = self.api.upload_file(audio_filepath)
+         self.logger.debug(f"File uploaded successfully. Asset ID: {asset_id}")
+
+         job_id = self.api.create_job(asset_id)
+         self.logger.debug(f"Job created successfully. Job ID: {job_id}")
+
+         return job_id
+
+     def get_transcription_result(self, job_id: str) -> Dict[str, Any]:
+         """Gets the raw results for a previously started job."""
+         self.logger.debug(f"Entering get_transcription_result() for job ID: {job_id}")

-         transcription_data["output_filename"] = self.get_output_filename(" (AudioShake)")
+         # Wait for job completion
+         job_data = self.api.wait_for_job_result(job_id)
+         self.logger.debug("Job completed. Getting results...")

-         return transcription_data
+         output_asset = next((asset for asset in job_data.get("outputAssets", []) if asset["name"] == "alignment.json"), None)
+         if not output_asset:
+             raise TranscriptionError("Required output not found in job results")
+
+         # Fetch transcription data
+         response = requests.get(output_asset["link"])
+         response.raise_for_status()

-     def get_output_filename(self, suffix):
-         """Generate consistent filename with (Purpose) suffix pattern"""
-         return f"{self.output_prefix}{suffix}"
+         # Return combined raw data
+         raw_data = {"job_data": job_data, "transcription": response.json()}
+
+         self.logger.debug("Raw results retrieved successfully")
+         return raw_data
+
+     def _convert_result_format(self, raw_data: Dict[str, Any]) -> TranscriptionData:
+         """Process raw Audioshake API response into standard format."""
+         self.logger.debug(f"Processing result for job {raw_data['job_data']['id']}")
+
+         transcription_data = raw_data["transcription"]
+         job_data = raw_data["job_data"]
+
+         segments = []
+         all_words = []  # Collect all words across segments
+
+         for line in transcription_data.get("lines", []):
+             words = [
+                 Word(
+                     text=word["text"],
+                     start_time=word.get("start", 0.0),
+                     end_time=word.get("end", 0.0),
+                 )
+                 for word in line.get("words", [])
+             ]
+             all_words.extend(words)  # Add words to flat list
+
+             segments.append(
+                 LyricsSegment(
+                     text=line.get("text", " ".join(w.text for w in words)),
+                     words=words,
+                     start_time=min((w.start_time for w in words), default=0.0),
+                     end_time=max((w.end_time for w in words), default=0.0),
+                 )
+             )
+
+         return TranscriptionData(
+             text=transcription_data.get("text", ""),
+             words=all_words,
+             segments=segments,
+             source=self.get_name(),
+             metadata={
+                 "language": transcription_data.get("metadata", {}).get("language"),
+                 "job_id": job_data["id"],
+                 "duration": job_data.get("statusInfo", {}).get("duration"),
+             },
+         )
+
+     def get_output_filename(self, suffix: str) -> str:
+         """Generate consistent filename with (Purpose) suffix pattern."""
+         return f"{self.config.output_prefix}{suffix}"
@@ -0,0 +1,186 @@
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import Dict, Any, Optional, Protocol, List, Union
+ from pathlib import Path
+ import logging
+ import os
+ import json
+ import hashlib
+ from lyrics_transcriber.lyrics.base_lyrics_provider import LyricsSegment, Word
+
+
+ @dataclass
+ class TranscriptionData:
+     """Structured container for transcription results."""
+
+     segments: List[LyricsSegment]
+     words: List[Word]
+     text: str
+     source: str  # e.g., "whisper", "audioshake"
+     metadata: Optional[Dict[str, Any]] = None
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert TranscriptionData to dictionary for JSON serialization."""
+         return {
+             "segments": [segment.to_dict() for segment in self.segments],
+             "words": [word.to_dict() for word in self.words],
+             "text": self.text,
+             "source": self.source,
+             "metadata": self.metadata,
+         }
+
+
+ @dataclass
+ class TranscriptionResult:
+     name: str
+     priority: int
+     result: TranscriptionData
+
+
+ class LoggerProtocol(Protocol):
+     """Protocol for logger interface."""
+
+     def debug(self, msg: str) -> None: ...
+     def info(self, msg: str) -> None: ...
+     def warning(self, msg: str) -> None: ...
+     def error(self, msg: str) -> None: ...
+
+
+ class TranscriptionError(Exception):
+     """Base exception for transcription errors."""
+
+     pass
+
+
+ class BaseTranscriber(ABC):
+     """Base class for all transcription services."""
+
+     def __init__(self, cache_dir: Union[str, Path], logger: Optional[LoggerProtocol] = None):
+         """
+         Initialize transcriber with cache directory and logger.
+
+         Args:
+             cache_dir: Directory to store cache files. Must be provided.
+             logger: Logger instance to use. If None, creates a new logger.
+         """
+         self.cache_dir = Path(cache_dir)
+         self.logger = logger or logging.getLogger(__name__)
+
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         self.logger.debug(f"Initialized {self.__class__.__name__} with cache dir: {self.cache_dir}")
+
+     def _get_file_hash(self, filepath: str) -> str:
+         """Calculate MD5 hash of a file."""
+         self.logger.debug(f"Calculating hash for file: {filepath}")
+         md5_hash = hashlib.md5()
+         with open(filepath, "rb") as f:
+             for chunk in iter(lambda: f.read(4096), b""):
+                 md5_hash.update(chunk)
+         hash_result = md5_hash.hexdigest()
+         self.logger.debug(f"File hash: {hash_result}")
+         return hash_result
+
+     def _get_cache_path(self, file_hash: str, suffix: str) -> str:
+         """Get the cache file path for a given file hash."""
+         cache_path = os.path.join(self.cache_dir, f"{self.get_name().lower()}_{file_hash}_{suffix}.json")
+         self.logger.debug(f"Cache path: {cache_path}")
+         return cache_path
+
+     def _save_to_cache(self, cache_path: str, raw_data: Dict[str, Any]) -> None:
+         """Save raw API response data to cache."""
+         self.logger.debug(f"Saving JSON to cache: {cache_path}")
+         with open(cache_path, "w") as f:
+             json.dump(raw_data, f, indent=2)
+         self.logger.debug("Cache save completed")
+
+     def _load_from_cache(self, cache_path: str) -> Optional[Dict[str, Any]]:
+         """Load raw API response data from cache if it exists."""
+         self.logger.debug(f"Attempting to load from cache: {cache_path}")
+         try:
+             with open(cache_path, "r") as f:
+                 data = json.load(f)
+             self.logger.debug("Raw API response loaded from cache")
+             return data
+         except FileNotFoundError:
+             self.logger.debug("Cache file not found")
+             return None
+         except json.JSONDecodeError:
+             self.logger.warning(f"Cache file {cache_path} is corrupted")
+             return None
+
+     def _save_and_convert_result(self, file_hash: str, raw_result: Dict[str, Any]) -> TranscriptionData:
+         """Convert raw result to TranscriptionData, save to cache, and return."""
+         converted_cache_path = self._get_cache_path(file_hash, "converted")
+         converted_result = self._convert_result_format(raw_result)
+         self._save_to_cache(converted_cache_path, converted_result.to_dict())
+         return converted_result
+
+     def transcribe(self, audio_filepath: str) -> TranscriptionData:
+         """
+         Transcribe an audio file, using cache if available.
+
+         Args:
+             audio_filepath: Path to the audio file to transcribe
+
+         Returns:
+             TranscriptionData containing segments, text, and metadata
+         """
+         self.logger.debug(f"Starting transcription for {audio_filepath}")
+
+         try:
+             self._validate_audio_file(audio_filepath)
+             self.logger.debug("Audio file validation passed")
+
+             # Check cache first
+             file_hash = self._get_file_hash(audio_filepath)
+             raw_cache_path = self._get_cache_path(file_hash, "raw")
+
+             raw_data = self._load_from_cache(raw_cache_path)
+             if raw_data:
+                 self.logger.info(f"Using cached raw data for {audio_filepath}")
+                 return self._save_and_convert_result(file_hash, raw_data)
+
+             # If not in cache, perform transcription
+             self.logger.info(f"No cache found, transcribing {audio_filepath}")
+             raw_result = self._perform_transcription(audio_filepath)
+             self.logger.debug("Transcription completed")
+
+             # Save raw result to cache
+             self._save_to_cache(raw_cache_path, raw_result)
+
+             return self._save_and_convert_result(file_hash, raw_result)
+
+         except Exception as e:
+             self.logger.error(f"Error during transcription: {str(e)}")
+             raise
+
+     @abstractmethod
+     def _perform_transcription(self, audio_filepath: str) -> TranscriptionData:
+         """
+         Actually perform the transcription (implemented by subclasses).
+
+         Args:
+             audio_filepath: Path to the audio file to transcribe
+
+         Returns:
+             TranscriptionData containing segments, text, and metadata
+         """
+         pass  # pragma: no cover
+
+     @abstractmethod
+     def get_name(self) -> str:
+         """Return the name of this transcription service."""
+         pass  # pragma: no cover
+
+     def _validate_audio_file(self, audio_filepath: str) -> None:
+         """Validate that the audio file exists and is accessible."""
+         self.logger.debug(f"Validating audio file: {audio_filepath}")
+         if not os.path.exists(audio_filepath):
+             self.logger.error(f"Audio file not found: {audio_filepath}")
+             raise FileNotFoundError(f"Audio file not found: {audio_filepath}")
+         self.logger.debug("Audio file validation successful")
+
+     @abstractmethod
+     def _convert_result_format(self, raw_data: Dict[str, Any]) -> TranscriptionData:
+         """Convert raw API response to TranscriptionData format."""
+         pass  # pragma: no cover
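The new base class in this hunk owns the whole cache workflow: transcribe() validates the audio file, hashes it with MD5, looks for a cached raw response, otherwise calls _perform_transcription(), writes the raw payload to "<name>_<hash>_raw.json", and then always runs _convert_result_format() before saving the converted output alongside it. A hedged sketch of how a custom subclass plugs into that contract follows; StubTranscriber, its fake payload, the import path, and the file paths are invented for illustration and are not part of the package.

    # Hypothetical subclass for illustration only. Note that whatever
    # _perform_transcription() returns must be JSON-serializable, because
    # transcribe() writes it straight to the raw cache before conversion.
    from typing import Any, Dict

    from lyrics_transcriber.transcribers.base_transcriber import BaseTranscriber, TranscriptionData  # assumed path


    class StubTranscriber(BaseTranscriber):
        def get_name(self) -> str:
            return "Stub"

        def _perform_transcription(self, audio_filepath: str) -> Dict[str, Any]:
            # Pretend API call; a real implementation would contact a transcription service here.
            return {"text": "la la la", "lines": []}

        def _convert_result_format(self, raw_data: Dict[str, Any]) -> TranscriptionData:
            return TranscriptionData(segments=[], words=[], text=raw_data["text"], source=self.get_name())


    # "song.flac" must be an existing file; a second call with the same file reuses the raw cache.
    transcriber = StubTranscriber(cache_dir="/tmp/lyrics_cache")
    data = transcriber.transcribe("song.flac")

Because conversion is re-run even on a cache hit, improvements to a provider's _convert_result_format() take effect without re-calling the remote API.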