remdb 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +565 -0
- rem/cli/commands/configure.py +423 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1124 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +88 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +806 -0
- rem/services/content/service.py +657 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +229 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.2.6.dist-info/METADATA +1191 -0
- remdb-0.2.6.dist-info/RECORD +187 -0
- remdb-0.2.6.dist-info/WHEEL +4 -0
- remdb-0.2.6.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
# Audio Provider Integration
|
|
2
|
+
|
|
3
|
+
The AudioProvider is now fully integrated into REM's ContentService with a **consistent interface** that matches all other content providers.
|
|
4
|
+
|
|
5
|
+
## Architecture
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
┌────────────────────────────────────────────────────────────┐
|
|
9
|
+
│ ContentService │
|
|
10
|
+
│ (Pluggable Providers) │
|
|
11
|
+
├────────────────────────────────────────────────────────────┤
|
|
12
|
+
│ │
|
|
13
|
+
│ ┌──────────────┐ ┌────────────┐ ┌─────────────────┐ │
|
|
14
|
+
│ │ TextProvider │→│DocProvider │→│ AudioProvider │ │
|
|
15
|
+
│ └──────────────┘ └────────────┘ └─────────────────┘ │
|
|
16
|
+
│ │ │ │ │
|
|
17
|
+
│ ▼ ▼ ▼ │
|
|
18
|
+
│ extract() extract() extract() │
|
|
19
|
+
│ │ │ │ │
|
|
20
|
+
│ ▼ ▼ ▼ │
|
|
21
|
+
│ Markdown Markdown Markdown │
|
|
22
|
+
│ text text text │
|
|
23
|
+
│ │ │ │ │
|
|
24
|
+
│ └──────────────────┼──────────────────┘ │
|
|
25
|
+
│ │ │
|
|
26
|
+
│ ▼ │
|
|
27
|
+
│ chunk_text() → embed() │
|
|
28
|
+
│ │ │
|
|
29
|
+
│ ▼ │
|
|
30
|
+
│ Save to Database │
|
|
31
|
+
│ (File + Resource entities) │
|
|
32
|
+
└────────────────────────────────────────────────────────────┘
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Consistent Interface
|
|
36
|
+
|
|
37
|
+
All content providers implement the same `ContentProvider` base class:
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
class ContentProvider(ABC):
|
|
41
|
+
@property
|
|
42
|
+
@abstractmethod
|
|
43
|
+
def name(self) -> str:
|
|
44
|
+
"""Provider name for logging/debugging."""
|
|
45
|
+
pass
|
|
46
|
+
|
|
47
|
+
@abstractmethod
|
|
48
|
+
def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
|
|
49
|
+
"""
|
|
50
|
+
Extract text content from file bytes.
|
|
51
|
+
|
|
52
|
+
Args:
|
|
53
|
+
content: Raw file bytes
|
|
54
|
+
metadata: File metadata (size, type, etc.)
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
dict with:
|
|
58
|
+
- text: Extracted text content
|
|
59
|
+
- metadata: Additional metadata from extraction (optional)
|
|
60
|
+
"""
|
|
61
|
+
pass
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Provider Implementations
|
|
65
|
+
|
|
66
|
+
### 1. TextProvider
|
|
67
|
+
```python
|
|
68
|
+
def extract(self, content: bytes, metadata: dict) -> dict:
|
|
69
|
+
text = content.decode("utf-8")
|
|
70
|
+
return {
|
|
71
|
+
"text": text,
|
|
72
|
+
"metadata": {"line_count": len(text.split("\n"))}
|
|
73
|
+
}
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### 2. DocProvider (Kreuzberg)
|
|
77
|
+
```python
|
|
78
|
+
def extract(self, content: bytes, metadata: dict) -> dict:
|
|
79
|
+
# Uses Kreuzberg for PDF extraction
|
|
80
|
+
result = extract_file_sync(tmp_path, config=config)
|
|
81
|
+
return {
|
|
82
|
+
"text": result.content,
|
|
83
|
+
"metadata": {"table_count": len(result.tables)}
|
|
84
|
+
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### 3. AudioProvider (AudioChunker + Whisper)
|
|
88
|
+
```python
|
|
89
|
+
def extract(self, content: bytes, metadata: dict) -> dict:
|
|
90
|
+
# 1. Chunk audio by silence
|
|
91
|
+
chunks = chunker.chunk_audio(tmp_path)
|
|
92
|
+
|
|
93
|
+
# 2. Transcribe chunks
|
|
94
|
+
results = transcriber.transcribe_chunks(chunks)
|
|
95
|
+
|
|
96
|
+
# 3. Format as markdown with timestamps
|
|
97
|
+
markdown_parts = []
|
|
98
|
+
for result in results:
|
|
99
|
+
timestamp = f"{result.start_seconds:.1f}s - {result.end_seconds:.1f}s"
|
|
100
|
+
markdown_parts.append(f"## [{timestamp}]\n\n{result.text}\n")
|
|
101
|
+
|
|
102
|
+
return {
|
|
103
|
+
"text": "\n".join(markdown_parts),
|
|
104
|
+
"metadata": {
|
|
105
|
+
"chunk_count": len(chunks),
|
|
106
|
+
"duration_seconds": total_duration,
|
|
107
|
+
"estimated_cost": estimated_cost,
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Markdown Format
|
|
113
|
+
|
|
114
|
+
All providers return markdown-formatted text. AudioProvider returns:
|
|
115
|
+
|
|
116
|
+
```markdown
|
|
117
|
+
## [0.0s - 60.0s]
|
|
118
|
+
|
|
119
|
+
Transcription of first minute goes here...
|
|
120
|
+
|
|
121
|
+
## [60.0s - 120.0s]
|
|
122
|
+
|
|
123
|
+
Transcription of second minute goes here...
|
|
124
|
+
|
|
125
|
+
## [120.0s - 180.0s]
|
|
126
|
+
|
|
127
|
+
Transcription of third minute goes here...
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
This format:
|
|
131
|
+
- ✅ Is valid markdown
|
|
132
|
+
- ✅ Has clear section boundaries
|
|
133
|
+
- ✅ Preserves temporal information
|
|
134
|
+
- ✅ Can be chunked further if needed
|
|
135
|
+
- ✅ Embeds naturally with other content
|
|
136
|
+
|
|
137
|
+
## Processing Pipeline
|
|
138
|
+
|
|
139
|
+
### Example: Audio File Processing
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from rem.services.content import ContentService
|
|
143
|
+
|
|
144
|
+
service = ContentService()
|
|
145
|
+
|
|
146
|
+
# Process audio file (same interface as PDF/markdown!)
|
|
147
|
+
result = service.process_uri("s3://bucket/meeting.m4a")
|
|
148
|
+
|
|
149
|
+
# Result structure (same for all providers):
|
|
150
|
+
{
|
|
151
|
+
"uri": "s3://bucket/meeting.m4a",
|
|
152
|
+
"content": "## [0.0s - 60.0s]\n\nDiscussion about...\n\n## [60.0s - 120.0s]...",
|
|
153
|
+
"metadata": {
|
|
154
|
+
"chunk_count": 5,
|
|
155
|
+
"duration_seconds": 300.0,
|
|
156
|
+
"estimated_cost": 0.030,
|
|
157
|
+
"parser": "whisper_api"
|
|
158
|
+
},
|
|
159
|
+
"provider": "audio"
|
|
160
|
+
}
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### End-to-End Processing
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
# Process and save to database
|
|
167
|
+
await service.process_and_save(
|
|
168
|
+
uri="s3://bucket/meeting.m4a",
|
|
169
|
+
user_id="user-123"
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
# This automatically:
|
|
173
|
+
# 1. Downloads from S3
|
|
174
|
+
# 2. Chunks audio by silence
|
|
175
|
+
# 3. Transcribes with Whisper
|
|
176
|
+
# 4. Converts to markdown
|
|
177
|
+
# 5. Chunks markdown text
|
|
178
|
+
# 6. Saves File entity
|
|
179
|
+
# 7. Saves Resource entities (one per chunk)
|
|
180
|
+
# 8. Generates embeddings (ready for vector search)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Registered Extensions
|
|
184
|
+
|
|
185
|
+
The AudioProvider is automatically registered for:
|
|
186
|
+
- `.wav` - Uncompressed audio
|
|
187
|
+
- `.mp3` - Compressed audio
|
|
188
|
+
- `.m4a` - Apple audio format
|
|
189
|
+
- `.flac` - Lossless compression
|
|
190
|
+
- `.ogg` - Ogg Vorbis
|
|
191
|
+
|
|
192
|
+
## Graceful Degradation
|
|
193
|
+
|
|
194
|
+
Without an OpenAI API key:
|
|
195
|
+
```python
|
|
196
|
+
result = audio_provider.extract(content, metadata)
|
|
197
|
+
|
|
198
|
+
# Returns:
|
|
199
|
+
{
|
|
200
|
+
"text": "[Audio transcription requires OPENAI_API_KEY environment variable]",
|
|
201
|
+
"metadata": {"error": "missing_api_key"}
|
|
202
|
+
}
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
Without pydub installed:
|
|
206
|
+
```python
|
|
207
|
+
# Returns:
|
|
208
|
+
{
|
|
209
|
+
"text": "[Audio processing requires: pip install rem[audio]]",
|
|
210
|
+
"metadata": {"error": "missing_dependencies"}
|
|
211
|
+
}
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
## Testing
|
|
215
|
+
|
|
216
|
+
All providers are tested for interface consistency:
|
|
217
|
+
|
|
218
|
+
```bash
|
|
219
|
+
# Run integration tests
|
|
220
|
+
pytest tests/integration/services/test_content_providers.py -v
|
|
221
|
+
|
|
222
|
+
# Results:
|
|
223
|
+
# ✓ test_markdown_provider_interface PASSED
|
|
224
|
+
# ✓ test_pdf_provider_interface PASSED
|
|
225
|
+
# ✓ test_audio_provider_interface PASSED
|
|
226
|
+
# ✓ test_content_service_has_all_providers PASSED
|
|
227
|
+
# ✓ test_markdown_file_processing PASSED
|
|
228
|
+
# ✓ test_audio_file_processing_without_api_key PASSED
|
|
229
|
+
# ✓ test_all_providers_return_text_and_metadata PASSED
|
|
230
|
+
# ✓ test_all_providers_handle_empty_content PASSED
|
|
231
|
+
# ✓ test_markdown_to_audio_consistency PASSED
|
|
232
|
+
# ✓ test_audio_returns_markdown_with_timestamps PASSED
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Consistency Guarantees
|
|
236
|
+
|
|
237
|
+
All providers:
|
|
238
|
+
|
|
239
|
+
1. **Accept same input**: `extract(content: bytes, metadata: dict)`
|
|
240
|
+
2. **Return same structure**: `{"text": str, "metadata": dict}`
|
|
241
|
+
3. **Return markdown format**: Text is markdown-compatible
|
|
242
|
+
4. **Handle errors gracefully**: Return error messages, don't crash
|
|
243
|
+
5. **Register with ContentService**: Via file extension mapping
|
|
244
|
+
6. **Follow pipeline**: extract → markdown → chunk → embed → save
|
|
245
|
+
|
|
246
|
+
## Usage Examples
|
|
247
|
+
|
|
248
|
+
### Process Single File
|
|
249
|
+
|
|
250
|
+
```python
|
|
251
|
+
from rem.services.content import ContentService
|
|
252
|
+
|
|
253
|
+
service = ContentService()
|
|
254
|
+
|
|
255
|
+
# Process markdown
|
|
256
|
+
md_result = service.process_uri("document.md")
|
|
257
|
+
|
|
258
|
+
# Process PDF
|
|
259
|
+
pdf_result = service.process_uri("report.pdf")
|
|
260
|
+
|
|
261
|
+
# Process audio (same interface!)
|
|
262
|
+
audio_result = service.process_uri("meeting.m4a")
|
|
263
|
+
|
|
264
|
+
# All return same structure
|
|
265
|
+
assert "content" in md_result
|
|
266
|
+
assert "content" in pdf_result
|
|
267
|
+
assert "content" in audio_result
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
### Process with S3
|
|
271
|
+
|
|
272
|
+
```python
|
|
273
|
+
# S3 URI - automatic download and processing
|
|
274
|
+
result = service.process_uri("s3://recordings/standup.m4a")
|
|
275
|
+
|
|
276
|
+
# Transcribed, chunked, and ready to save
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
### Custom Provider Registration
|
|
280
|
+
|
|
281
|
+
```python
|
|
282
|
+
# Register custom provider
|
|
283
|
+
service.register_provider(
|
|
284
|
+
extensions=[".custom"],
|
|
285
|
+
provider=CustomProvider()
|
|
286
|
+
)
|
|
287
|
+
|
|
288
|
+
# Now .custom files use CustomProvider
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
## Future Enhancements
|
|
292
|
+
|
|
293
|
+
1. **Streaming Transcription**: Process long audio files in streams
|
|
294
|
+
2. **Speaker Diarization**: Identify different speakers
|
|
295
|
+
3. **Language Detection**: Auto-detect language for transcription
|
|
296
|
+
4. **Timestamp Refinement**: More accurate timestamps via VAD
|
|
297
|
+
5. **Batch Processing**: Parallel transcription of multiple files
|
|
298
|
+
|
|
299
|
+
## Key Takeaways
|
|
300
|
+
|
|
301
|
+
✅ **Pluggable**: Easy to add new content types
|
|
302
|
+
✅ **Consistent**: Same interface for all providers
|
|
303
|
+
✅ **Testable**: All providers tested for consistency
|
|
304
|
+
✅ **Graceful**: Handles missing dependencies/keys elegantly
|
|
305
|
+
✅ **Integrated**: Works with ContentService out of the box
|
|
306
|
+
✅ **Production-Ready**: Error handling, logging, cleanup
|
|
307
|
+
|
|
308
|
+
The AudioProvider is a **first-class citizen** in REM's content processing pipeline!
|
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
# REM Audio Processing
|
|
2
|
+
|
|
3
|
+
Lightweight audio processing service with minimal dependencies for chunking and transcribing audio files.
|
|
4
|
+
|
|
5
|
+
## Design Philosophy
|
|
6
|
+
|
|
7
|
+
**Minimal Dependencies:**
|
|
8
|
+
- `wave` (stdlib) for WAV file handling
|
|
9
|
+
- `pydub` for audio format conversion (wraps ffmpeg)
|
|
10
|
+
- `requests` for OpenAI Whisper API (already a REM dependency)
|
|
11
|
+
- `loguru` for logging (REM standard)
|
|
12
|
+
|
|
13
|
+
**No Heavy ML Libraries:**
|
|
14
|
+
- No `torch`, `torchaudio`, or other heavyweight dependencies
|
|
15
|
+
- No `librosa` for audio analysis
|
|
16
|
+
- Keep the Docker image lean and fast
|
|
17
|
+
|
|
18
|
+
## Architecture
|
|
19
|
+
|
|
20
|
+
```
|
|
21
|
+
┌─────────────────────────────────────────────────────────┐
|
|
22
|
+
│ REM Audio Service │
|
|
23
|
+
├─────────────────────────────────────────────────────────┤
|
|
24
|
+
│ │
|
|
25
|
+
│ ┌──────────────┐ ┌──────────────┐ │
|
|
26
|
+
│ │ AudioChunker │────────▶│AudioTranscriber│ │
|
|
27
|
+
│ └──────────────┘ └──────────────┘ │
|
|
28
|
+
│ │ │ │
|
|
29
|
+
│ │ │ │
|
|
30
|
+
│ Split by silence OpenAI Whisper API │
|
|
31
|
+
│ near minute ($0.006/minute) │
|
|
32
|
+
│ boundaries │
|
|
33
|
+
│ │
|
|
34
|
+
└─────────────────────────────────────────────────────────┘
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Components
|
|
38
|
+
|
|
39
|
+
### 1. AudioChunker
|
|
40
|
+
|
|
41
|
+
Splits audio files by detecting silence near minute boundaries.
|
|
42
|
+
|
|
43
|
+
**Strategy:**
|
|
44
|
+
- Target chunks around 60 seconds (configurable)
|
|
45
|
+
- Look for silence in window around target (±2 seconds)
|
|
46
|
+
- Split at longest silence in window
|
|
47
|
+
- If no silence, split at target boundary
|
|
48
|
+
|
|
49
|
+
**Benefits:**
|
|
50
|
+
- Keeps chunks under OpenAI's 25MB limit (~10 minutes)
|
|
51
|
+
- Natural breaks at silence points
|
|
52
|
+
- Maintains speech context within chunks
|
|
53
|
+
|
|
54
|
+
**Example:**
|
|
55
|
+
```python
|
|
56
|
+
from rem.services.audio import AudioChunker
|
|
57
|
+
|
|
58
|
+
chunker = AudioChunker(
|
|
59
|
+
target_chunk_seconds=60.0, # 1 minute target
|
|
60
|
+
chunk_window_seconds=2.0, # ±2 second search window
|
|
61
|
+
silence_threshold_db=-40.0, # Silence detection threshold
|
|
62
|
+
min_silence_ms=500, # Minimum 500ms silence
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
# Chunk audio file
|
|
66
|
+
chunks = chunker.chunk_audio("recording.m4a")
|
|
67
|
+
|
|
68
|
+
# Process chunks
|
|
69
|
+
for chunk in chunks:
|
|
70
|
+
print(f"Chunk {chunk.chunk_index}: {chunk.start_seconds:.1f}s - {chunk.end_seconds:.1f}s")
|
|
71
|
+
print(f"Duration: {chunk.duration_seconds:.1f}s")
|
|
72
|
+
print(f"File: {chunk.file_path}")
|
|
73
|
+
|
|
74
|
+
# Cleanup when done
|
|
75
|
+
chunker.cleanup_chunks(chunks)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### 2. AudioTranscriber
|
|
79
|
+
|
|
80
|
+
Transcribes audio using OpenAI Whisper API.
|
|
81
|
+
|
|
82
|
+
**Features:**
|
|
83
|
+
- Uses `requests` (no httpx dependency)
|
|
84
|
+
- Handles file uploads efficiently
|
|
85
|
+
- Automatic cost estimation
|
|
86
|
+
- Detailed logging with loguru
|
|
87
|
+
|
|
88
|
+
**Example:**
|
|
89
|
+
```python
|
|
90
|
+
from rem.services.audio import AudioTranscriber
|
|
91
|
+
|
|
92
|
+
transcriber = AudioTranscriber(
|
|
93
|
+
api_key="sk-...", # Or from OPENAI_API_KEY env
|
|
94
|
+
model="whisper-1", # OpenAI Whisper model
|
|
95
|
+
language=None, # Auto-detect language
|
|
96
|
+
temperature=0.0, # Deterministic transcription
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
# Transcribe single file
|
|
100
|
+
result = transcriber.transcribe_file("audio.wav")
|
|
101
|
+
print(result.text)
|
|
102
|
+
|
|
103
|
+
# Transcribe chunks
|
|
104
|
+
results = transcriber.transcribe_chunks(chunks)
|
|
105
|
+
for result in results:
|
|
106
|
+
print(f"[{result.start_seconds:.1f}s - {result.end_seconds:.1f}s]: {result.text}")
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### 3. Complete Workflow
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from rem.services.audio import AudioChunker, AudioTranscriber
|
|
113
|
+
|
|
114
|
+
# 1. Chunk audio by silence
|
|
115
|
+
chunker = AudioChunker()
|
|
116
|
+
chunks = chunker.chunk_audio("meeting_recording.m4a")
|
|
117
|
+
|
|
118
|
+
print(f"Created {len(chunks)} chunks")
|
|
119
|
+
|
|
120
|
+
# 2. Transcribe chunks
|
|
121
|
+
transcriber = AudioTranscriber()
|
|
122
|
+
results = transcriber.transcribe_chunks(chunks)
|
|
123
|
+
|
|
124
|
+
print(f"Transcribed {len(results)} chunks")
|
|
125
|
+
|
|
126
|
+
# 3. Combine results
|
|
127
|
+
full_transcription = "\n\n".join([
|
|
128
|
+
f"[{r.start_seconds:.1f}s]: {r.text}"
|
|
129
|
+
for r in results
|
|
130
|
+
])
|
|
131
|
+
|
|
132
|
+
print(full_transcription)
|
|
133
|
+
|
|
134
|
+
# 4. Cleanup
|
|
135
|
+
chunker.cleanup_chunks(chunks)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Configuration
|
|
139
|
+
|
|
140
|
+
### Environment Variables
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
# OpenAI API Key (required for transcription)
|
|
144
|
+
OPENAI_API_KEY=sk-...
|
|
145
|
+
|
|
146
|
+
# Chunker Settings (optional)
|
|
147
|
+
AUDIO_CHUNK_TARGET_SECONDS=60 # Target chunk duration
|
|
148
|
+
AUDIO_CHUNK_WINDOW_SECONDS=2 # Silence search window
|
|
149
|
+
AUDIO_SILENCE_THRESHOLD_DB=-40 # Silence detection threshold
|
|
150
|
+
AUDIO_MIN_SILENCE_MS=500 # Minimum silence duration
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Transcription Costs
|
|
154
|
+
|
|
155
|
+
OpenAI Whisper API pricing: **$0.006 per minute**
|
|
156
|
+
|
|
157
|
+
Examples:
|
|
158
|
+
- 10 minute recording: $0.06
|
|
159
|
+
- 1 hour recording: $0.36
|
|
160
|
+
- 10 hour recording: $3.60
|
|
161
|
+
|
|
162
|
+
## Supported Formats
|
|
163
|
+
|
|
164
|
+
### With pydub + ffmpeg:
|
|
165
|
+
- WAV (uncompressed)
|
|
166
|
+
- MP3 (compressed)
|
|
167
|
+
- M4A (Apple audio)
|
|
168
|
+
- FLAC (lossless)
|
|
169
|
+
- OGG (Vorbis)
|
|
170
|
+
- WMA (Windows)
|
|
171
|
+
|
|
172
|
+
### Without pydub:
|
|
173
|
+
- WAV files only (all other formats require pydub for format conversion)
|
|
174
|
+
|
|
175
|
+
## Docker Setup
|
|
176
|
+
|
|
177
|
+
The Dockerfile includes ffmpeg for audio processing:
|
|
178
|
+
|
|
179
|
+
```dockerfile
|
|
180
|
+
# Runtime dependencies
|
|
181
|
+
RUN apt-get install -y \
|
|
182
|
+
ffmpeg # Required by pydub for format conversion
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
Install pydub dependency:
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
# Install audio extras
|
|
189
|
+
pip install "remdb[audio]"
|
|
190
|
+
|
|
191
|
+
# Or install all extras
|
|
192
|
+
pip install "remdb[all]"
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Dependencies
|
|
196
|
+
|
|
197
|
+
### Core (always installed with remdb[audio]):
|
|
198
|
+
- `pydub>=0.25.0` - Audio manipulation
|
|
199
|
+
|
|
200
|
+
### System (Docker):
|
|
201
|
+
- `ffmpeg` - Audio codec support (installed in Dockerfile)
|
|
202
|
+
|
|
203
|
+
### External APIs:
|
|
204
|
+
- OpenAI Whisper API - Speech-to-text transcription
|
|
205
|
+
|
|
206
|
+
## Error Handling
|
|
207
|
+
|
|
208
|
+
### Missing API Key
|
|
209
|
+
```python
|
|
210
|
+
transcriber = AudioTranscriber() # No API key
|
|
211
|
+
|
|
212
|
+
# Raises: ValueError("OpenAI API key required for transcription")
|
|
213
|
+
result = transcriber.transcribe_file("audio.wav")
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
### File Too Large
|
|
217
|
+
```python
|
|
218
|
+
# Whisper API limit: 25 MB
|
|
219
|
+
transcriber.transcribe_file("huge_file.wav")
|
|
220
|
+
|
|
221
|
+
# Raises: ValueError("Audio file too large: 30.5 MB (max 25 MB)")
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
### No pydub
|
|
225
|
+
```python
|
|
226
|
+
# Without pydub installed
|
|
227
|
+
chunker = AudioChunker()
|
|
228
|
+
chunker.chunk_audio("audio.m4a")
|
|
229
|
+
|
|
230
|
+
# Raises: RuntimeError("pydub required for .m4a files")
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## Best Practices
|
|
234
|
+
|
|
235
|
+
1. **Chunk Before Transcribing**
|
|
236
|
+
- Don't send entire 2-hour recordings to Whisper
|
|
237
|
+
- Chunk into 1-minute segments for better quality
|
|
238
|
+
- Easier to debug and retry failed segments
|
|
239
|
+
|
|
240
|
+
2. **Monitor Costs**
|
|
241
|
+
- Log transcription duration and cost
|
|
242
|
+
- Set budgets for long recordings
|
|
243
|
+
- Use `transcriber.transcribe_chunks()` for cost estimation
|
|
244
|
+
|
|
245
|
+
3. **Handle Failures Gracefully**
|
|
246
|
+
- Chunks can fail independently
|
|
247
|
+
- Retry logic for transient errors
|
|
248
|
+
- Save partial results
|
|
249
|
+
|
|
250
|
+
4. **Cleanup Temporary Files**
|
|
251
|
+
- Always call `chunker.cleanup_chunks()` when done
|
|
252
|
+
- Or use context manager (future enhancement)
|
|
253
|
+
|
|
254
|
+
5. **Use Silence Detection**
|
|
255
|
+
- Default settings work well for most speech
|
|
256
|
+
- Adjust `silence_threshold_db` for noisy recordings
|
|
257
|
+
- Increase `min_silence_ms` so that brief natural pauses are not treated as silence
|
|
258
|
+
|
|
259
|
+
## Integration with REM
|
|
260
|
+
|
|
261
|
+
### File Processing
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
# rem/workers/file_processor.py
|
|
265
|
+
from rem.services.audio import AudioChunker, AudioTranscriber
|
|
266
|
+
|
|
267
|
+
async def process_audio_file(file_path: Path, user_id: str):
|
|
268
|
+
"""Process audio file and create REM resources."""
|
|
269
|
+
|
|
270
|
+
# 1. Chunk audio
|
|
271
|
+
chunker = AudioChunker()
|
|
272
|
+
chunks = chunker.chunk_audio(file_path)
|
|
273
|
+
|
|
274
|
+
# 2. Transcribe chunks
|
|
275
|
+
transcriber = AudioTranscriber()
|
|
276
|
+
results = transcriber.transcribe_chunks(chunks)
|
|
277
|
+
|
|
278
|
+
# 3. Create REM resources
|
|
279
|
+
for i, result in enumerate(results):
|
|
280
|
+
resource = Resource(
|
|
281
|
+
name=f"{file_path.stem} - Part {i+1}",
|
|
282
|
+
uri=f"{file_path.as_uri()}#t={result.start_seconds},{result.end_seconds}",
|
|
283
|
+
content=result.text,
|
|
284
|
+
timestamp=datetime.now(),
|
|
285
|
+
category="transcription",
|
|
286
|
+
user_id=user_id,
|
|
287
|
+
)
|
|
288
|
+
await repository.upsert(resource)
|
|
289
|
+
|
|
290
|
+
# 4. Cleanup
|
|
291
|
+
chunker.cleanup_chunks(chunks)
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
### Dreaming Worker
|
|
295
|
+
|
|
296
|
+
```python
|
|
297
|
+
# rem/workers/dreaming.py
|
|
298
|
+
from rem.services.audio import AudioChunker, AudioTranscriber
|
|
299
|
+
|
|
300
|
+
async def extract_moments_from_audio(audio_resource: Resource):
|
|
301
|
+
"""Extract moments from audio transcription."""
|
|
302
|
+
|
|
303
|
+
# Audio already transcribed and stored as Resource
|
|
304
|
+
# Use transcription content to identify temporal moments
|
|
305
|
+
|
|
306
|
+
# Example: Split by speaker changes, topic shifts, etc.
|
|
307
|
+
moments = extract_temporal_segments(audio_resource.content)
|
|
308
|
+
|
|
309
|
+
for moment in moments:
|
|
310
|
+
await repository.upsert(moment)
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
## Logging
|
|
314
|
+
|
|
315
|
+
All logs use loguru (REM standard):
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
from loguru import logger
|
|
319
|
+
|
|
320
|
+
# Chunker logs
|
|
321
|
+
logger.info("Chunking audio: /path/to/file.m4a")
|
|
322
|
+
logger.debug("Found silence at 58.3s (target: 60.0s)")
|
|
323
|
+
logger.info("Created 5 chunks in /tmp/rem_audio_chunks_xyz")
|
|
324
|
+
|
|
325
|
+
# Transcriber logs
|
|
326
|
+
logger.info("Transcribing chunk 1/5 (0.0s - 58.3s)")
|
|
327
|
+
logger.debug("Sending 2.3 MB to OpenAI Whisper API")
|
|
328
|
+
logger.info("✓ Transcription complete: 245 characters")
|
|
329
|
+
logger.info("Estimated cost: $0.180 (30.0 minutes)")
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
## Testing
|
|
333
|
+
|
|
334
|
+
```bash
|
|
335
|
+
# Run audio service tests
|
|
336
|
+
pytest tests/unit/services/audio/
|
|
337
|
+
|
|
338
|
+
# Test with real files (requires OpenAI API key)
|
|
339
|
+
export OPENAI_API_KEY=sk-...
|
|
340
|
+
pytest tests/integration/services/audio/
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
## Future Enhancements
|
|
344
|
+
|
|
345
|
+
1. **Context Manager for Cleanup**
|
|
346
|
+
```python
|
|
347
|
+
with AudioChunker() as chunker:
|
|
348
|
+
chunks = chunker.chunk_audio("file.m4a")
|
|
349
|
+
# Auto-cleanup on exit
|
|
350
|
+
```
|
|
351
|
+
|
|
352
|
+
2. **Batch Transcription**
|
|
353
|
+
- Parallel API requests
|
|
354
|
+
- Rate limiting
|
|
355
|
+
- Progress tracking
|
|
356
|
+
|
|
357
|
+
3. **Speaker Diarization**
|
|
358
|
+
- Detect speaker changes
|
|
359
|
+
- Label speakers
|
|
360
|
+
- Split on speaker boundaries
|
|
361
|
+
|
|
362
|
+
4. **Advanced Silence Detection**
|
|
363
|
+
- Machine learning-based VAD
|
|
364
|
+
- Energy-based fallback
|
|
365
|
+
- Adaptive thresholds
|
|
366
|
+
|
|
367
|
+
5. **Format Detection**
|
|
368
|
+
- Auto-detect audio format
|
|
369
|
+
- Validate before processing
|
|
370
|
+
- Better error messages
|
|
371
|
+
|
|
372
|
+
## References
|
|
373
|
+
|
|
374
|
+
- [OpenAI Whisper API](https://platform.openai.com/docs/guides/speech-to-text)
|
|
375
|
+
- [pydub Documentation](https://github.com/jiaaro/pydub)
|
|
376
|
+
- [ffmpeg Documentation](https://ffmpeg.org/documentation.html)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Audio processing service for REM.
|
|
3
|
+
|
|
4
|
+
Lightweight audio processing with minimal dependencies:
|
|
5
|
+
- wave module (stdlib) for WAV file handling
|
|
6
|
+
- pydub (optional) for format conversion (M4A, MP3, etc.)
|
|
7
|
+
- requests (already a dependency) for OpenAI Whisper API
|
|
8
|
+
|
|
9
|
+
No torch, torchaudio, or other heavy ML dependencies.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from .chunker import AudioChunker
|
|
13
|
+
from .transcriber import AudioTranscriber
|
|
14
|
+
|
|
15
|
+
__all__ = ["AudioChunker", "AudioTranscriber"]
|