remdb-0.3.242-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic.
- rem/__init__.py +129 -0
- rem/agentic/README.md +760 -0
- rem/agentic/__init__.py +54 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +38 -0
- rem/agentic/agents/agent_manager.py +311 -0
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +425 -0
- rem/agentic/context_builder.py +360 -0
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +273 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +240 -0
- rem/agentic/providers/phoenix.py +926 -0
- rem/agentic/providers/pydantic_ai.py +854 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +737 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +242 -0
- rem/api/README.md +657 -0
- rem/api/deps.py +253 -0
- rem/api/main.py +460 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +820 -0
- rem/api/mcp_router/server.py +243 -0
- rem/api/mcp_router/tools.py +1605 -0
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +520 -0
- rem/api/routers/auth.py +898 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/child_streaming.py +394 -0
- rem/api/routers/chat/completions.py +702 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +202 -0
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +546 -0
- rem/api/routers/chat/streaming.py +950 -0
- rem/api/routers/chat/streaming_utils.py +327 -0
- rem/api/routers/common.py +18 -0
- rem/api/routers/dev.py +87 -0
- rem/api/routers/feedback.py +276 -0
- rem/api/routers/messages.py +620 -0
- rem/api/routers/models.py +86 -0
- rem/api/routers/query.py +362 -0
- rem/api/routers/shared_sessions.py +422 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +36 -0
- rem/auth/jwt.py +367 -0
- rem/auth/middleware.py +318 -0
- rem/auth/providers/__init__.py +16 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/email.py +215 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +517 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +299 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +549 -0
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +495 -0
- rem/cli/commands/db.py +828 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1698 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +388 -0
- rem/cli/commands/query.py +109 -0
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +230 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/commands/session.py +453 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +123 -0
- rem/config.py +244 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +70 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +672 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +246 -0
- rem/models/entities/__init__.py +68 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +64 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +181 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/session.py +84 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/subscriber.py +175 -0
- rem/models/entities/user.py +93 -0
- rem/py.typed +0 -0
- rem/registry.py +373 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/agent-builder.yaml +235 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +132 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +18 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +760 -0
- rem/services/content/service.py +762 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +322 -0
- rem/services/dreaming/moment_service.py +251 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/email/__init__.py +10 -0
- rem/services/email/service.py +522 -0
- rem/services/email/templates.py +360 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +127 -0
- rem/services/embeddings/worker.py +435 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +960 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +757 -0
- rem/services/postgres/__init__.py +49 -0
- rem/services/postgres/diff_service.py +599 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
- rem/services/postgres/register_type.py +353 -0
- rem/services/postgres/repository.py +481 -0
- rem/services/postgres/schema_generator.py +661 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +355 -0
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +318 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +180 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +608 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +13 -0
- rem/services/session/compression.py +488 -0
- rem/services/session/pydantic_messages.py +310 -0
- rem/services/session/reload.py +85 -0
- rem/services/user_service.py +130 -0
- rem/settings.py +1877 -0
- rem/sql/background_indexes.sql +52 -0
- rem/sql/migrations/001_install.sql +983 -0
- rem/sql/migrations/002_install_models.sql +3157 -0
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +282 -0
- rem/sql/migrations/005_schema_update.sql +145 -0
- rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +628 -0
- rem/utils/__init__.py +61 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +436 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/files.py +323 -0
- rem/utils/markdown.py +16 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +492 -0
- rem/utils/schema_loader.py +649 -0
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +350 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +325 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +7 -0
- rem/workers/db_listener.py +579 -0
- rem/workers/db_maintainer.py +74 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- rem/workers/unlogged_maintainer.py +463 -0
- remdb-0.3.242.dist-info/METADATA +1632 -0
- remdb-0.3.242.dist-info/RECORD +235 -0
- remdb-0.3.242.dist-info/WHEEL +4 -0
- remdb-0.3.242.dist-info/entry_points.txt +2 -0
rem/services/content/providers.py

@@ -0,0 +1,760 @@
"""Content provider plugins for different file types."""

import json
import multiprocessing
import random
import subprocess
import sys
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Optional

from loguru import logger

from rem.utils.constants import (
    AUDIO_CHUNK_TARGET_SECONDS,
    AUDIO_CHUNK_WINDOW_SECONDS,
    MIN_SILENCE_MS,
    SILENCE_THRESHOLD_DB,
    SUBPROCESS_TIMEOUT_SECONDS,
    WAV_HEADER_MIN_BYTES,
    WHISPER_COST_PER_MINUTE,
)
from rem.utils.files import temp_file_from_bytes
from rem.utils.mime_types import get_extension


class ContentProvider(ABC):
    """Base class for content extraction providers."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Provider name for logging/debugging."""
        pass

    @abstractmethod
    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract text content from file bytes.

        Args:
            content: Raw file bytes
            metadata: File metadata (size, type, etc.)

        Returns:
            dict with:
            - text: Extracted text content
            - metadata: Additional metadata from extraction (optional)
        """
        pass


class TextProvider(ContentProvider):
    """
    Text content provider for plain text formats.

    Supports:
    - Markdown (.md, .markdown) - With heading detection
    - JSON (.json) - Pretty-printed text extraction
    - YAML (.yaml, .yml) - Text extraction
    - Plain text (.txt) - Direct UTF-8 extraction
    - Code files (.py, .js, .ts, etc.) - Source code as text

    Simple UTF-8 text extraction with basic metadata.
    Future: Could add frontmatter parsing, JSON schema validation, etc.
    """

    @property
    def name(self) -> str:
        return "text"

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract text content from plain text files.

        Args:
            content: Text file bytes
            metadata: File metadata

        Returns:
            dict with text and optional metadata (line count, headings for markdown, etc.)
        """
        # Decode UTF-8 (with fallback to latin-1)
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError:
            logger.debug("UTF-8 decode failed, falling back to latin-1")
            text = content.decode("latin-1")

        # Basic text analysis
        lines = text.split("\n")

        # Detect headings (for markdown files)
        headings = [line for line in lines if line.strip().startswith("#")]

        extraction_metadata = {
            "line_count": len(lines),
            "heading_count": len(headings) if headings else None,
            "char_count": len(text),
            "encoding": "utf-8",
        }

        return {
            "text": text,
            "metadata": extraction_metadata,
        }


class DocProvider(ContentProvider):
    """
    Document content provider using Kreuzberg.

    Supports multiple document formats via Kreuzberg:
    - PDF (.pdf) - Text extraction with OCR fallback
    - Word (.docx) - Native format support
    - PowerPoint (.pptx) - Slide content extraction
    - Excel (.xlsx) - Spreadsheet data extraction
    - Images (.png, .jpg) - OCR text extraction

    Handles:
    - Text extraction with OCR fallback
    - Table detection and extraction
    - Daemon process workaround for multiprocessing restrictions
    """

    @property
    def name(self) -> str:
        return "doc"

    def _is_daemon_process(self) -> bool:
        """Check if running in a daemon process."""
        try:
            return multiprocessing.current_process().daemon
        except Exception:
            return False

    def _parse_in_subprocess(self, file_path: Path) -> dict:
        """Run kreuzberg in a separate subprocess to bypass daemon restrictions."""
        script = """
import json
import sys
from pathlib import Path
from kreuzberg import ExtractionConfig, extract_file_sync

# Parse document with kreuzberg 3.x
config = ExtractionConfig(
    extract_tables=True,
    chunk_content=False,
    extract_keywords=False,
)

result = extract_file_sync(Path(sys.argv[1]), config=config)

# Serialize result to JSON
output = {
    'content': result.content,
    'tables': [t.model_dump() for t in result.tables] if result.tables else [],
    'metadata': result.metadata
}
print(json.dumps(output))
"""

        # Run in subprocess
        result = subprocess.run(
            [sys.executable, "-c", script, str(file_path)],
            capture_output=True,
            text=True,
            timeout=SUBPROCESS_TIMEOUT_SECONDS,
        )

        if result.returncode != 0:
            raise RuntimeError(f"Subprocess parsing failed: {result.stderr}")

        return json.loads(result.stdout)

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract document content using Kreuzberg.

        Args:
            content: Document file bytes
            metadata: File metadata (should include content_type or extension)

        Returns:
            dict with text and extraction metadata
        """
        # Write bytes to temp file for kreuzberg
        # Detect extension from metadata
        content_type = metadata.get("content_type", "")
        suffix = get_extension(content_type, default=".pdf")

        with temp_file_from_bytes(content, suffix=suffix) as tmp_path:
            # Check if running in daemon process
            if self._is_daemon_process():
                logger.info("Daemon process detected - using subprocess workaround for document parsing")
                try:
                    result_dict = self._parse_in_subprocess(tmp_path)
                    text = result_dict["content"]
                    extraction_metadata = {
                        "table_count": len(result_dict["tables"]),
                        "parser": "kreuzberg_subprocess",
                        "file_extension": tmp_path.suffix,
                    }
                except Exception as e:
                    logger.error(f"Subprocess parsing failed: {e}. Falling back to text-only.")
                    # Fallback to simple text extraction (kreuzberg 3.x API)
                    from kreuzberg import ExtractionConfig, extract_file_sync
                    config = ExtractionConfig(extract_tables=False)
                    result = extract_file_sync(tmp_path, config=config)
                    text = result.content
                    extraction_metadata = {
                        "parser": "kreuzberg_fallback",
                        "file_extension": tmp_path.suffix,
                    }
            else:
                # Normal execution (not in daemon) - kreuzberg 4.x with native ONNX/Rust
                from kreuzberg import ExtractionConfig, extract_file_sync
                config = ExtractionConfig(
                    enable_quality_processing=True,  # Enables table extraction with native ONNX
                    chunk_content=False,  # We handle chunking ourselves
                    extract_tables=False,  # Disable table extraction to avoid PyTorch dependency
                )
                result = extract_file_sync(tmp_path, config=config)
                text = result.content
                extraction_metadata = {
                    "table_count": len(result.tables) if result.tables else 0,
                    "parser": "kreuzberg",
                    "file_extension": tmp_path.suffix,
                }

        return {
            "text": text,
            "metadata": extraction_metadata,
        }


class AudioProvider(ContentProvider):
    """
    Audio content provider using AudioChunker + OpenAI Whisper.

    Handles:
    - Audio chunking by silence near minute boundaries
    - Transcription via OpenAI Whisper API
    - Converts chunks to markdown format
    - Supports WAV, M4A, MP3, FLAC, OGG (via pydub + ffmpeg)

    Process:
    1. Write audio bytes to temp file
    2. Chunk audio by silence (AudioChunker)
    3. Transcribe chunks (AudioTranscriber)
    4. Combine into markdown format with timestamps
    5. Clean up temp files

    Returns markdown-formatted transcription that integrates
    seamlessly with ContentService's markdown → chunk → embed pipeline.
    """

    @property
    def name(self) -> str:
        return "audio"

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract audio content via transcription.

        Args:
            content: Audio file bytes
            metadata: File metadata (size, type, etc.)

        Returns:
            dict with:
            - text: Markdown-formatted transcription with timestamps
            - metadata: Extraction metadata (chunk_count, duration, cost)

        Raises:
            RuntimeError: If transcription fails or pydub not available
            ValueError: If OpenAI API key missing
        """
        # Handle empty or invalid content
        if not content or len(content) < WAV_HEADER_MIN_BYTES:
            logger.warning("Audio content too small to be valid WAV file")
            return {
                "text": "[Invalid or empty audio file]",
                "metadata": {"error": "invalid_content", "size": len(content)},
            }

        # Check for OpenAI API key (use settings)
        from rem.settings import settings
        api_key = settings.llm.openai_api_key
        if not api_key:
            logger.warning("No OpenAI API key found - audio transcription disabled")
            return {
                "text": "[Audio transcription requires LLM__OPENAI_API_KEY to be set]",
                "metadata": {"error": "missing_api_key"},
            }

        # Import audio services (lazy import)
        try:
            from rem.services.audio import AudioChunker, AudioTranscriber
        except ImportError as e:
            logger.error(f"Audio services not available: {e}")
            return {
                "text": "[Audio processing requires: pip install rem[audio]]",
                "metadata": {"error": "missing_dependencies"},
            }

        # Write bytes to temp file
        # Detect extension from metadata or use .wav as fallback
        content_type = metadata.get("content_type", "audio/wav")
        extension = get_extension(content_type, default=".wav")

        chunker = None
        chunks = None

        with temp_file_from_bytes(content, suffix=extension) as tmp_path:
            try:
                logger.info(f"Processing audio file: {tmp_path.name} ({len(content) / 1024 / 1024:.1f} MB)")

                # Step 1: Chunk audio by silence
                chunker = AudioChunker(
                    target_chunk_seconds=AUDIO_CHUNK_TARGET_SECONDS,
                    chunk_window_seconds=AUDIO_CHUNK_WINDOW_SECONDS,
                    silence_threshold_db=SILENCE_THRESHOLD_DB,
                    min_silence_ms=MIN_SILENCE_MS,
                )

                chunks = chunker.chunk_audio(tmp_path)
                logger.info(f"Created {len(chunks)} audio chunks")

                # Step 2: Transcribe chunks
                transcriber = AudioTranscriber(api_key=api_key)
                results = transcriber.transcribe_chunks(chunks)
                logger.info(f"Transcribed {len(results)} chunks")

                # Step 3: Combine into markdown format
                # Format: Each chunk becomes a section with timestamp
                markdown_parts = []
                for result in results:
                    timestamp = f"{result.start_seconds:.1f}s - {result.end_seconds:.1f}s"
                    markdown_parts.append(f"## [{timestamp}]\n\n{result.text}\n")

                markdown_text = "\n".join(markdown_parts)

                # Calculate metadata
                total_duration = sum(r.duration_seconds for r in results)
                estimated_cost = (total_duration / 60) * WHISPER_COST_PER_MINUTE
                successful_chunks = sum(1 for r in results if r.confidence > 0)

                extraction_metadata = {
                    "chunk_count": len(chunks),
                    "transcribed_chunks": successful_chunks,
                    "duration_seconds": total_duration,
                    "estimated_cost": estimated_cost,
                    "parser": "whisper_api",
                }

                logger.info(
                    f"Transcription complete: {successful_chunks}/{len(chunks)} chunks, "
                    f"${estimated_cost:.3f} cost"
                )

                return {
                    "text": markdown_text,
                    "metadata": extraction_metadata,
                }

            except Exception as e:
                logger.error(f"Audio extraction failed: {e}")
                raise RuntimeError(f"Audio transcription failed: {e}") from e

            finally:
                # Clean up audio chunks (temp file cleanup handled by context manager)
                if chunker is not None and chunks is not None:
                    try:
                        chunker.cleanup_chunks(chunks)
                    except Exception as e:
                        logger.warning(f"Chunk cleanup failed: {e}")


class SchemaProvider(ContentProvider):
    """
    Schema content provider for agent/evaluator schemas.

    Detects and processes YAML/JSON files containing:
    - Agent schemas (type: object with json_schema_extra.kind: agent and json_schema_extra.name: <name>)
    - Evaluator schemas (type: object with json_schema_extra.kind: evaluator and json_schema_extra.name: <name>)

    Stores schemas in the schemas table with deterministic IDs for upsert by name.

    Pattern:
    - Checks for schema markers (type: object + kind + name)
    - Generates deterministic ID for upsert (tenant+schema_name)
    - Stores full schema JSON in schemas table
    - Extracts metadata (version, tags, provider_configs, embedding_fields)
    """

    @property
    def name(self) -> str:
        return "schema"

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract and validate agent/evaluator schema.

        Args:
            content: YAML or JSON file bytes
            metadata: File metadata

        Returns:
            dict with:
            - text: Human-readable schema summary
            - metadata: Schema metadata
            - schema_data: Full schema dict for storage
            - is_schema: True if valid schema detected

        Raises:
            ValueError: If schema is invalid
        """
        import json
        import yaml
        from uuid import uuid5, NAMESPACE_DNS

        # Decode content
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError:
            text = content.decode("latin-1")

        # Try to parse as YAML/JSON
        if metadata.get("content_type") == "application/json":
            schema_data = json.loads(text)  # Raises JSONDecodeError on invalid JSON
        else:
            # Try YAML first (supports both YAML and JSON)
            schema_data = yaml.safe_load(text)  # Raises yaml.YAMLError on invalid YAML

        # Check if it's a schema (type: object + json_schema_extra.kind + json_schema_extra.name)
        if not isinstance(schema_data, dict):
            return {
                "text": text,
                "metadata": {"parser": "schema_fallback"},
                "is_schema": False,
            }

        # Check for schema markers
        is_object_type = schema_data.get("type") == "object"
        json_schema_extra = schema_data.get("json_schema_extra", {})
        kind = json_schema_extra.get("kind", "")
        schema_name = json_schema_extra.get("name", "")

        # Must have type: object, kind (agent or evaluator), and name
        is_agent_schema = is_object_type and kind == "agent" and schema_name
        is_evaluator_schema = is_object_type and kind == "evaluator" and schema_name

        if not (is_agent_schema or is_evaluator_schema):
            return {
                "text": text,
                "metadata": {"parser": "schema_fallback"},
                "is_schema": False,
            }

        # Extract schema metadata
        schema_type = kind  # "agent" or "evaluator"
        version = json_schema_extra.get("version", "1.0.0")
        tags = json_schema_extra.get("tags", [])

        # Use name directly (already in kebab-case format)
        short_name = schema_name

        # Build human-readable summary
        description = schema_data.get("description", "No description provided")
        description_preview = description[:200] + "..." if len(description) > 200 else description

        properties = schema_data.get("properties", {})
        required_fields = schema_data.get("required", [])

        summary_parts = [
            f"# {schema_type.title()} Schema: {short_name}",
            f"**Version:** {version}",
            f"**Name:** {schema_name}",
            f"**Kind:** {kind}",
            "",
            "## Description",
            description_preview,
            "",
            "## Output Fields",
        ]

        for field_name, field_spec in list(properties.items())[:10]:  # Limit to 10 fields
            field_type = field_spec.get("type", "unknown")
            field_desc = field_spec.get("description", "")
            required = " (required)" if field_name in required_fields else ""
            summary_parts.append(f"- **{field_name}**: {field_type}{required} - {field_desc[:50]}")

        if len(properties) > 10:
            summary_parts.append(f"- ... and {len(properties) - 10} more fields")

        text_summary = "\n".join(summary_parts)

        # Extract additional metadata
        extraction_metadata = {
            "parser": "schema",
            "schema_type": schema_type,
            "short_name": short_name,
            "version": version,
            "kind": kind,
            "name": schema_name,
            "tags": tags,
            "field_count": len(properties),
            "required_field_count": len(required_fields),
            "provider_configs": json_schema_extra.get("provider_configs", []),
            "embedding_fields": json_schema_extra.get("embedding_fields", []),
            "category": json_schema_extra.get("category"),
        }

        return {
            "text": text_summary,
            "metadata": extraction_metadata,
            "schema_data": schema_data,
            "is_schema": True,
        }


class ImageProvider(ContentProvider):
    """
    Image content provider with vision LLM analysis and CLIP embeddings.

    Features:
    - Tier-based vision analysis (gold tier always gets analysis)
    - Sampling-based vision analysis for non-gold users
    - Vision LLM description generation (Anthropic, Gemini, OpenAI)
    - Future: CLIP embeddings for semantic image search

    Process:
    1. Check user tier and sampling rate
    2. If eligible, run vision LLM analysis
    3. Extract image metadata (dimensions, format)
    4. Return markdown description or basic metadata
    5. Save to ImageResource table (not Resource)

    Vision analysis is expensive, so it's gated by:
    - User tier (gold = always, silver/free = sampled)
    - Sample rate setting (0.0 = never, 1.0 = always)
    """

    def __init__(self, user_tier: Optional[str] = None):
        """
        Initialize image provider.

        Args:
            user_tier: User tier (free, silver, gold) for vision gating
        """
        self.user_tier = user_tier

    @property
    def name(self) -> str:
        return "image"

    def _should_analyze_with_vision(self, sample_rate: float) -> bool:
        """
        Determine if image should get vision LLM analysis.

        Args:
            sample_rate: Sampling rate from settings (0.0-1.0)

        Returns:
            True if should analyze, False otherwise
        """
        # Import here to avoid circular dependency
        from rem.models.entities import UserTier

        # Gold tier always gets vision analysis
        if self.user_tier == UserTier.GOLD.value:
            logger.info("Gold tier user - vision analysis enabled")
            return True

        # For non-gold users, use sampling
        if sample_rate > 0.0:
            should_analyze = random.random() < sample_rate
            if should_analyze:
                logger.info(f"Vision analysis sampled (rate={sample_rate})")
            return should_analyze

        return False

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract image content with optional vision LLM analysis.

        Args:
            content: Image file bytes
            metadata: File metadata (size, type, etc.)

        Returns:
            dict with:
            - text: Markdown description (if vision enabled) or basic metadata
            - metadata: Extraction metadata (dimensions, format, vision info)
            - image_specific: Additional image metadata for ImageResource

        Raises:
            RuntimeError: If vision analysis fails
        """
        # Import settings here to avoid circular dependency
        from rem.settings import settings

        # Extract basic image metadata using PIL
        try:
            from PIL import Image
            import io

            img = Image.open(io.BytesIO(content))
            image_width = img.width
            image_height = img.height
            image_format = img.format or "UNKNOWN"
        except ImportError:
            logger.warning("PIL not available - image metadata extraction disabled")
            image_width = None
            image_height = None
            image_format = None
        except Exception as e:
            logger.warning(f"Failed to extract image metadata: {e}")
            image_width = None
            image_height = None
            image_format = None

        # Check if vision analysis should be performed
        sample_rate = settings.content.image_vllm_sample_rate
        should_analyze = self._should_analyze_with_vision(sample_rate)

        vision_description = None
        vision_provider = None
        vision_model = None

        if should_analyze:
            # Perform vision LLM analysis
            try:
                from rem.utils.vision import ImageAnalyzer, VisionProvider

                # Get provider from settings
                provider_str = settings.content.image_vllm_provider.lower()
                provider_map = {
                    "anthropic": VisionProvider.ANTHROPIC,
                    "gemini": VisionProvider.GEMINI,
                    "openai": VisionProvider.OPENAI,
                }
                provider = provider_map.get(provider_str, VisionProvider.ANTHROPIC)

                # Create analyzer
                analyzer = ImageAnalyzer(
                    provider=provider,
                    model=settings.content.image_vllm_model,
                )

                # Write bytes to temp file for analysis
                content_type = metadata.get("content_type", "image/png")
                extension = get_extension(content_type, default=".png")

                with temp_file_from_bytes(content, suffix=extension) as tmp_path:
                    # Analyze image
                    result = analyzer.analyze_image(tmp_path)
                    vision_description = result.description
                    vision_provider = result.provider.value
                    vision_model = result.model

                logger.info(f"Vision analysis complete: {len(vision_description)} chars")

            except ImportError as e:
                logger.warning(f"Vision analysis not available: {e}")
            except Exception as e:
                logger.error(f"Vision analysis failed: {e}")

        # Build text content
        if vision_description:
            # Use vision description as primary content
            text = f"# Image Analysis\n\n{vision_description}"
            if image_width and image_height:
                text += f"\n\n**Image Details:** {image_width}x{image_height} {image_format}"
        else:
            # Fallback to basic metadata
            if image_width and image_height:
                text = f"**Image:** {image_width}x{image_height} {image_format}"
            else:
                text = "**Image:** Metadata extraction unavailable"

        # Generate CLIP embedding (if Jina API key available)
        clip_embedding = None
        clip_dimensions = None
        clip_tokens = None

        try:
            from rem.utils.clip_embeddings import JinaCLIPEmbedder

            # Only attempt CLIP embeddings if using Jina provider
            if settings.content.clip_provider != "jina":
                logger.debug(
                    f"CLIP provider set to '{settings.content.clip_provider}' - "
                    "skipping Jina embeddings (self-hosted not yet implemented)"
                )
            else:
                embedder = JinaCLIPEmbedder(
                    api_key=settings.content.jina_api_key,
                    model=settings.content.clip_model,
                )

                if embedder.is_available():
                    # Write bytes to temp file for CLIP embedding
                    content_type = metadata.get("content_type", "image/png")
                    extension = get_extension(content_type, default=".png")

                    with temp_file_from_bytes(content, suffix=extension) as tmp_path:
                        # Generate CLIP embedding
                        result = embedder.embed_image(tmp_path)
                        if result:
                            clip_embedding = result.embedding  # type: ignore[attr-defined]
                            clip_dimensions = result.dimensions  # type: ignore[attr-defined]
                            clip_tokens = result.tokens_used  # type: ignore[attr-defined]
                            logger.info(
                                f"CLIP embedding generated: {clip_dimensions} dims, {clip_tokens} tokens"
                            )
                else:
                    logger.debug(
                        "CLIP embeddings disabled - set CONTENT__JINA_API_KEY to enable. "
                        "Get free API key at https://jina.ai/embeddings/"
                    )

        except ImportError:
            logger.debug("CLIP embedding module not available")
        except Exception as e:
            logger.warning(f"CLIP embedding generation failed (non-fatal): {e}")

        # Build extraction metadata
        extraction_metadata = {
            "parser": "image_provider",
            "vision_enabled": vision_description is not None,
            "vision_provider": vision_provider,
            "vision_model": vision_model,
            "image_width": image_width,
            "image_height": image_height,
            "image_format": image_format,
            "clip_enabled": clip_embedding is not None,
            "clip_dimensions": clip_dimensions,
            "clip_tokens": clip_tokens,
        }

        # Add image-specific metadata for ImageResource
        image_specific = {
            "image_width": image_width,
            "image_height": image_height,
            "image_format": image_format,
            "vision_description": vision_description,
            "vision_provider": vision_provider,
            "vision_model": vision_model,
            "clip_embedding": clip_embedding,
            "clip_dimensions": clip_dimensions,
        }

        return {
            "text": text,
            "metadata": extraction_metadata,
            "image_specific": image_specific,
        }
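
For orientation, the sketch below shows one way the providers in this diff could be exercised directly. It is a hypothetical usage example, not code shipped in the wheel: the provider classes and their extract(content, metadata) interface come from the module above (assumed to be rem/services/content/providers.py, the only +760-line Python file in the listing), while the sample markdown bytes, the sample agent-schema YAML, and the printed fields are illustrative assumptions. It also assumes remdb and its YAML dependency are installed.

# Hypothetical usage sketch (not part of the package) exercising the providers above.
from rem.services.content.providers import SchemaProvider, TextProvider

# Plain-text extraction returns {"text": ..., "metadata": {...}}.
md_bytes = b"# Notes\n\nSome markdown content.\n"
text_result = TextProvider().extract(md_bytes, {"content_type": "text/markdown"})
print(text_result["metadata"]["line_count"], text_result["metadata"]["heading_count"])

# SchemaProvider flags YAML/JSON as a schema only when it finds type: object plus
# json_schema_extra.kind ("agent" or "evaluator") and json_schema_extra.name.
agent_yaml = b"""
type: object
description: Minimal example agent schema.
json_schema_extra:
  kind: agent
  name: hello-world
  version: 1.0.0
properties:
  greeting:
    type: string
    description: Greeting to return.
required: [greeting]
"""
schema_result = SchemaProvider().extract(agent_yaml, {"content_type": "application/yaml"})
print(schema_result["is_schema"], schema_result["metadata"]["kind"])  # True, "agent"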