remdb 0.3.242__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/__init__.py +129 -0
- rem/agentic/README.md +760 -0
- rem/agentic/__init__.py +54 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +38 -0
- rem/agentic/agents/agent_manager.py +311 -0
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +425 -0
- rem/agentic/context_builder.py +360 -0
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +273 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +240 -0
- rem/agentic/providers/phoenix.py +926 -0
- rem/agentic/providers/pydantic_ai.py +854 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +737 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +242 -0
- rem/api/README.md +657 -0
- rem/api/deps.py +253 -0
- rem/api/main.py +460 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +820 -0
- rem/api/mcp_router/server.py +243 -0
- rem/api/mcp_router/tools.py +1605 -0
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +520 -0
- rem/api/routers/auth.py +898 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/child_streaming.py +394 -0
- rem/api/routers/chat/completions.py +702 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +202 -0
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +546 -0
- rem/api/routers/chat/streaming.py +950 -0
- rem/api/routers/chat/streaming_utils.py +327 -0
- rem/api/routers/common.py +18 -0
- rem/api/routers/dev.py +87 -0
- rem/api/routers/feedback.py +276 -0
- rem/api/routers/messages.py +620 -0
- rem/api/routers/models.py +86 -0
- rem/api/routers/query.py +362 -0
- rem/api/routers/shared_sessions.py +422 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +36 -0
- rem/auth/jwt.py +367 -0
- rem/auth/middleware.py +318 -0
- rem/auth/providers/__init__.py +16 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/email.py +215 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +517 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +299 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +549 -0
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +495 -0
- rem/cli/commands/db.py +828 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1698 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +388 -0
- rem/cli/commands/query.py +109 -0
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +230 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/commands/session.py +453 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +123 -0
- rem/config.py +244 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +70 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +672 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +246 -0
- rem/models/entities/__init__.py +68 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +64 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +181 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/session.py +84 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/subscriber.py +175 -0
- rem/models/entities/user.py +93 -0
- rem/py.typed +0 -0
- rem/registry.py +373 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/agent-builder.yaml +235 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +132 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +18 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +760 -0
- rem/services/content/service.py +762 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +322 -0
- rem/services/dreaming/moment_service.py +251 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/email/__init__.py +10 -0
- rem/services/email/service.py +522 -0
- rem/services/email/templates.py +360 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +127 -0
- rem/services/embeddings/worker.py +435 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +960 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +757 -0
- rem/services/postgres/__init__.py +49 -0
- rem/services/postgres/diff_service.py +599 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
- rem/services/postgres/register_type.py +353 -0
- rem/services/postgres/repository.py +481 -0
- rem/services/postgres/schema_generator.py +661 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +355 -0
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +318 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +180 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +608 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +13 -0
- rem/services/session/compression.py +488 -0
- rem/services/session/pydantic_messages.py +310 -0
- rem/services/session/reload.py +85 -0
- rem/services/user_service.py +130 -0
- rem/settings.py +1877 -0
- rem/sql/background_indexes.sql +52 -0
- rem/sql/migrations/001_install.sql +983 -0
- rem/sql/migrations/002_install_models.sql +3157 -0
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +282 -0
- rem/sql/migrations/005_schema_update.sql +145 -0
- rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +628 -0
- rem/utils/__init__.py +61 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +436 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/files.py +323 -0
- rem/utils/markdown.py +16 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +492 -0
- rem/utils/schema_loader.py +649 -0
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +350 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +325 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +7 -0
- rem/workers/db_listener.py +579 -0
- rem/workers/db_maintainer.py +74 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- rem/workers/unlogged_maintainer.py +463 -0
- remdb-0.3.242.dist-info/METADATA +1632 -0
- remdb-0.3.242.dist-info/RECORD +235 -0
- remdb-0.3.242.dist-info/WHEEL +4 -0
- remdb-0.3.242.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,762 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ContentService for file processing.
|
|
3
|
+
|
|
4
|
+
Pipeline:
|
|
5
|
+
1. Extract content via provider plugins
|
|
6
|
+
2. Convert to markdown
|
|
7
|
+
3. Chunk markdown
|
|
8
|
+
4. Save File + Resources to database via repositories
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from datetime import UTC, datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
from urllib.parse import urlparse
|
|
16
|
+
|
|
17
|
+
import boto3
|
|
18
|
+
from botocore.exceptions import ClientError
|
|
19
|
+
from loguru import logger
|
|
20
|
+
|
|
21
|
+
from rem.models.entities import File, Resource
|
|
22
|
+
from rem.services.postgres import Repository
|
|
23
|
+
from rem.settings import settings
|
|
24
|
+
from rem.utils.chunking import chunk_text
|
|
25
|
+
from rem.utils.markdown import to_markdown
|
|
26
|
+
|
|
27
|
+
from .providers import AudioProvider, ContentProvider, DocProvider, SchemaProvider, TextProvider
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ContentService:
|
|
31
|
+
"""
|
|
32
|
+
Service for processing files: extract → markdown → chunk → save.
|
|
33
|
+
|
|
34
|
+
Supports:
|
|
35
|
+
- S3 URIs (s3://bucket/key)
|
|
36
|
+
- Local file paths
|
|
37
|
+
- Pluggable content providers
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
def __init__(
|
|
41
|
+
self, file_repo: Repository | None = None, resource_repo: Repository | None = None
|
|
42
|
+
):
|
|
43
|
+
self.s3_client = self._create_s3_client()
|
|
44
|
+
self.providers: dict[str, ContentProvider] = {}
|
|
45
|
+
self.file_repo = file_repo
|
|
46
|
+
self.resource_repo = resource_repo
|
|
47
|
+
|
|
48
|
+
# Register default providers from settings
|
|
49
|
+
self._register_default_providers()
|
|
50
|
+
|
|
51
|
+
def _register_default_providers(self):
|
|
52
|
+
"""Register default content providers from settings."""
|
|
53
|
+
# Schema provider for agent/evaluator schemas (YAML/JSON)
|
|
54
|
+
# Register first so it takes priority for .yaml/.json files
|
|
55
|
+
schema_provider = SchemaProvider()
|
|
56
|
+
self.providers[".yaml"] = schema_provider
|
|
57
|
+
self.providers[".yml"] = schema_provider
|
|
58
|
+
self.providers[".json"] = schema_provider
|
|
59
|
+
|
|
60
|
+
# Text provider for plain text, code, data files
|
|
61
|
+
text_provider = TextProvider()
|
|
62
|
+
for ext in settings.content.supported_text_types:
|
|
63
|
+
# Don't override schema provider for yaml/json
|
|
64
|
+
if ext.lower() not in [".yaml", ".yml", ".json"]:
|
|
65
|
+
self.providers[ext.lower()] = text_provider
|
|
66
|
+
|
|
67
|
+
# Doc provider for PDFs, Office docs, images (via Kreuzberg)
|
|
68
|
+
doc_provider = DocProvider()
|
|
69
|
+
for ext in settings.content.supported_doc_types:
|
|
70
|
+
self.providers[ext.lower()] = doc_provider
|
|
71
|
+
|
|
72
|
+
# Audio provider for audio files (via Whisper API)
|
|
73
|
+
audio_provider = AudioProvider()
|
|
74
|
+
for ext in settings.content.supported_audio_types:
|
|
75
|
+
self.providers[ext.lower()] = audio_provider
|
|
76
|
+
|
|
77
|
+
logger.debug(
|
|
78
|
+
f"Registered {len(self.providers)} file extensions across "
|
|
79
|
+
f"schema (yaml/json), "
|
|
80
|
+
f"{len(settings.content.supported_text_types)} text, "
|
|
81
|
+
f"{len(settings.content.supported_doc_types)} doc, "
|
|
82
|
+
f"{len(settings.content.supported_audio_types)} audio types"
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
def _create_s3_client(self):
|
|
86
|
+
"""Create S3 client with IRSA or configured credentials."""
|
|
87
|
+
s3_config: dict[str, Any] = {
|
|
88
|
+
"region_name": settings.s3.region,
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
# Custom endpoint for MinIO/LocalStack
|
|
92
|
+
if settings.s3.endpoint_url:
|
|
93
|
+
s3_config["endpoint_url"] = settings.s3.endpoint_url
|
|
94
|
+
|
|
95
|
+
# Access keys (not needed with IRSA in EKS)
|
|
96
|
+
if settings.s3.access_key_id and settings.s3.secret_access_key:
|
|
97
|
+
s3_config["aws_access_key_id"] = settings.s3.access_key_id
|
|
98
|
+
s3_config["aws_secret_access_key"] = settings.s3.secret_access_key
|
|
99
|
+
|
|
100
|
+
# SSL configuration
|
|
101
|
+
s3_config["use_ssl"] = settings.s3.use_ssl
|
|
102
|
+
|
|
103
|
+
return boto3.client("s3", **s3_config)
|
|
104
|
+
|
|
105
|
+
def process_uri(self, uri: str) -> dict[str, Any]:
|
|
106
|
+
"""
|
|
107
|
+
Process a file URI and extract content.
|
|
108
|
+
|
|
109
|
+
Args:
|
|
110
|
+
uri: File URI (s3://bucket/key or local path)
|
|
111
|
+
|
|
112
|
+
Returns:
|
|
113
|
+
dict with:
|
|
114
|
+
- uri: Original URI
|
|
115
|
+
- content: Extracted text content
|
|
116
|
+
- metadata: File metadata (size, type, etc.)
|
|
117
|
+
- provider: Provider used for extraction
|
|
118
|
+
|
|
119
|
+
Raises:
|
|
120
|
+
ValueError: If URI format is invalid
|
|
121
|
+
FileNotFoundError: If file doesn't exist
|
|
122
|
+
RuntimeError: If no provider available for file type
|
|
123
|
+
"""
|
|
124
|
+
logger.info(f"Processing URI: {uri}")
|
|
125
|
+
|
|
126
|
+
# Determine if S3 or local file
|
|
127
|
+
if uri.startswith("s3://"):
|
|
128
|
+
return self._process_s3_uri(uri)
|
|
129
|
+
else:
|
|
130
|
+
return self._process_local_file(uri)
|
|
131
|
+
|
|
132
|
+
def _process_s3_uri(self, uri: str) -> dict[str, Any]:
|
|
133
|
+
"""Process S3 URI."""
|
|
134
|
+
parsed = urlparse(uri)
|
|
135
|
+
bucket = parsed.netloc
|
|
136
|
+
key = parsed.path.lstrip("/")
|
|
137
|
+
|
|
138
|
+
if not bucket or not key:
|
|
139
|
+
raise ValueError(f"Invalid S3 URI: {uri}")
|
|
140
|
+
|
|
141
|
+
logger.debug(f"Downloading s3://{bucket}/{key}")
|
|
142
|
+
|
|
143
|
+
try:
|
|
144
|
+
# Download file from S3
|
|
145
|
+
response = self.s3_client.get_object(Bucket=bucket, Key=key)
|
|
146
|
+
content_bytes = response["Body"].read()
|
|
147
|
+
|
|
148
|
+
# Get metadata
|
|
149
|
+
metadata = {
|
|
150
|
+
"size": response["ContentLength"],
|
|
151
|
+
"content_type": response.get("ContentType", ""),
|
|
152
|
+
"last_modified": response["LastModified"].isoformat(),
|
|
153
|
+
"etag": response.get("ETag", "").strip('"'),
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
# Extract content using provider
|
|
157
|
+
file_path = Path(key)
|
|
158
|
+
provider = self._get_provider(file_path.suffix)
|
|
159
|
+
|
|
160
|
+
extracted_content = provider.extract(content_bytes, metadata)
|
|
161
|
+
|
|
162
|
+
# Build result with standard fields
|
|
163
|
+
result = {
|
|
164
|
+
"uri": uri,
|
|
165
|
+
"content": extracted_content["text"],
|
|
166
|
+
"metadata": {**metadata, **extracted_content.get("metadata", {})},
|
|
167
|
+
"provider": provider.name,
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
# Preserve schema-specific fields if present (from SchemaProvider)
|
|
171
|
+
if "is_schema" in extracted_content:
|
|
172
|
+
result["is_schema"] = extracted_content["is_schema"]
|
|
173
|
+
if "schema_data" in extracted_content:
|
|
174
|
+
result["schema_data"] = extracted_content["schema_data"]
|
|
175
|
+
|
|
176
|
+
return result
|
|
177
|
+
|
|
178
|
+
except ClientError as e:
|
|
179
|
+
error_code = e.response.get("Error", {}).get("Code", "")
|
|
180
|
+
if error_code == "NoSuchKey":
|
|
181
|
+
raise FileNotFoundError(f"S3 object not found: {uri}") from e
|
|
182
|
+
elif error_code == "NoSuchBucket":
|
|
183
|
+
raise FileNotFoundError(f"S3 bucket not found: {bucket}") from e
|
|
184
|
+
else:
|
|
185
|
+
raise RuntimeError(f"S3 error: {e}") from e
|
|
186
|
+
|
|
187
|
+
def _process_local_file(self, path: str) -> dict[str, Any]:
|
|
188
|
+
"""
|
|
189
|
+
Process local file path.
|
|
190
|
+
|
|
191
|
+
**PATH HANDLING FIX**: This method correctly handles both file:// URIs
|
|
192
|
+
and plain paths. Previously, file:// URIs from tools.py were NOT stripped,
|
|
193
|
+
causing FileNotFoundError because Path() treated "file:///Users/..." as a
|
|
194
|
+
literal filename instead of a URI.
|
|
195
|
+
|
|
196
|
+
The fix ensures consistent path handling:
|
|
197
|
+
- MCP tool creates: file:///Users/.../file.pdf
|
|
198
|
+
- This method strips: file:// → /Users/.../file.pdf
|
|
199
|
+
- Path() works correctly with absolute path
|
|
200
|
+
|
|
201
|
+
Related files:
|
|
202
|
+
- tools.py line 636: Creates file:// URIs
|
|
203
|
+
- FileSystemService line 58: Also strips file:// URIs
|
|
204
|
+
"""
|
|
205
|
+
# Handle file:// URI scheme
|
|
206
|
+
if path.startswith("file://"):
|
|
207
|
+
path = path.replace("file://", "")
|
|
208
|
+
|
|
209
|
+
file_path = Path(path)
|
|
210
|
+
|
|
211
|
+
if not file_path.exists():
|
|
212
|
+
raise FileNotFoundError(f"File not found: {path}")
|
|
213
|
+
|
|
214
|
+
if not file_path.is_file():
|
|
215
|
+
raise ValueError(f"Not a file: {path}")
|
|
216
|
+
|
|
217
|
+
logger.debug(f"Reading local file: {file_path}")
|
|
218
|
+
|
|
219
|
+
# Read file content
|
|
220
|
+
content_bytes = file_path.read_bytes()
|
|
221
|
+
|
|
222
|
+
# Get metadata
|
|
223
|
+
stat = file_path.stat()
|
|
224
|
+
metadata = {
|
|
225
|
+
"size": stat.st_size,
|
|
226
|
+
"modified": stat.st_mtime,
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
# Extract content using provider
|
|
230
|
+
provider = self._get_provider(file_path.suffix)
|
|
231
|
+
extracted_content = provider.extract(content_bytes, metadata)
|
|
232
|
+
|
|
233
|
+
# Build result with standard fields
|
|
234
|
+
result = {
|
|
235
|
+
"uri": str(file_path.absolute()),
|
|
236
|
+
"content": extracted_content["text"],
|
|
237
|
+
"metadata": {**metadata, **extracted_content.get("metadata", {})},
|
|
238
|
+
"provider": provider.name,
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
# Preserve schema-specific fields if present (from SchemaProvider)
|
|
242
|
+
if "is_schema" in extracted_content:
|
|
243
|
+
result["is_schema"] = extracted_content["is_schema"]
|
|
244
|
+
if "schema_data" in extracted_content:
|
|
245
|
+
result["schema_data"] = extracted_content["schema_data"]
|
|
246
|
+
|
|
247
|
+
return result
|
|
248
|
+
|
|
249
|
+
def _get_provider(self, suffix: str) -> ContentProvider:
|
|
250
|
+
"""Get content provider for file extension."""
|
|
251
|
+
suffix_lower = suffix.lower()
|
|
252
|
+
|
|
253
|
+
if suffix_lower not in self.providers:
|
|
254
|
+
raise RuntimeError(
|
|
255
|
+
f"No provider available for file type: {suffix}. "
|
|
256
|
+
f"Supported: {', '.join(self.providers.keys())}"
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
return self.providers[suffix_lower]
|
|
260
|
+
|
|
261
|
+
def register_provider(self, extensions: list[str], provider: ContentProvider):
|
|
262
|
+
"""
|
|
263
|
+
Register a custom content provider.
|
|
264
|
+
|
|
265
|
+
Args:
|
|
266
|
+
extensions: List of file extensions (e.g., ['.pdf', '.docx'])
|
|
267
|
+
provider: ContentProvider instance
|
|
268
|
+
"""
|
|
269
|
+
for ext in extensions:
|
|
270
|
+
ext_lower = ext.lower() if ext.startswith(".") else f".{ext.lower()}"
|
|
271
|
+
self.providers[ext_lower] = provider
|
|
272
|
+
logger.debug(f"Registered provider '{provider.name}' for {ext_lower}")
|
|
273
|
+
|
|
274
|
+
async def ingest_file(
|
|
275
|
+
self,
|
|
276
|
+
file_uri: str,
|
|
277
|
+
user_id: str | None = None,
|
|
278
|
+
category: str | None = None,
|
|
279
|
+
tags: list[str] | None = None,
|
|
280
|
+
is_local_server: bool = False,
|
|
281
|
+
resource_type: str | None = None,
|
|
282
|
+
) -> dict[str, Any]:
|
|
283
|
+
"""
|
|
284
|
+
Complete file ingestion pipeline: read → store → parse → chunk → embed.
|
|
285
|
+
|
|
286
|
+
**IMPORTANT: Data is PUBLIC by default (user_id=None).**
|
|
287
|
+
This is correct for shared knowledge bases (ontologies, procedures, reference data).
|
|
288
|
+
Private user-scoped data is rarely needed - only set user_id for truly personal content.
|
|
289
|
+
|
|
290
|
+
**CENTRALIZED INGESTION**: This is the single entry point for all file ingestion
|
|
291
|
+
in REM. It handles:
|
|
292
|
+
|
|
293
|
+
1. **File Reading**: From local/S3/HTTP sources via FileSystemService
|
|
294
|
+
2. **Storage**: Writes to user-scoped internal storage (~/.rem/fs/ or S3)
|
|
295
|
+
3. **Parsing**: Extracts content, metadata, tables, images (parsing state)
|
|
296
|
+
4. **Chunking**: Splits content into semantic chunks for embedding
|
|
297
|
+
5. **Database**: Creates File entity + Resource chunks with embeddings
|
|
298
|
+
|
|
299
|
+
**PARSING STATE - The Innovation**:
|
|
300
|
+
Files (PDF, WAV, DOCX, etc.) are converted to rich parsing state:
|
|
301
|
+
- **Content**: Markdown-formatted text (preserves structure)
|
|
302
|
+
- **Metadata**: File info, extraction details, timestamps
|
|
303
|
+
- **Tables**: Structured data extracted from documents (CSV format)
|
|
304
|
+
- **Images**: Extracted images saved to storage (for multimodal RAG)
|
|
305
|
+
- **Provider Info**: Which parser was used, version, settings
|
|
306
|
+
|
|
307
|
+
This parsing state enables agents to deeply understand documents:
|
|
308
|
+
- Query tables directly (structured data)
|
|
309
|
+
- Reference images (multimodal context)
|
|
310
|
+
- Understand document structure (markdown hierarchy)
|
|
311
|
+
- Track provenance (metadata lineage)
|
|
312
|
+
|
|
313
|
+
**CLIENT ABSTRACTION**: Clients (MCP tools, CLI, workers) don't worry about:
|
|
314
|
+
- Where files are stored (S3 vs local) - automatically selected
|
|
315
|
+
- How files are parsed (PDF vs DOCX) - provider auto-selected
|
|
316
|
+
- How chunks are created - semantic chunking with tiktoken
|
|
317
|
+
- How embeddings work - async worker with batching
|
|
318
|
+
|
|
319
|
+
Clients just call `ingest_file()` and get searchable resources.
|
|
320
|
+
|
|
321
|
+
**PERMISSION CHECK**: Remote MCP servers cannot read local files (security).
|
|
322
|
+
Only local/stdio MCP servers can access local filesystem paths.
|
|
323
|
+
|
|
324
|
+
Args:
|
|
325
|
+
file_uri: Source file location (local path, s3://, or https://)
|
|
326
|
+
user_id: User identifier for PRIVATE data only. Default None = PUBLIC/shared.
|
|
327
|
+
Leave as None for shared knowledge bases, ontologies, reference data.
|
|
328
|
+
Only set for truly private user-specific content.
|
|
329
|
+
category: Optional category tag (document, code, audio, etc.)
|
|
330
|
+
tags: Optional list of tags
|
|
331
|
+
is_local_server: True if running as local/stdio MCP server
|
|
332
|
+
resource_type: Optional resource type (case-insensitive). Supports:
|
|
333
|
+
- "resource", "resources", "Resource" → Resource (default)
|
|
334
|
+
- "domain-resource", "domain_resource", "DomainResource" → DomainResource
|
|
335
|
+
|
|
336
|
+
Returns:
|
|
337
|
+
dict with:
|
|
338
|
+
- file_id: UUID of created File entity
|
|
339
|
+
- file_name: Original filename
|
|
340
|
+
- storage_uri: Internal storage location
|
|
341
|
+
- internal_key: S3 key or local path
|
|
342
|
+
- size_bytes: File size
|
|
343
|
+
- content_type: MIME type
|
|
344
|
+
- processing_status: "completed" or "failed"
|
|
345
|
+
- resources_created: Number of Resource chunks created
|
|
346
|
+
- parsing_metadata: Rich parsing state (content, tables, images)
|
|
347
|
+
- content: Parsed file content (markdown format) if status is "completed"
|
|
348
|
+
|
|
349
|
+
Raises:
|
|
350
|
+
PermissionError: If remote server tries to read local file
|
|
351
|
+
FileNotFoundError: If source file doesn't exist
|
|
352
|
+
RuntimeError: If storage or processing fails
|
|
353
|
+
|
|
354
|
+
Example:
|
|
355
|
+
>>> service = ContentService()
|
|
356
|
+
>>> # PUBLIC data (default) - visible to all users
|
|
357
|
+
>>> result = await service.ingest_file(
|
|
358
|
+
... file_uri="s3://bucket/procedure.pdf",
|
|
359
|
+
... category="medical"
|
|
360
|
+
... )
|
|
361
|
+
>>> print(f"Created {result['resources_created']} searchable chunks")
|
|
362
|
+
>>>
|
|
363
|
+
>>> # PRIVATE data (rare) - only for user-specific content
|
|
364
|
+
>>> result = await service.ingest_file(
|
|
365
|
+
... file_uri="s3://bucket/personal-notes.pdf",
|
|
366
|
+
... user_id="user-123", # Only this user can access
|
|
367
|
+
... category="personal"
|
|
368
|
+
... )
|
|
369
|
+
"""
|
|
370
|
+
from pathlib import Path
|
|
371
|
+
from uuid import uuid4
|
|
372
|
+
import mimetypes
|
|
373
|
+
|
|
374
|
+
from ...models.entities import File
|
|
375
|
+
from ...services.fs import FileSystemService
|
|
376
|
+
from ...services.postgres import PostgresService
|
|
377
|
+
|
|
378
|
+
# Step 1: Read file from source using FileSystemService
|
|
379
|
+
fs_service = FileSystemService()
|
|
380
|
+
file_content, file_name, source_type = await fs_service.read_uri(
|
|
381
|
+
file_uri, is_local_server=is_local_server
|
|
382
|
+
)
|
|
383
|
+
file_size = len(file_content)
|
|
384
|
+
logger.info(f"Read {file_size} bytes from {file_uri} (source: {source_type})")
|
|
385
|
+
|
|
386
|
+
# Step 1.5: Early schema detection for YAML/JSON files
|
|
387
|
+
# Skip File entity creation for schemas (agents/evaluators)
|
|
388
|
+
file_suffix = Path(file_name).suffix.lower()
|
|
389
|
+
if file_suffix in ['.yaml', '.yml', '.json']:
|
|
390
|
+
import yaml
|
|
391
|
+
import json
|
|
392
|
+
try:
|
|
393
|
+
content_text = file_content.decode('utf-8') if isinstance(file_content, bytes) else file_content
|
|
394
|
+
data = yaml.safe_load(content_text) if file_suffix in ['.yaml', '.yml'] else json.loads(content_text)
|
|
395
|
+
if isinstance(data, dict):
|
|
396
|
+
json_schema_extra = data.get('json_schema_extra', {})
|
|
397
|
+
kind = json_schema_extra.get('kind', '')
|
|
398
|
+
if kind in ['agent', 'evaluator']:
|
|
399
|
+
# Route directly to schema processing, skip File entity
|
|
400
|
+
logger.info(f"Detected {kind} schema: {file_name}, routing to _process_schema")
|
|
401
|
+
result = self.process_uri(file_uri)
|
|
402
|
+
return await self._process_schema(result, file_uri, user_id)
|
|
403
|
+
except Exception as e:
|
|
404
|
+
logger.debug(f"Early schema detection failed for {file_name}: {e}")
|
|
405
|
+
# Fall through to standard file processing
|
|
406
|
+
|
|
407
|
+
# Step 2: Write to internal storage (public or user-scoped)
|
|
408
|
+
file_id = str(uuid4())
|
|
409
|
+
storage_uri, internal_key, content_type, _ = await fs_service.write_to_internal_storage(
|
|
410
|
+
content=file_content,
|
|
411
|
+
tenant_id=user_id or "public", # Storage path: public/ or user_id/
|
|
412
|
+
file_name=file_name,
|
|
413
|
+
file_id=file_id,
|
|
414
|
+
)
|
|
415
|
+
logger.info(f"Stored to internal storage: {storage_uri}")
|
|
416
|
+
|
|
417
|
+
# Step 3: Create File entity
|
|
418
|
+
file_entity = File(
|
|
419
|
+
id=file_id,
|
|
420
|
+
tenant_id=user_id, # None = public/shared
|
|
421
|
+
user_id=user_id,
|
|
422
|
+
name=file_name,
|
|
423
|
+
uri=storage_uri,
|
|
424
|
+
mime_type=content_type,
|
|
425
|
+
size_bytes=file_size,
|
|
426
|
+
metadata={
|
|
427
|
+
"source_uri": file_uri,
|
|
428
|
+
"source_type": source_type,
|
|
429
|
+
"category": category,
|
|
430
|
+
"storage_uri": storage_uri,
|
|
431
|
+
"s3_key": internal_key,
|
|
432
|
+
"s3_bucket": (
|
|
433
|
+
storage_uri.split("/")[2] if storage_uri.startswith("s3://") else "local"
|
|
434
|
+
),
|
|
435
|
+
},
|
|
436
|
+
tags=tags or [],
|
|
437
|
+
)
|
|
438
|
+
|
|
439
|
+
# Step 4: Save File entity to database
|
|
440
|
+
from rem.services.postgres import get_postgres_service
|
|
441
|
+
from rem.services.postgres.repository import Repository
|
|
442
|
+
|
|
443
|
+
postgres_service = get_postgres_service()
|
|
444
|
+
if not postgres_service:
|
|
445
|
+
raise RuntimeError("PostgreSQL is disabled. Cannot save File entity to database.")
|
|
446
|
+
|
|
447
|
+
await postgres_service.connect()
|
|
448
|
+
try:
|
|
449
|
+
repo = Repository(File, "files", db=postgres_service)
|
|
450
|
+
await repo.upsert(file_entity)
|
|
451
|
+
finally:
|
|
452
|
+
await postgres_service.disconnect()
|
|
453
|
+
|
|
454
|
+
# Step 5: Process file to create Resource chunks
|
|
455
|
+
try:
|
|
456
|
+
processing_result = await self.process_and_save(
|
|
457
|
+
uri=storage_uri,
|
|
458
|
+
user_id=user_id,
|
|
459
|
+
resource_type=resource_type,
|
|
460
|
+
)
|
|
461
|
+
processing_status = processing_result.get("status", "completed")
|
|
462
|
+
resources_created = processing_result.get("chunk_count", 0)
|
|
463
|
+
parsing_metadata = {
|
|
464
|
+
"content_extracted": bool(processing_result.get("content")),
|
|
465
|
+
"markdown_generated": bool(processing_result.get("markdown")),
|
|
466
|
+
"chunks_created": resources_created,
|
|
467
|
+
}
|
|
468
|
+
except Exception as e:
|
|
469
|
+
logger.error(f"File processing failed: {e}", exc_info=True)
|
|
470
|
+
processing_status = "failed"
|
|
471
|
+
resources_created = 0
|
|
472
|
+
parsing_metadata = {"error": str(e)}
|
|
473
|
+
|
|
474
|
+
logger.info(
|
|
475
|
+
f"File ingestion complete: {file_name} "
|
|
476
|
+
f"(user: {user_id}, status: {processing_status}, "
|
|
477
|
+
f"resources: {resources_created})"
|
|
478
|
+
)
|
|
479
|
+
|
|
480
|
+
# Extract content if available
|
|
481
|
+
content = None
|
|
482
|
+
if processing_status == "completed" and processing_result:
|
|
483
|
+
content = processing_result.get("content")
|
|
484
|
+
|
|
485
|
+
return {
|
|
486
|
+
"file_id": file_id,
|
|
487
|
+
"file_name": file_name,
|
|
488
|
+
"storage_uri": storage_uri,
|
|
489
|
+
"internal_key": internal_key,
|
|
490
|
+
"size_bytes": file_size,
|
|
491
|
+
"content_type": content_type,
|
|
492
|
+
"source_uri": file_uri,
|
|
493
|
+
"source_type": source_type,
|
|
494
|
+
"processing_status": processing_status,
|
|
495
|
+
"resources_created": resources_created,
|
|
496
|
+
"parsing_metadata": parsing_metadata,
|
|
497
|
+
"content": content, # Include parsed content when available
|
|
498
|
+
"message": f"File ingested and {processing_status}. Created {resources_created} resources.",
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
async def process_and_save(
    self,
    uri: str,
    user_id: str | None = None,
    resource_type: str | None = None,
) -> dict[str, Any]:
    """
    Process file end-to-end: extract → markdown → chunk → save.

    **INTERNAL METHOD**: This is called by ingest_file() after storage.
    Clients should use ingest_file() instead for the full pipeline.

    **KIND-BASED ROUTING**: For YAML/JSON files, checks for 'kind' field and routes to:
    - kind=agent or kind=evaluator → Save to schemas table (not resources)
    - kind=engram → Process via EngramProcessor (creates resources + moments)
    - No kind → Standard resource processing (default)

    Args:
        uri: File URI (s3://bucket/key or local path)
        user_id: Optional user ID for multi-tenancy (None = public/shared)
        resource_type: Optional resource type (case-insensitive). Defaults to "Resource".
            Supports: resource, domain-resource, domain_resource, DomainResource, etc.

    Returns:
        dict with file metadata, chunk count, extracted content, generated
        markdown, and processing status.
    """
    logger.info(f"Processing and saving: {uri}")

    # Extract content from the file (provider selected by URI/suffix)
    result = self.process_uri(uri)
    filename = Path(uri).name

    # Check for custom kind-based processing (YAML/JSON only)
    file_suffix = Path(uri).suffix.lower()
    if file_suffix in ['.yaml', '.yml', '.json']:
        # Check if schema provider detected a valid schema.
        # is_schema flag is at top level of result (preserved from SchemaProvider)
        if result.get('is_schema'):
            logger.info(
                f"🔧 Custom provider flow initiated: "
                f"kind={result.get('metadata', {}).get('kind')} for {filename}"
            )
            return await self._process_schema(result, uri, user_id)

        # Check for engram kind in the raw parsed data
        import yaml
        import json
        try:
            # Parse the content to check for kind
            content_text = result.get('content', '')
            if file_suffix == '.json':
                data = json.loads(content_text)
            else:
                data = yaml.safe_load(content_text)

            if isinstance(data, dict) and data.get('kind') == 'engram':
                logger.info(f"🔧 Custom provider flow initiated: kind=engram for {filename}")
                return await self._process_engram(data, uri, user_id)
        except Exception as e:
            logger.debug(f"Could not parse {filename} for kind check: {e}")
            # Fall through to standard processing

    # Convert to markdown
    markdown = to_markdown(result["content"], filename)

    # Chunk markdown
    chunks = chunk_text(markdown)
    logger.info(f"Created {len(chunks)} chunks from {filename}")

    # Save File entity
    file = File(
        name=filename,
        uri=uri,
        content=result["content"],
        size_bytes=result["metadata"].get("size"),
        mime_type=result["metadata"].get("content_type"),
        processing_status="completed",
        tenant_id=user_id,  # None = public/shared
        user_id=user_id,
    )

    if self.file_repo:
        await self.file_repo.upsert(file)
        logger.info(f"Saved File: {filename}")

    # Resolve resource model class from type parameter (case-insensitive)
    from typing import cast, Type
    from pydantic import BaseModel
    from rem.utils.model_helpers import model_from_arbitrary_casing, get_table_name

    resource_model: Type[BaseModel] = Resource  # Default
    if resource_type:
        try:
            resource_model = model_from_arbitrary_casing(resource_type)
            logger.info(f"Using resource model: {resource_model.__name__}")
        except ValueError as e:
            logger.warning(f"Invalid resource_type '{resource_type}', using default Resource: {e}")
            resource_model = Resource

    # Get table name for the resolved model
    table_name = get_table_name(resource_model)

    # Create resource entities for each chunk.
    # name must be unique per file+chunk (upserts below key on "name"),
    # so it embeds the filename and chunk ordinal, mirroring the uri.
    resources: list[BaseModel] = [
        resource_model(
            name=f"{filename}#chunk-{i}",
            uri=f"{uri}#chunk-{i}",
            ordinal=i,
            content=chunk,
            category="document",
            tenant_id=user_id,  # None = public/shared
            user_id=user_id,
        )
        for i, chunk in enumerate(chunks)
    ]

    # Save resources to the appropriate table
    if resources:
        from rem.services.postgres import get_postgres_service

        postgres = get_postgres_service()
        if postgres:
            await postgres.connect()
            try:
                await postgres.batch_upsert(
                    records=cast(list[BaseModel | dict], resources),
                    model=resource_model,
                    table_name=table_name,
                    entity_key_field="name",
                    embeddable_fields=["content"],
                    generate_embeddings=True,
                )
                logger.info(f"Saved {len(resources)} {resource_model.__name__} chunks to {table_name}")
                logger.info(f"Queued {len(resources)} embedding generation tasks for content field")
            finally:
                await postgres.disconnect()
        elif self.resource_repo:
            # Fallback to injected repo (only works for default Resource)
            await self.resource_repo.upsert(
                resources,
                embeddable_fields=["content"],
                generate_embeddings=True,
            )
            logger.info(f"Saved {len(resources)} Resource chunks")
            logger.info(f"Queued {len(resources)} embedding generation tasks for content field")

    return {
        "file": file.model_dump(),
        "chunk_count": len(chunks),
        "content": result["content"],
        "markdown": markdown,
        "status": "completed",
    }
|
652
|
+
async def _process_schema(
    self, result: dict[str, Any], uri: str, user_id: str | None = None
) -> dict[str, Any]:
    """
    Process agent/evaluator schema and save to schemas table.

    Args:
        result: Extraction result from SchemaProvider with schema_data
        uri: File URI
        user_id: Optional user ID for multi-tenancy

    Returns:
        dict with schema save result (name, kind, version, status, message)

    Raises:
        RuntimeError: If the PostgreSQL service is disabled/unavailable.
    """
    from rem.models.entities import Schema
    from rem.services.postgres import get_postgres_service

    metadata = result.get("metadata", {})
    schema_data = result.get("schema_data", {})

    kind = metadata.get("kind")
    name = metadata.get("name")
    version = metadata.get("version", "1.0.0")

    logger.info(f"Saving schema to schemas table: kind={kind}, name={name}, version={version}")

    # Create Schema entity
    # IMPORTANT: category field distinguishes agents from evaluators
    # - kind=agent → category="agent" (AI agents with tools/resources)
    # - kind=evaluator → category="evaluator" (LLM-as-a-Judge evaluators)
    # User-scoped schemas: if user_id provided, scope to user's tenant
    # System schemas: if no user_id, use "system" tenant for shared access
    schema_entity = Schema(
        tenant_id=user_id or "system",
        user_id=user_id,
        name=name,
        spec=schema_data,
        category=kind,  # Maps kind → category for database filtering
        provider_configs=metadata.get("provider_configs", []),
        embedding_fields=metadata.get("embedding_fields", []),
        metadata={
            "uri": uri,
            "version": version,
            "tags": metadata.get("tags", []),
        },
    )

    # Save to schemas table
    postgres = get_postgres_service()
    if not postgres:
        raise RuntimeError("PostgreSQL is disabled. Cannot save Schema entity to database.")

    await postgres.connect()
    try:
        await postgres.batch_upsert(
            records=[schema_entity],
            model=Schema,
            table_name="schemas",
            entity_key_field="name",
            generate_embeddings=False,
        )
        logger.info(f"✅ Schema saved: {name} (kind={kind})")
    finally:
        # Always release the connection, even if the upsert fails
        await postgres.disconnect()

    return {
        "schema_name": name,
        "kind": kind,
        "version": version,
        "status": "completed",
        "message": f"Schema '{name}' saved to schemas table",
    }
|
|
726
|
+
|
|
727
|
+
async def _process_engram(
    self, data: dict[str, Any], uri: str, user_id: str | None = None
) -> dict[str, Any]:
    """
    Process engram and save to resources + moments tables.

    Args:
        data: Parsed engram data with kind=engram
        uri: File URI
        user_id: Optional user ID for multi-tenancy (None = public/shared)

    Returns:
        dict with engram processing result (from EngramProcessor)

    Raises:
        RuntimeError: If the PostgreSQL service is disabled/unavailable.
    """
    from rem.workers.engram_processor import EngramProcessor
    from rem.services.postgres import get_postgres_service

    logger.info(f"Processing engram: {data.get('name')}")

    postgres = get_postgres_service()
    if not postgres:
        raise RuntimeError("PostgreSQL is disabled. Cannot process engram.")

    await postgres.connect()
    try:
        processor = EngramProcessor(postgres)
        result = await processor.process_engram(
            data=data,
            tenant_id=user_id,  # None = public/shared
            user_id=user_id,
        )
        logger.info(f"✅ Engram processed: {result.get('resource_id')} with {len(result.get('moment_ids', []))} moments")
        return result
    finally:
        # Always release the connection, even if processing fails
        await postgres.disconnect()