remdb-0.2.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of remdb may be problematic.

Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +565 -0
  44. rem/cli/commands/configure.py +423 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1124 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +88 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +657 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +229 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.2.6.dist-info/METADATA +1191 -0
  185. remdb-0.2.6.dist-info/RECORD +187 -0
  186. remdb-0.2.6.dist-info/WHEEL +4 -0
  187. remdb-0.2.6.dist-info/entry_points.txt +2 -0
rem/services/content/service.py (new file)
@@ -0,0 +1,657 @@
"""
ContentService for file processing.

Pipeline:
1. Extract content via provider plugins
2. Convert to markdown
3. Chunk markdown
4. Save File + Resources to database via repositories
"""

import json
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

import boto3
from botocore.exceptions import ClientError
from loguru import logger

from rem.models.entities import File, Resource
from rem.services.postgres import Repository
from rem.settings import settings
from rem.utils.chunking import chunk_text
from rem.utils.markdown import to_markdown

from .providers import AudioProvider, ContentProvider, DocProvider, SchemaProvider, TextProvider


class ContentService:
    """
    Service for processing files: extract → markdown → chunk → save.

    Supports:
    - S3 URIs (s3://bucket/key)
    - Local file paths
    - Pluggable content providers
    """

    def __init__(
        self, file_repo: Repository | None = None, resource_repo: Repository | None = None
    ):
        self.s3_client = self._create_s3_client()
        self.providers: dict[str, ContentProvider] = {}
        self.file_repo = file_repo
        self.resource_repo = resource_repo

        # Register default providers from settings
        self._register_default_providers()

    def _register_default_providers(self):
        """Register default content providers from settings."""
        # Schema provider for agent/evaluator schemas (YAML/JSON).
        # Registered first so it takes priority for .yaml/.json files.
        schema_provider = SchemaProvider()
        self.providers[".yaml"] = schema_provider
        self.providers[".yml"] = schema_provider
        self.providers[".json"] = schema_provider

        # Text provider for plain text, code, and data files
        text_provider = TextProvider()
        for ext in settings.content.supported_text_types:
            # Don't override the schema provider for yaml/json
            if ext.lower() not in [".yaml", ".yml", ".json"]:
                self.providers[ext.lower()] = text_provider

        # Doc provider for PDFs, Office docs, images (via Kreuzberg)
        doc_provider = DocProvider()
        for ext in settings.content.supported_doc_types:
            self.providers[ext.lower()] = doc_provider

        # Audio provider for audio files (via Whisper API)
        audio_provider = AudioProvider()
        for ext in settings.content.supported_audio_types:
            self.providers[ext.lower()] = audio_provider

        logger.debug(
            f"Registered {len(self.providers)} file extensions across "
            f"schema (yaml/json), "
            f"{len(settings.content.supported_text_types)} text, "
            f"{len(settings.content.supported_doc_types)} doc, "
            f"{len(settings.content.supported_audio_types)} audio types"
        )
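The registry above maps lowercase extensions to provider objects. From the call sites later in this file, a provider needs a `name` attribute and an `extract(content_bytes, metadata)` method returning a dict with a `"text"` key and optional `"metadata"`. A minimal sketch against that implied interface (the class is hypothetical and not part of remdb; if `ContentProvider` is an ABC, you would subclass it instead):

```python
from typing import Any


class CsvProvider:
    # Hypothetical provider; matches the duck-typed interface this file uses:
    # .name for reporting, .extract() returning {"text": ..., "metadata": ...}.
    name = "csv"

    def extract(self, content_bytes: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        text = content_bytes.decode("utf-8", errors="replace")
        return {"text": text, "metadata": {"rows": text.count("\n") + 1}}
```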
    def _create_s3_client(self):
        """Create S3 client with IRSA or configured credentials."""
        s3_config: dict[str, Any] = {
            "region_name": settings.s3.region,
        }

        # Custom endpoint for MinIO/LocalStack
        if settings.s3.endpoint_url:
            s3_config["endpoint_url"] = settings.s3.endpoint_url

        # Access keys (not needed with IRSA in EKS)
        if settings.s3.access_key_id and settings.s3.secret_access_key:
            s3_config["aws_access_key_id"] = settings.s3.access_key_id
            s3_config["aws_secret_access_key"] = settings.s3.secret_access_key

        # SSL configuration
        s3_config["use_ssl"] = settings.s3.use_ssl

        return boto3.client("s3", **s3_config)
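For a MinIO or LocalStack setup, the branches above produce a client equivalent to the following (endpoint and credentials are illustrative; with IRSA in EKS the key arguments are omitted and boto3 resolves credentials from the pod's service account):

```python
import boto3

# Equivalent hand-built client for a local MinIO endpoint (sample values).
s3 = boto3.client(
    "s3",
    region_name="us-east-1",
    endpoint_url="http://localhost:9000",
    aws_access_key_id="minioadmin",
    aws_secret_access_key="minioadmin",
    use_ssl=False,
)
```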
    def process_uri(self, uri: str) -> dict[str, Any]:
        """
        Process a file URI and extract content.

        Args:
            uri: File URI (s3://bucket/key or local path)

        Returns:
            dict with:
            - uri: Original URI
            - content: Extracted text content
            - metadata: File metadata (size, type, etc.)
            - provider: Provider used for extraction

        Raises:
            ValueError: If URI format is invalid
            FileNotFoundError: If file doesn't exist
            RuntimeError: If no provider is available for the file type
        """
        logger.info(f"Processing URI: {uri}")

        # Determine if S3 or local file
        if uri.startswith("s3://"):
            return self._process_s3_uri(uri)
        else:
            return self._process_local_file(uri)
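A usage sketch of the dispatch above; the path is hypothetical, and the result keys come from the docstring:

```python
service = ContentService()
result = service.process_uri("/tmp/notes.txt")  # hypothetical local file

print(result["provider"])          # name of the provider that handled .txt
print(result["metadata"]["size"])  # size in bytes from stat()
print(result["content"][:80])      # first 80 chars of extracted text
```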
    def _process_s3_uri(self, uri: str) -> dict[str, Any]:
        """Process S3 URI."""
        parsed = urlparse(uri)
        bucket = parsed.netloc
        key = parsed.path.lstrip("/")

        if not bucket or not key:
            raise ValueError(f"Invalid S3 URI: {uri}")

        logger.debug(f"Downloading s3://{bucket}/{key}")

        try:
            # Download file from S3
            response = self.s3_client.get_object(Bucket=bucket, Key=key)
            content_bytes = response["Body"].read()

            # Get metadata
            metadata = {
                "size": response["ContentLength"],
                "content_type": response.get("ContentType", ""),
                "last_modified": response["LastModified"].isoformat(),
                "etag": response.get("ETag", "").strip('"'),
            }

            # Extract content using provider
            file_path = Path(key)
            provider = self._get_provider(file_path.suffix)

            extracted_content = provider.extract(content_bytes, metadata)

            return {
                "uri": uri,
                "content": extracted_content["text"],
                "metadata": {**metadata, **extracted_content.get("metadata", {})},
                "provider": provider.name,
            }

        except ClientError as e:
            error_code = e.response.get("Error", {}).get("Code", "")
            if error_code == "NoSuchKey":
                raise FileNotFoundError(f"S3 object not found: {uri}") from e
            elif error_code == "NoSuchBucket":
                raise FileNotFoundError(f"S3 bucket not found: {bucket}") from e
            else:
                raise RuntimeError(f"S3 error: {e}") from e
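The bucket/key split works because `urlparse()` puts the bucket in `netloc` and the object key in `path`; a quick sanity check:

```python
from urllib.parse import urlparse

parsed = urlparse("s3://my-bucket/reports/2024/q1.pdf")
assert parsed.netloc == "my-bucket"
assert parsed.path.lstrip("/") == "reports/2024/q1.pdf"
```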
    def _process_local_file(self, path: str) -> dict[str, Any]:
        """
        Process local file path.

        **PATH HANDLING FIX**: This method handles both file:// URIs and plain
        paths. Previously, file:// URIs from tools.py were NOT stripped, causing
        FileNotFoundError because Path() treated "file:///Users/..." as a
        literal filename instead of a URI.

        The fix ensures consistent path handling:
        - MCP tool creates: file:///Users/.../file.pdf
        - This method strips: file:// → /Users/.../file.pdf
        - Path() works correctly with the absolute path

        Related files:
        - tools.py line 636: Creates file:// URIs
        - FileSystemService line 58: Also strips file:// URIs
        """
        # Handle file:// URI scheme (strip only the leading scheme, not any
        # later occurrence of the substring)
        if path.startswith("file://"):
            path = path.removeprefix("file://")

        file_path = Path(path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {path}")

        if not file_path.is_file():
            raise ValueError(f"Not a file: {path}")

        logger.debug(f"Reading local file: {file_path}")

        # Read file content
        content_bytes = file_path.read_bytes()

        # Get metadata
        stat = file_path.stat()
        metadata = {
            "size": stat.st_size,
            "modified": stat.st_mtime,
        }

        # Extract content using provider
        provider = self._get_provider(file_path.suffix)
        extracted_content = provider.extract(content_bytes, metadata)

        return {
            "uri": str(file_path.absolute()),
            "content": extracted_content["text"],
            "metadata": {**metadata, **extracted_content.get("metadata", {})},
            "provider": provider.name,
        }
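The prefix strip is the whole fix: a file:// URI carries three slashes before an absolute path, so removing the scheme leaves a valid path (the example path is hypothetical):

```python
uri = "file:///Users/alice/docs/file.pdf"
assert uri.removeprefix("file://") == "/Users/alice/docs/file.pdf"
```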
    def _get_provider(self, suffix: str) -> ContentProvider:
        """Get content provider for file extension."""
        suffix_lower = suffix.lower()

        if suffix_lower not in self.providers:
            raise RuntimeError(
                f"No provider available for file type: {suffix}. "
                f"Supported: {', '.join(self.providers.keys())}"
            )

        return self.providers[suffix_lower]

    def register_provider(self, extensions: list[str], provider: ContentProvider):
        """
        Register a custom content provider.

        Args:
            extensions: List of file extensions (e.g., ['.pdf', '.docx'])
            provider: ContentProvider instance
        """
        for ext in extensions:
            ext_lower = ext.lower() if ext.startswith(".") else f".{ext.lower()}"
            self.providers[ext_lower] = provider
            logger.debug(f"Registered provider '{provider.name}' for {ext_lower}")
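Registering the hypothetical CsvProvider sketched earlier; note that bare extensions are normalized to dotted, lowercase keys:

```python
service = ContentService()
service.register_provider([".csv", "TSV"], CsvProvider())

assert ".csv" in service.providers
assert ".tsv" in service.providers  # "TSV" → ".tsv"
```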
    async def ingest_file(
        self,
        file_uri: str,
        user_id: str,
        category: str | None = None,
        tags: list[str] | None = None,
        is_local_server: bool = False,
    ) -> dict[str, Any]:
        """
        Complete file ingestion pipeline: read → store → parse → chunk → embed.

        **CENTRALIZED INGESTION**: This is the single entry point for all file
        ingestion in REM. It handles:

        1. **File Reading**: From local/S3/HTTP sources via FileSystemService
        2. **Storage**: Writes to user-scoped internal storage (~/.rem/fs/ or S3)
        3. **Parsing**: Extracts content, metadata, tables, images (parsing state)
        4. **Chunking**: Splits content into semantic chunks for embedding
        5. **Database**: Creates File entity + Resource chunks with embeddings

        **PARSING STATE**: Files (PDF, WAV, DOCX, etc.) are converted to rich
        parsing state:
        - **Content**: Markdown-formatted text (preserves structure)
        - **Metadata**: File info, extraction details, timestamps
        - **Tables**: Structured data extracted from documents (CSV format)
        - **Images**: Extracted images saved to storage (for multimodal RAG)
        - **Provider Info**: Which parser was used, version, settings

        This parsing state enables agents to deeply understand documents:
        - Query tables directly (structured data)
        - Reference images (multimodal context)
        - Understand document structure (markdown hierarchy)
        - Track provenance (metadata lineage)

        **CLIENT ABSTRACTION**: Clients (MCP tools, CLI, workers) don't worry about:
        - Where files are stored (S3 vs local) - selected automatically
        - How files are parsed (PDF vs DOCX) - provider auto-selected
        - How chunks are created - semantic chunking with tiktoken
        - How embeddings work - async worker with batching

        Clients just call `ingest_file()` and get searchable resources.

        **PERMISSION CHECK**: Remote MCP servers cannot read local files
        (security). Only local/stdio MCP servers can access local filesystem
        paths.

        Args:
            file_uri: Source file location (local path, s3://, or https://)
            user_id: User identifier for data isolation and ownership
            category: Optional category tag (document, code, audio, etc.)
            tags: Optional list of tags
            is_local_server: True if running as local/stdio MCP server

        Returns:
            dict with:
            - file_id: UUID of created File entity
            - file_name: Original filename
            - storage_uri: Internal storage location
            - internal_key: S3 key or local path
            - size_bytes: File size
            - content_type: MIME type
            - processing_status: "completed" or "failed"
            - resources_created: Number of Resource chunks created
            - parsing_metadata: Rich parsing state (content, tables, images)
            - content: Parsed file content (markdown) if status is "completed"

        Raises:
            PermissionError: If a remote server tries to read a local file
            FileNotFoundError: If the source file doesn't exist
            RuntimeError: If storage or processing fails

        Example:
            >>> service = ContentService()
            >>> result = await service.ingest_file(
            ...     file_uri="s3://bucket/contract.pdf",
            ...     user_id="user-123",
            ...     category="legal"
            ... )
            >>> print(f"Created {result['resources_created']} searchable chunks")
        """
        from uuid import uuid4

        from ...services.fs import FileSystemService

        # Step 1: Read file from source using FileSystemService
        fs_service = FileSystemService()
        file_content, file_name, source_type = await fs_service.read_uri(
            file_uri, is_local_server=is_local_server
        )
        file_size = len(file_content)
        logger.info(f"Read {file_size} bytes from {file_uri} (source: {source_type})")

        # Step 2: Write to internal storage (user-scoped)
        file_id = str(uuid4())
        storage_uri, internal_key, content_type, _ = await fs_service.write_to_internal_storage(
            content=file_content,
            tenant_id=user_id,  # using user_id for storage scoping
            file_name=file_name,
            file_id=file_id,
        )
        logger.info(f"Stored to internal storage: {storage_uri}")

        # Step 3: Create File entity
        file_entity = File(
            id=file_id,
            tenant_id=user_id,  # tenant_id == user_id (application scoped to user)
            user_id=user_id,
            name=file_name,
            uri=storage_uri,
            mime_type=content_type,
            size_bytes=file_size,
            metadata={
                "source_uri": file_uri,
                "source_type": source_type,
                "category": category,
                "storage_uri": storage_uri,
                "s3_key": internal_key,
                "s3_bucket": (
                    storage_uri.split("/")[2] if storage_uri.startswith("s3://") else "local"
                ),
            },
            tags=tags or [],
        )

        # Step 4: Save File entity to database
        from rem.services.postgres import get_postgres_service

        postgres_service = get_postgres_service()
        if not postgres_service:
            raise RuntimeError("PostgreSQL is disabled. Cannot save File entity to database.")

        await postgres_service.connect()
        try:
            repo = Repository(File, "files", db=postgres_service)
            await repo.upsert(file_entity)
        finally:
            await postgres_service.disconnect()

        # Step 5: Process file to create Resource chunks
        processing_result: dict[str, Any] | None = None
        try:
            processing_result = await self.process_and_save(
                uri=storage_uri,
                user_id=user_id,
            )
            processing_status = processing_result.get("status", "completed")
            resources_created = processing_result.get("chunk_count", 0)
            parsing_metadata = {
                "content_extracted": bool(processing_result.get("content")),
                "markdown_generated": bool(processing_result.get("markdown")),
                "chunks_created": resources_created,
            }
        except Exception as e:
            logger.error(f"File processing failed: {e}", exc_info=True)
            processing_status = "failed"
            resources_created = 0
            parsing_metadata = {"error": str(e)}

        logger.info(
            f"File ingestion complete: {file_name} "
            f"(user: {user_id}, status: {processing_status}, "
            f"resources: {resources_created})"
        )

        # Extract content if available
        content = None
        if processing_status == "completed" and processing_result:
            content = processing_result.get("content")

        return {
            "file_id": file_id,
            "file_name": file_name,
            "storage_uri": storage_uri,
            "internal_key": internal_key,
            "size_bytes": file_size,
            "content_type": content_type,
            "source_uri": file_uri,
            "source_type": source_type,
            "processing_status": processing_status,
            "resources_created": resources_created,
            "parsing_metadata": parsing_metadata,
            "content": content,  # parsed content when available
            "message": f"File ingested and {processing_status}. Created {resources_created} resources.",
        }
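A caller-side sketch consuming the dict documented above (the path and user id are illustrative):

```python
import asyncio

async def main() -> None:
    service = ContentService()
    # Hypothetical local file; is_local_server=True is required for local paths.
    result = await service.ingest_file(
        file_uri="/tmp/contract.pdf",
        user_id="user-123",
        category="legal",
        is_local_server=True,
    )
    if result["processing_status"] == "completed":
        print(f"{result['resources_created']} chunks from {result['file_name']}")
    else:
        print("Stored, but parsing failed:", result["parsing_metadata"].get("error"))

asyncio.run(main())
```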
    async def process_and_save(self, uri: str, user_id: str | None = None) -> dict[str, Any]:
        """
        Process file end-to-end: extract → markdown → chunk → save.

        **INTERNAL METHOD**: Called by ingest_file() after storage. Clients
        should use ingest_file() for the full pipeline.

        **KIND-BASED ROUTING**: For YAML/JSON files, checks the 'kind' field
        and routes to:
        - kind=agent or kind=evaluator → save to schemas table (not resources)
        - kind=engram → process via EngramProcessor (creates resources + moments)
        - no kind → standard resource processing (default)

        Args:
            uri: File URI (s3://bucket/key or local path)
            user_id: Optional user ID for multi-tenancy

        Returns:
            dict with file metadata and chunk count
        """
        logger.info(f"Processing and saving: {uri}")

        # Extract content
        result = self.process_uri(uri)
        filename = Path(uri).name

        # Check for custom kind-based processing (YAML/JSON only)
        file_suffix = Path(uri).suffix.lower()
        if file_suffix in [".yaml", ".yml", ".json"]:
            # Schema provider detected a valid agent/evaluator schema?
            if result.get("metadata", {}).get("is_schema"):
                logger.info(
                    f"🔧 Custom provider flow initiated: "
                    f"kind={result['metadata'].get('kind')} for {filename}"
                )
                return await self._process_schema(result, uri, user_id)

            # Check for engram kind in the raw data
            import yaml

            try:
                # Parse the content to check for kind
                content_text = result.get("content", "")
                if file_suffix == ".json":
                    data = json.loads(content_text)
                else:
                    data = yaml.safe_load(content_text)

                if isinstance(data, dict) and data.get("kind") == "engram":
                    logger.info(f"🔧 Custom provider flow initiated: kind=engram for {filename}")
                    return await self._process_engram(data, uri, user_id)
            except Exception as e:
                logger.debug(f"Could not parse {filename} for kind check: {e}")
                # Fall through to standard processing

        # Convert to markdown
        markdown = to_markdown(result["content"], filename)

        # Chunk markdown
        chunks = chunk_text(markdown)
        logger.info(f"Created {len(chunks)} chunks from {filename}")

        # Save File entity
        file = File(
            name=filename,
            uri=uri,
            content=result["content"],
            size_bytes=result["metadata"].get("size"),
            mime_type=result["metadata"].get("content_type"),
            processing_status="completed",
            tenant_id=user_id or "default",  # required field
            user_id=user_id,
        )

        if self.file_repo:
            await self.file_repo.upsert(file)
            logger.info(f"Saved File: {filename}")

        # Create Resource entities for each chunk
        resources = [
            Resource(
                name=f"{filename}#chunk-{i}",
                uri=f"{uri}#chunk-{i}",
                ordinal=i,
                content=chunk,
                category="document",
                tenant_id=user_id or "default",  # required field
                user_id=user_id,
            )
            for i, chunk in enumerate(chunks)
        ]

        if self.resource_repo:
            await self.resource_repo.upsert(
                resources,
                embeddable_fields=["content"],
                generate_embeddings=True,
            )
            logger.info(f"Saved {len(resources)} Resource chunks")
            logger.info(f"Queued {len(resources)} embedding generation tasks for content field")

        return {
            "file": file.model_dump(),
            "chunk_count": len(chunks),
            "content": result["content"],
            "markdown": markdown,
            "status": "completed",
        }
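To make the routing concrete: a parsed YAML/JSON document only needs a top-level `kind: engram` to be diverted to `_process_engram()`; everything else in the payload is whatever `EngramProcessor.process_engram()` expects (the extra fields below are illustrative):

```python
# Hypothetical engram payload after yaml.safe_load(); only "kind" is
# inspected here, and "name" is logged by _process_engram().
engram_doc = {
    "kind": "engram",
    "name": "kickoff-meeting-notes",
    "content": "Notes from the 2024 kickoff meeting...",
}
assert engram_doc.get("kind") == "engram"  # triggers the engram branch
```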
    async def _process_schema(
        self, result: dict[str, Any], uri: str, user_id: str | None = None
    ) -> dict[str, Any]:
        """
        Process agent/evaluator schema and save to the schemas table.

        Args:
            result: Extraction result from SchemaProvider with schema_data
            uri: File URI
            user_id: Optional user ID for multi-tenancy

        Returns:
            dict with schema save result
        """
        from rem.models.entities import Schema
        from rem.services.postgres import get_postgres_service

        metadata = result.get("metadata", {})
        schema_data = result.get("schema_data", {})

        kind = metadata.get("kind")
        name = metadata.get("name")
        version = metadata.get("version", "1.0.0")

        logger.info(f"Saving schema to schemas table: kind={kind}, name={name}, version={version}")

        # Create Schema entity.
        # IMPORTANT: the category field distinguishes agents from evaluators:
        # - kind=agent → category="agent" (AI agents with tools/resources)
        # - kind=evaluator → category="evaluator" (LLM-as-a-Judge evaluators)
        schema_entity = Schema(
            tenant_id=user_id or "default",
            user_id=user_id,
            name=name,
            spec=schema_data,
            category=kind,  # maps kind → category for database filtering
            provider_configs=metadata.get("provider_configs", []),
            embedding_fields=metadata.get("embedding_fields", []),
            metadata={
                "uri": uri,
                "version": version,
                "tags": metadata.get("tags", []),
            },
        )

        # Save to schemas table
        postgres = get_postgres_service()
        if not postgres:
            raise RuntimeError("PostgreSQL is disabled. Cannot save Schema entity to database.")

        await postgres.connect()
        try:
            await postgres.batch_upsert(
                records=[schema_entity],
                model=Schema,
                table_name="schemas",
                entity_key_field="name",
                generate_embeddings=False,
            )
            logger.info(f"✅ Schema saved: {name} (kind={kind})")
        finally:
            await postgres.disconnect()

        return {
            "schema_name": name,
            "kind": kind,
            "version": version,
            "status": "completed",
            "message": f"Schema '{name}' saved to schemas table",
        }
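For reference, the shape of the extraction result this method consumes, reconstructed from the fields read above (values are illustrative; `is_schema` is set by SchemaProvider):

```python
# Illustrative SchemaProvider output consumed by _process_schema().
result = {
    "metadata": {
        "is_schema": True,           # flag checked in process_and_save()
        "kind": "agent",             # becomes the category column
        "name": "contract-analyzer",
        "version": "1.0.0",
        "tags": ["legal"],
        "provider_configs": [],
        "embedding_fields": [],
    },
    "schema_data": {},  # full parsed YAML/JSON document, stored in Schema.spec
}
```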
    async def _process_engram(
        self, data: dict[str, Any], uri: str, user_id: str | None = None
    ) -> dict[str, Any]:
        """
        Process engram and save to resources + moments tables.

        Args:
            data: Parsed engram data with kind=engram
            uri: File URI
            user_id: Optional user ID for multi-tenancy

        Returns:
            dict with engram processing result
        """
        from rem.services.postgres import get_postgres_service
        from rem.workers.engram_processor import EngramProcessor

        logger.info(f"Processing engram: {data.get('name')}")

        postgres = get_postgres_service()
        if not postgres:
            raise RuntimeError("PostgreSQL is disabled. Cannot process engram.")

        await postgres.connect()
        try:
            processor = EngramProcessor(postgres)
            result = await processor.process_engram(
                data=data,
                tenant_id=user_id or "default",
                user_id=user_id,
            )
            logger.info(
                f"✅ Engram processed: {result.get('resource_id')} "
                f"with {len(result.get('moment_ids', []))} moments"
            )
            return result
        finally:
            await postgres.disconnect()
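Finally, a sketch of wiring ContentService with repositories so that `process_and_save()` persists its results. Construction mirrors the Repository usage inside `ingest_file()`; the `"resources"` table name is an assumption by analogy with `"files"`:

```python
from rem.models.entities import File, Resource
from rem.services.postgres import get_postgres_service
from rem.services.postgres.repository import Repository

async def build_content_service() -> ContentService:
    db = get_postgres_service()
    if not db:
        raise RuntimeError("PostgreSQL is disabled")
    await db.connect()
    return ContentService(
        file_repo=Repository(File, "files", db=db),
        resource_repo=Repository(Resource, "resources", db=db),  # table name assumed
    )
```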