remdb-0.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +566 -0
  44. rem/cli/commands/configure.py +497 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1302 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +96 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +801 -0
  104. rem/services/content/service.py +676 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +336 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.3.7.dist-info/METADATA +1473 -0
  185. remdb-0.3.7.dist-info/RECORD +187 -0
  186. remdb-0.3.7.dist-info/WHEEL +4 -0
  187. remdb-0.3.7.dist-info/entry_points.txt +2 -0
rem/services/content/service.py
@@ -0,0 +1,676 @@
+"""
+ContentService for file processing.
+
+Pipeline:
+1. Extract content via provider plugins
+2. Convert to markdown
+3. Chunk markdown
+4. Save File + Resources to database via repositories
+"""
+
+import json
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+from urllib.parse import urlparse
+
+import boto3
+from botocore.exceptions import ClientError
+from loguru import logger
+
+from rem.models.entities import File, Resource
+from rem.services.postgres import Repository
+from rem.settings import settings
+from rem.utils.chunking import chunk_text
+from rem.utils.markdown import to_markdown
+
+from .providers import AudioProvider, ContentProvider, DocProvider, SchemaProvider, TextProvider
+
+
+class ContentService:
+    """
+    Service for processing files: extract → markdown → chunk → save.
+
+    Supports:
+    - S3 URIs (s3://bucket/key)
+    - Local file paths
+    - Pluggable content providers
+    """
+
+    def __init__(
+        self, file_repo: Repository | None = None, resource_repo: Repository | None = None
+    ):
+        self.s3_client = self._create_s3_client()
+        self.providers: dict[str, ContentProvider] = {}
+        self.file_repo = file_repo
+        self.resource_repo = resource_repo
+
+        # Register default providers from settings
+        self._register_default_providers()
+
+    def _register_default_providers(self):
+        """Register default content providers from settings."""
+        # Schema provider for agent/evaluator schemas (YAML/JSON)
+        # Register first so it takes priority for .yaml/.json files
+        schema_provider = SchemaProvider()
+        self.providers[".yaml"] = schema_provider
+        self.providers[".yml"] = schema_provider
+        self.providers[".json"] = schema_provider
+
+        # Text provider for plain text, code, data files
+        text_provider = TextProvider()
+        for ext in settings.content.supported_text_types:
+            # Don't override schema provider for yaml/json
+            if ext.lower() not in [".yaml", ".yml", ".json"]:
+                self.providers[ext.lower()] = text_provider
+
+        # Doc provider for PDFs, Office docs, images (via Kreuzberg)
+        doc_provider = DocProvider()
+        for ext in settings.content.supported_doc_types:
+            self.providers[ext.lower()] = doc_provider
+
+        # Audio provider for audio files (via Whisper API)
+        audio_provider = AudioProvider()
+        for ext in settings.content.supported_audio_types:
+            self.providers[ext.lower()] = audio_provider
+
+        logger.debug(
+            f"Registered {len(self.providers)} file extensions across "
+            f"schema (yaml/json), "
+            f"{len(settings.content.supported_text_types)} text, "
+            f"{len(settings.content.supported_doc_types)} doc, "
+            f"{len(settings.content.supported_audio_types)} audio types"
+        )
+
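A minimal usage sketch of the registration above (not part of the packaged file): because SchemaProvider is registered before TextProvider, .yaml/.yml/.json resolve to it even when those extensions also appear in settings.content.supported_text_types.

    from rem.services.content.service import ContentService

    svc = ContentService()  # no repositories wired: extraction only, nothing is persisted
    print(svc._get_provider(".yaml").name)  # schema provider wins over the text provider
    print(svc._get_provider(".pdf").name)   # doc provider, assuming ".pdf" is in supported_doc_types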
+    def _create_s3_client(self):
+        """Create S3 client with IRSA or configured credentials."""
+        s3_config: dict[str, Any] = {
+            "region_name": settings.s3.region,
+        }
+
+        # Custom endpoint for MinIO/LocalStack
+        if settings.s3.endpoint_url:
+            s3_config["endpoint_url"] = settings.s3.endpoint_url
+
+        # Access keys (not needed with IRSA in EKS)
+        if settings.s3.access_key_id and settings.s3.secret_access_key:
+            s3_config["aws_access_key_id"] = settings.s3.access_key_id
+            s3_config["aws_secret_access_key"] = settings.s3.secret_access_key
+
+        # SSL configuration
+        s3_config["use_ssl"] = settings.s3.use_ssl
+
+        return boto3.client("s3", **s3_config)
+
+    def process_uri(self, uri: str) -> dict[str, Any]:
+        """
+        Process a file URI and extract content.
+
+        Args:
+            uri: File URI (s3://bucket/key or local path)
+
+        Returns:
+            dict with:
+            - uri: Original URI
+            - content: Extracted text content
+            - metadata: File metadata (size, type, etc.)
+            - provider: Provider used for extraction
+
+        Raises:
+            ValueError: If URI format is invalid
+            FileNotFoundError: If file doesn't exist
+            RuntimeError: If no provider available for file type
+        """
+        logger.info(f"Processing URI: {uri}")
+
+        # Determine if S3 or local file
+        if uri.startswith("s3://"):
+            return self._process_s3_uri(uri)
+        else:
+            return self._process_local_file(uri)
+
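A hedged caller sketch for process_uri; the path and bucket names below are hypothetical.

    svc = ContentService()
    try:
        result = svc.process_uri("/tmp/report.pdf")  # or "s3://bucket/report.pdf"
        print(result["provider"], result["metadata"]["size"])
        print(result["content"][:200])
    except FileNotFoundError as e:
        print(f"missing file: {e}")
    except RuntimeError as e:
        print(f"no provider for this extension: {e}")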
+    def _process_s3_uri(self, uri: str) -> dict[str, Any]:
+        """Process S3 URI."""
+        parsed = urlparse(uri)
+        bucket = parsed.netloc
+        key = parsed.path.lstrip("/")
+
+        if not bucket or not key:
+            raise ValueError(f"Invalid S3 URI: {uri}")
+
+        logger.debug(f"Downloading s3://{bucket}/{key}")
+
+        try:
+            # Download file from S3
+            response = self.s3_client.get_object(Bucket=bucket, Key=key)
+            content_bytes = response["Body"].read()
+
+            # Get metadata
+            metadata = {
+                "size": response["ContentLength"],
+                "content_type": response.get("ContentType", ""),
+                "last_modified": response["LastModified"].isoformat(),
+                "etag": response.get("ETag", "").strip('"'),
+            }
+
+            # Extract content using provider
+            file_path = Path(key)
+            provider = self._get_provider(file_path.suffix)
+
+            extracted_content = provider.extract(content_bytes, metadata)
+
+            # Build result with standard fields
+            result = {
+                "uri": uri,
+                "content": extracted_content["text"],
+                "metadata": {**metadata, **extracted_content.get("metadata", {})},
+                "provider": provider.name,
+            }
+
+            # Preserve schema-specific fields if present (from SchemaProvider)
+            if "is_schema" in extracted_content:
+                result["is_schema"] = extracted_content["is_schema"]
+            if "schema_data" in extracted_content:
+                result["schema_data"] = extracted_content["schema_data"]
+
+            return result
+
+        except ClientError as e:
+            error_code = e.response.get("Error", {}).get("Code", "")
+            if error_code == "NoSuchKey":
+                raise FileNotFoundError(f"S3 object not found: {uri}") from e
+            elif error_code == "NoSuchBucket":
+                raise FileNotFoundError(f"S3 bucket not found: {bucket}") from e
+            else:
+                raise RuntimeError(f"S3 error: {e}") from e
+
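Because of the ClientError mapping above, a missing S3 object or bucket surfaces as FileNotFoundError, so callers can handle S3 and local misses uniformly (bucket and key below are hypothetical):

    try:
        ContentService().process_uri("s3://example-bucket/missing.pdf")
    except FileNotFoundError:
        pass  # NoSuchKey and NoSuchBucket both surface here
    except RuntimeError:
        pass  # any other S3 failure (e.g. AccessDenied) is wrapped as RuntimeError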
+    def _process_local_file(self, path: str) -> dict[str, Any]:
+        """
+        Process local file path.
+
+        **PATH HANDLING FIX**: This method correctly handles both file:// URIs
+        and plain paths. Previously, file:// URIs from tools.py were NOT stripped,
+        causing FileNotFoundError because Path() treated "file:///Users/..." as a
+        literal filename instead of a URI.
+
+        The fix ensures consistent path handling:
+        - MCP tool creates: file:///Users/.../file.pdf
+        - This method strips: file:// → /Users/.../file.pdf
+        - Path() works correctly with absolute path
+
+        Related files:
+        - tools.py line 636: Creates file:// URIs
+        - FileSystemService line 58: Also strips file:// URIs
+        """
+        # Handle file:// URI scheme
+        if path.startswith("file://"):
+            path = path.replace("file://", "")
+
+        file_path = Path(path)
+
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {path}")
+
+        if not file_path.is_file():
+            raise ValueError(f"Not a file: {path}")
+
+        logger.debug(f"Reading local file: {file_path}")
+
+        # Read file content
+        content_bytes = file_path.read_bytes()
+
+        # Get metadata
+        stat = file_path.stat()
+        metadata = {
+            "size": stat.st_size,
+            "modified": stat.st_mtime,
+        }
+
+        # Extract content using provider
+        provider = self._get_provider(file_path.suffix)
+        extracted_content = provider.extract(content_bytes, metadata)
+
+        # Build result with standard fields
+        result = {
+            "uri": str(file_path.absolute()),
+            "content": extracted_content["text"],
+            "metadata": {**metadata, **extracted_content.get("metadata", {})},
+            "provider": provider.name,
+        }
+
+        # Preserve schema-specific fields if present (from SchemaProvider)
+        if "is_schema" in extracted_content:
+            result["is_schema"] = extracted_content["is_schema"]
+        if "schema_data" in extracted_content:
+            result["schema_data"] = extracted_content["schema_data"]
+
+        return result
+
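One caveat in the scheme stripping above: str.replace removes every occurrence of "file://", not only the leading scheme. A stricter sketch using str.removeprefix (Python 3.9+); the helper name is ours, not part of the package:

    def strip_file_scheme(path: str) -> str:
        # Only drop the scheme when it is actually a prefix.
        return path.removeprefix("file://")

    assert strip_file_scheme("file:///Users/alice/doc.pdf") == "/Users/alice/doc.pdf"
    assert strip_file_scheme("/already/plain.pdf") == "/already/plain.pdf"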
+    def _get_provider(self, suffix: str) -> ContentProvider:
+        """Get content provider for file extension."""
+        suffix_lower = suffix.lower()
+
+        if suffix_lower not in self.providers:
+            raise RuntimeError(
+                f"No provider available for file type: {suffix}. "
+                f"Supported: {', '.join(self.providers.keys())}"
+            )
+
+        return self.providers[suffix_lower]
+
+    def register_provider(self, extensions: list[str], provider: ContentProvider):
+        """
+        Register a custom content provider.
+
+        Args:
+            extensions: List of file extensions (e.g., ['.pdf', '.docx'])
+            provider: ContentProvider instance
+        """
+        for ext in extensions:
+            ext_lower = ext.lower() if ext.startswith(".") else f".{ext.lower()}"
+            self.providers[ext_lower] = provider
+            logger.debug(f"Registered provider '{provider.name}' for {ext_lower}")
+
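A sketch of a custom provider, assuming the interface implied by the calls above: a name attribute and extract(content_bytes, metadata) returning a dict with at least a "text" key. The CsvProvider class is hypothetical:

    from rem.services.content.providers import ContentProvider

    class CsvProvider(ContentProvider):
        name = "csv"

        def extract(self, content_bytes: bytes, metadata: dict) -> dict:
            text = content_bytes.decode("utf-8", errors="replace")
            return {"text": text, "metadata": {"rows": text.count("\n") + 1}}

    svc = ContentService()
    svc.register_provider([".csv", "tsv"], CsvProvider())  # "tsv" is normalized to ".tsv"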
+    async def ingest_file(
+        self,
+        file_uri: str,
+        user_id: str,
+        category: str | None = None,
+        tags: list[str] | None = None,
+        is_local_server: bool = False,
+    ) -> dict[str, Any]:
+        """
+        Complete file ingestion pipeline: read → store → parse → chunk → embed.
+
+        **CENTRALIZED INGESTION**: This is the single entry point for all file ingestion
+        in REM. It handles:
+
+        1. **File Reading**: From local/S3/HTTP sources via FileSystemService
+        2. **Storage**: Writes to user-scoped internal storage (~/.rem/fs/ or S3)
+        3. **Parsing**: Extracts content, metadata, tables, images (parsing state)
+        4. **Chunking**: Splits content into semantic chunks for embedding
+        5. **Database**: Creates File entity + Resource chunks with embeddings
+
+        **PARSING STATE - The Innovation**:
+        Files (PDF, WAV, DOCX, etc.) are converted to rich parsing state:
+        - **Content**: Markdown-formatted text (preserves structure)
+        - **Metadata**: File info, extraction details, timestamps
+        - **Tables**: Structured data extracted from documents (CSV format)
+        - **Images**: Extracted images saved to storage (for multimodal RAG)
+        - **Provider Info**: Which parser was used, version, settings
+
+        This parsing state enables agents to deeply understand documents:
+        - Query tables directly (structured data)
+        - Reference images (multimodal context)
+        - Understand document structure (markdown hierarchy)
+        - Track provenance (metadata lineage)
+
+        **CLIENT ABSTRACTION**: Clients (MCP tools, CLI, workers) don't worry about:
+        - Where files are stored (S3 vs local) - automatically selected
+        - How files are parsed (PDF vs DOCX) - provider auto-selected
+        - How chunks are created - semantic chunking with tiktoken
+        - How embeddings work - async worker with batching
+
+        Clients just call `ingest_file()` and get searchable resources.
+
+        **PERMISSION CHECK**: Remote MCP servers cannot read local files (security).
+        Only local/stdio MCP servers can access local filesystem paths.
+
+        Args:
+            file_uri: Source file location (local path, s3://, or https://)
+            user_id: User identifier for data isolation and ownership
+            category: Optional category tag (document, code, audio, etc.)
+            tags: Optional list of tags
+            is_local_server: True if running as local/stdio MCP server
+
+        Returns:
+            dict with:
+            - file_id: UUID of created File entity
+            - file_name: Original filename
+            - storage_uri: Internal storage location
+            - internal_key: S3 key or local path
+            - size_bytes: File size
+            - content_type: MIME type
+            - processing_status: "completed" or "failed"
+            - resources_created: Number of Resource chunks created
+            - parsing_metadata: Rich parsing state (content, tables, images)
+            - content: Parsed file content (markdown format) if status is "completed"
+
+        Raises:
+            PermissionError: If remote server tries to read local file
+            FileNotFoundError: If source file doesn't exist
+            RuntimeError: If storage or processing fails
+
+        Example:
+            >>> service = ContentService()
+            >>> result = await service.ingest_file(
+            ...     file_uri="s3://bucket/contract.pdf",
+            ...     user_id="user-123",
+            ...     category="legal"
+            ... )
+            >>> print(f"Created {result['resources_created']} searchable chunks")
+        """
+        from pathlib import Path
+        from uuid import uuid4
+        import mimetypes
+
+        from ...models.entities import File
+        from ...services.fs import FileSystemService
+        from ...services.postgres import PostgresService
+
+        # Step 1: Read file from source using FileSystemService
+        fs_service = FileSystemService()
+        file_content, file_name, source_type = await fs_service.read_uri(
+            file_uri, is_local_server=is_local_server
+        )
+        file_size = len(file_content)
+        logger.info(f"Read {file_size} bytes from {file_uri} (source: {source_type})")
+
+        # Step 2: Write to internal storage (user-scoped)
+        file_id = str(uuid4())
+        storage_uri, internal_key, content_type, _ = await fs_service.write_to_internal_storage(
+            content=file_content,
+            tenant_id=user_id,  # Using user_id for storage scoping
+            file_name=file_name,
+            file_id=file_id,
+        )
+        logger.info(f"Stored to internal storage: {storage_uri}")
+
+        # Step 3: Create File entity
+        file_entity = File(
+            id=file_id,
+            tenant_id=user_id,  # Set tenant_id to user_id (application scoped to user)
+            user_id=user_id,
+            name=file_name,
+            uri=storage_uri,
+            mime_type=content_type,
+            size_bytes=file_size,
+            metadata={
+                "source_uri": file_uri,
+                "source_type": source_type,
+                "category": category,
+                "storage_uri": storage_uri,
+                "s3_key": internal_key,
+                "s3_bucket": (
+                    storage_uri.split("/")[2] if storage_uri.startswith("s3://") else "local"
+                ),
+            },
+            tags=tags or [],
+        )
+
+        # Step 4: Save File entity to database
+        from rem.services.postgres import get_postgres_service
+        from rem.services.postgres.repository import Repository
+
+        postgres_service = get_postgres_service()
+        if not postgres_service:
+            raise RuntimeError("PostgreSQL is disabled. Cannot save File entity to database.")
+
+        await postgres_service.connect()
+        try:
+            repo = Repository(File, "files", db=postgres_service)
+            await repo.upsert(file_entity)
+        finally:
+            await postgres_service.disconnect()
+
+        # Step 5: Process file to create Resource chunks
+        try:
+            processing_result = await self.process_and_save(
+                uri=storage_uri,
+                user_id=user_id,
+            )
+            processing_status = processing_result.get("status", "completed")
+            resources_created = processing_result.get("chunk_count", 0)
+            parsing_metadata = {
+                "content_extracted": bool(processing_result.get("content")),
+                "markdown_generated": bool(processing_result.get("markdown")),
+                "chunks_created": resources_created,
+            }
+        except Exception as e:
+            logger.error(f"File processing failed: {e}", exc_info=True)
+            processing_status = "failed"
+            resources_created = 0
+            parsing_metadata = {"error": str(e)}
+
+        logger.info(
+            f"File ingestion complete: {file_name} "
+            f"(user: {user_id}, status: {processing_status}, "
+            f"resources: {resources_created})"
+        )
+
+        # Extract content if available
+        content = None
+        if processing_status == "completed" and processing_result:
+            content = processing_result.get("content")
+
+        return {
+            "file_id": file_id,
+            "file_name": file_name,
+            "storage_uri": storage_uri,
+            "internal_key": internal_key,
+            "size_bytes": file_size,
+            "content_type": content_type,
+            "source_uri": file_uri,
+            "source_type": source_type,
+            "processing_status": processing_status,
+            "resources_created": resources_created,
+            "parsing_metadata": parsing_metadata,
+            "content": content,  # Include parsed content when available
+            "message": f"File ingested and {processing_status}. Created {resources_created} resources.",
+        }
+
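A caller-side sketch of the pipeline above (async context; the local path is hypothetical, and is_local_server=True is required for local filesystem reads):

    import asyncio

    async def main() -> None:
        svc = ContentService()
        result = await svc.ingest_file(
            file_uri="/tmp/notes.md",
            user_id="user-123",
            category="document",
            is_local_server=True,  # remote MCP servers may not read local paths
        )
        print(result["processing_status"], result["resources_created"])

    asyncio.run(main())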
+    async def process_and_save(self, uri: str, user_id: str | None = None) -> dict[str, Any]:
+        """
+        Process file end-to-end: extract → markdown → chunk → save.
+
+        **INTERNAL METHOD**: This is called by ingest_file() after storage.
+        Clients should use ingest_file() instead for the full pipeline.
+
+        **KIND-BASED ROUTING**: For YAML/JSON files, checks for 'kind' field and routes to:
+        - kind=agent or kind=evaluator → Save to schemas table (not resources)
+        - kind=engram → Process via EngramProcessor (creates resources + moments)
+        - No kind → Standard resource processing (default)
+
+        Args:
+            uri: File URI (s3://bucket/key or local path)
+            user_id: Optional user ID for multi-tenancy
+
+        Returns:
+            dict with file metadata and chunk count
+        """
+        logger.info(f"Processing and saving: {uri}")
+
+        # Extract content
+        result = self.process_uri(uri)
+        filename = Path(uri).name
+
+        # Check for custom kind-based processing (YAML/JSON only)
+        file_suffix = Path(uri).suffix.lower()
+        if file_suffix in ['.yaml', '.yml', '.json']:
+            # Check if schema provider detected a valid schema
+            # is_schema flag is at top level of result (preserved from SchemaProvider)
+            if result.get('is_schema'):
+                logger.info(f"🔧 Custom provider flow initiated: kind={result.get('metadata', {}).get('kind')} for {filename}")
+                return await self._process_schema(result, uri, user_id)
+
+            # Check for engram kind in raw data
+            import yaml
+            import json
+            try:
+                # Parse the content to check for kind
+                content_text = result.get('content', '')
+                if file_suffix == '.json':
+                    data = json.loads(content_text)
+                else:
+                    data = yaml.safe_load(content_text)
+
+                if isinstance(data, dict) and data.get('kind') == 'engram':
+                    logger.info(f"🔧 Custom provider flow initiated: kind=engram for {filename}")
+                    return await self._process_engram(data, uri, user_id)
+            except Exception as e:
+                logger.debug(f"Could not parse {filename} for kind check: {e}")
+                # Fall through to standard processing
+
+        # Convert to markdown
+        markdown = to_markdown(result["content"], filename)
+
+        # Chunk markdown
+        chunks = chunk_text(markdown)
+        logger.info(f"Created {len(chunks)} chunks from {filename}")
+
+        # Save File entity
+        file = File(
+            name=filename,
+            uri=uri,
+            content=result["content"],
+            size_bytes=result["metadata"].get("size"),
+            mime_type=result["metadata"].get("content_type"),
+            processing_status="completed",
+            tenant_id=user_id or "default",  # Required field
+            user_id=user_id,
+        )
+
+        if self.file_repo:
+            await self.file_repo.upsert(file)
+            logger.info(f"Saved File: {filename}")
+
+        # Create Resource entities for each chunk
+        resources = [
+            Resource(
+                name=f"{filename}#chunk-{i}",
+                uri=f"{uri}#chunk-{i}",
+                ordinal=i,
+                content=chunk,
+                category="document",
+                tenant_id=user_id or "default",  # Required field
+                user_id=user_id,
+            )
+            for i, chunk in enumerate(chunks)
+        ]
+
+        if self.resource_repo:
+            await self.resource_repo.upsert(
+                resources,
+                embeddable_fields=["content"],
+                generate_embeddings=True,
+            )
+            logger.info(f"Saved {len(resources)} Resource chunks")
+            logger.info(f"Queued {len(resources)} embedding generation tasks for content field")
+
+        return {
+            "file": file.model_dump(),
+            "chunk_count": len(chunks),
+            "content": result["content"],
+            "markdown": markdown,
+            "status": "completed",
+        }
+
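The kind-based routing above, condensed into a standalone sketch (route_kind is our name, not part of the package):

    import json
    from pathlib import Path

    import yaml

    def route_kind(uri: str, content_text: str, is_schema: bool) -> str:
        if Path(uri).suffix.lower() not in {".yaml", ".yml", ".json"}:
            return "resource"
        if is_schema:  # SchemaProvider already detected an agent/evaluator schema
            return "schema"
        try:
            data = (
                json.loads(content_text)
                if uri.endswith(".json")
                else yaml.safe_load(content_text)
            )
        except Exception:
            return "resource"  # unparseable → fall through to standard processing
        if isinstance(data, dict) and data.get("kind") == "engram":
            return "engram"
        return "resource"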
+    async def _process_schema(
+        self, result: dict[str, Any], uri: str, user_id: str | None = None
+    ) -> dict[str, Any]:
+        """
+        Process agent/evaluator schema and save to schemas table.
+
+        Args:
+            result: Extraction result from SchemaProvider with schema_data
+            uri: File URI
+            user_id: Optional user ID for multi-tenancy
+
+        Returns:
+            dict with schema save result
+        """
+        from rem.models.entities import Schema
+        from rem.services.postgres import PostgresService
+
+        metadata = result.get("metadata", {})
+        schema_data = result.get("schema_data", {})
+
+        kind = metadata.get("kind")
+        name = metadata.get("name")
+        version = metadata.get("version", "1.0.0")
+
+        logger.info(f"Saving schema to schemas table: kind={kind}, name={name}, version={version}")
+
+        # Create Schema entity
+        # IMPORTANT: category field distinguishes agents from evaluators
+        # - kind=agent → category="agent" (AI agents with tools/resources)
+        # - kind=evaluator → category="evaluator" (LLM-as-a-Judge evaluators)
+        schema_entity = Schema(
+            tenant_id=user_id or "default",
+            user_id=user_id,
+            name=name,
+            spec=schema_data,
+            category=kind,  # Maps kind → category for database filtering
+            provider_configs=metadata.get("provider_configs", []),
+            embedding_fields=metadata.get("embedding_fields", []),
+            metadata={
+                "uri": uri,
+                "version": version,
+                "tags": metadata.get("tags", []),
+            },
+        )
+
+        # Save to schemas table
+        from rem.services.postgres import get_postgres_service
+        postgres = get_postgres_service()
+        if not postgres:
+            raise RuntimeError("PostgreSQL is disabled. Cannot save Schema entity to database.")
+
+        await postgres.connect()
+        try:
+            from rem.models.entities import Schema as SchemaModel
+            await postgres.batch_upsert(
+                records=[schema_entity],
+                model=SchemaModel,
+                table_name="schemas",
+                entity_key_field="name",
+                generate_embeddings=False,
+            )
+            logger.info(f"✅ Schema saved: {name} (kind={kind})")
+        finally:
+            await postgres.disconnect()
+
+        return {
+            "schema_name": name,
+            "kind": kind,
+            "version": version,
+            "status": "completed",
+            "message": f"Schema '{name}' saved to schemas table",
+        }
+
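For reference, a hypothetical extraction result shaped the way _process_schema reads it (field names come from the code above; the values are invented):

    example_result = {
        "is_schema": True,
        "schema_data": {},  # full parsed YAML/JSON document → stored as Schema.spec
        "metadata": {
            "kind": "agent",            # mapped to Schema.category
            "name": "hello-world",
            "version": "1.0.0",
            "tags": ["example"],
            "provider_configs": [],
            "embedding_fields": [],
        },
    }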
+    async def _process_engram(
+        self, data: dict[str, Any], uri: str, user_id: str | None = None
+    ) -> dict[str, Any]:
+        """
+        Process engram and save to resources + moments tables.
+
+        Args:
+            data: Parsed engram data with kind=engram
+            uri: File URI
+            user_id: Optional user ID for multi-tenancy
+
+        Returns:
+            dict with engram processing result
+        """
+        from rem.workers.engram_processor import EngramProcessor
+        from rem.services.postgres import PostgresService
+
+        logger.info(f"Processing engram: {data.get('name')}")
+
+        from rem.services.postgres import get_postgres_service
+        postgres = get_postgres_service()
+        if not postgres:
+            raise RuntimeError("PostgreSQL is disabled. Cannot process engram.")
+
+        await postgres.connect()
+        try:
+            processor = EngramProcessor(postgres)
+            result = await processor.process_engram(
+                data=data,
+                tenant_id=user_id or "default",
+                user_id=user_id,
+            )
+            logger.info(f"✅ Engram processed: {result.get('resource_id')} with {len(result.get('moment_ids', []))} moments")
+            return result
+        finally:
+            await postgres.disconnect()