remdb-0.3.242-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of remdb might be problematic.

Files changed (235)
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
rem/services/content/service.py
@@ -0,0 +1,762 @@
+ """
+ ContentService for file processing.
+
+ Pipeline:
+ 1. Extract content via provider plugins
+ 2. Convert to markdown
+ 3. Chunk markdown
+ 4. Save File + Resources to database via repositories
+ """
+
+ import json
+ from datetime import UTC, datetime
+ from pathlib import Path
+ from typing import Any
+ from urllib.parse import urlparse
+
+ import boto3
+ from botocore.exceptions import ClientError
+ from loguru import logger
+
+ from rem.models.entities import File, Resource
+ from rem.services.postgres import Repository
+ from rem.settings import settings
+ from rem.utils.chunking import chunk_text
+ from rem.utils.markdown import to_markdown
+
+ from .providers import AudioProvider, ContentProvider, DocProvider, SchemaProvider, TextProvider
+
+
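For orientation before the class body: a minimal usage sketch of the pipeline the module docstring describes. The import path and the sample file path are assumptions; the returned keys (uri, content, metadata, provider) follow the process_uri docstring below.

from rem.services.content.service import ContentService  # assumed import path

service = ContentService()  # repositories are optional and omitted here

result = service.process_uri("/tmp/report.pdf")  # hypothetical local file
print(result["provider"])          # name of the provider that handled .pdf
print(result["metadata"]["size"])  # size from stat() or S3 metadata
print(result["content"][:200])     # extracted text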
+ class ContentService:
+     """
+     Service for processing files: extract → markdown → chunk → save.
+
+     Supports:
+     - S3 URIs (s3://bucket/key)
+     - Local file paths
+     - Pluggable content providers
+     """
+
+     def __init__(
+         self, file_repo: Repository | None = None, resource_repo: Repository | None = None
+     ):
+         self.s3_client = self._create_s3_client()
+         self.providers: dict[str, ContentProvider] = {}
+         self.file_repo = file_repo
+         self.resource_repo = resource_repo
+
+         # Register default providers from settings
+         self._register_default_providers()
+
+     def _register_default_providers(self):
+         """Register default content providers from settings."""
+         # Schema provider for agent/evaluator schemas (YAML/JSON)
+         # Register first so it takes priority for .yaml/.json files
+         schema_provider = SchemaProvider()
+         self.providers[".yaml"] = schema_provider
+         self.providers[".yml"] = schema_provider
+         self.providers[".json"] = schema_provider
+
+         # Text provider for plain text, code, data files
+         text_provider = TextProvider()
+         for ext in settings.content.supported_text_types:
+             # Don't override schema provider for yaml/json
+             if ext.lower() not in [".yaml", ".yml", ".json"]:
+                 self.providers[ext.lower()] = text_provider
+
+         # Doc provider for PDFs, Office docs, images (via Kreuzberg)
+         doc_provider = DocProvider()
+         for ext in settings.content.supported_doc_types:
+             self.providers[ext.lower()] = doc_provider
+
+         # Audio provider for audio files (via Whisper API)
+         audio_provider = AudioProvider()
+         for ext in settings.content.supported_audio_types:
+             self.providers[ext.lower()] = audio_provider
+
+         logger.debug(
+             f"Registered {len(self.providers)} file extensions across "
+             f"schema (yaml/json), "
+             f"{len(settings.content.supported_text_types)} text, "
+             f"{len(settings.content.supported_doc_types)} doc, "
+             f"{len(settings.content.supported_audio_types)} audio types"
+         )
+
+     def _create_s3_client(self):
+         """Create S3 client with IRSA or configured credentials."""
+         s3_config: dict[str, Any] = {
+             "region_name": settings.s3.region,
+         }
+
+         # Custom endpoint for MinIO/LocalStack
+         if settings.s3.endpoint_url:
+             s3_config["endpoint_url"] = settings.s3.endpoint_url
+
+         # Access keys (not needed with IRSA in EKS)
+         if settings.s3.access_key_id and settings.s3.secret_access_key:
+             s3_config["aws_access_key_id"] = settings.s3.access_key_id
+             s3_config["aws_secret_access_key"] = settings.s3.secret_access_key
+
+         # SSL configuration
+         s3_config["use_ssl"] = settings.s3.use_ssl
+
+         return boto3.client("s3", **s3_config)
+
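As a hedged aside on the MinIO/LocalStack branch: with an endpoint and static keys configured, the construction above reduces to a plain boto3 call. The endpoint and credentials below are placeholders, not remdb defaults; the real values come from settings.s3.*.

import boto3

s3 = boto3.client(
    "s3",
    region_name="us-east-1",
    endpoint_url="http://localhost:9000",  # MinIO/LocalStack endpoint (placeholder)
    aws_access_key_id="minioadmin",        # placeholder credentials
    aws_secret_access_key="minioadmin",
    use_ssl=False,
)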
+     def process_uri(self, uri: str) -> dict[str, Any]:
+         """
+         Process a file URI and extract content.
+
+         Args:
+             uri: File URI (s3://bucket/key or local path)
+
+         Returns:
+             dict with:
+             - uri: Original URI
+             - content: Extracted text content
+             - metadata: File metadata (size, type, etc.)
+             - provider: Provider used for extraction
+
+         Raises:
+             ValueError: If URI format is invalid
+             FileNotFoundError: If file doesn't exist
+             RuntimeError: If no provider available for file type
+         """
+         logger.info(f"Processing URI: {uri}")
+
+         # Determine if S3 or local file
+         if uri.startswith("s3://"):
+             return self._process_s3_uri(uri)
+         else:
+             return self._process_local_file(uri)
+
+     def _process_s3_uri(self, uri: str) -> dict[str, Any]:
+         """Process S3 URI."""
+         parsed = urlparse(uri)
+         bucket = parsed.netloc
+         key = parsed.path.lstrip("/")
+
+         if not bucket or not key:
+             raise ValueError(f"Invalid S3 URI: {uri}")
+
+         logger.debug(f"Downloading s3://{bucket}/{key}")
+
+         try:
+             # Download file from S3
+             response = self.s3_client.get_object(Bucket=bucket, Key=key)
+             content_bytes = response["Body"].read()
+
+             # Get metadata
+             metadata = {
+                 "size": response["ContentLength"],
+                 "content_type": response.get("ContentType", ""),
+                 "last_modified": response["LastModified"].isoformat(),
+                 "etag": response.get("ETag", "").strip('"'),
+             }
+
+             # Extract content using provider
+             file_path = Path(key)
+             provider = self._get_provider(file_path.suffix)
+
+             extracted_content = provider.extract(content_bytes, metadata)
+
+             # Build result with standard fields
+             result = {
+                 "uri": uri,
+                 "content": extracted_content["text"],
+                 "metadata": {**metadata, **extracted_content.get("metadata", {})},
+                 "provider": provider.name,
+             }
+
+             # Preserve schema-specific fields if present (from SchemaProvider)
+             if "is_schema" in extracted_content:
+                 result["is_schema"] = extracted_content["is_schema"]
+             if "schema_data" in extracted_content:
+                 result["schema_data"] = extracted_content["schema_data"]
+
+             return result
+
+         except ClientError as e:
+             error_code = e.response.get("Error", {}).get("Code", "")
+             if error_code == "NoSuchKey":
+                 raise FileNotFoundError(f"S3 object not found: {uri}") from e
+             elif error_code == "NoSuchBucket":
+                 raise FileNotFoundError(f"S3 bucket not found: {bucket}") from e
+             else:
+                 raise RuntimeError(f"S3 error: {e}") from e
+
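The bucket/key split above is plain urllib.parse behavior; a quick sketch with a hypothetical URI:

from urllib.parse import urlparse

parsed = urlparse("s3://my-bucket/reports/2024/q1.pdf")  # hypothetical URI
bucket = parsed.netloc           # "my-bucket"
key = parsed.path.lstrip("/")    # "reports/2024/q1.pdf"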
+     def _process_local_file(self, path: str) -> dict[str, Any]:
+         """
+         Process local file path.
+
+         **PATH HANDLING FIX**: This method correctly handles both file:// URIs
+         and plain paths. Previously, file:// URIs from tools.py were NOT stripped,
+         causing FileNotFoundError because Path() treated "file:///Users/..." as a
+         literal filename instead of a URI.
+
+         The fix ensures consistent path handling:
+         - MCP tool creates: file:///Users/.../file.pdf
+         - This method strips: file:// → /Users/.../file.pdf
+         - Path() works correctly with absolute path
+
+         Related files:
+         - tools.py line 636: Creates file:// URIs
+         - FileSystemService line 58: Also strips file:// URIs
+         """
+         # Handle file:// URI scheme
+         if path.startswith("file://"):
+             path = path.replace("file://", "")
+
+         file_path = Path(path)
+
+         if not file_path.exists():
+             raise FileNotFoundError(f"File not found: {path}")
+
+         if not file_path.is_file():
+             raise ValueError(f"Not a file: {path}")
+
+         logger.debug(f"Reading local file: {file_path}")
+
+         # Read file content
+         content_bytes = file_path.read_bytes()
+
+         # Get metadata
+         stat = file_path.stat()
+         metadata = {
+             "size": stat.st_size,
+             "modified": stat.st_mtime,
+         }
+
+         # Extract content using provider
+         provider = self._get_provider(file_path.suffix)
+         extracted_content = provider.extract(content_bytes, metadata)
+
+         # Build result with standard fields
+         result = {
+             "uri": str(file_path.absolute()),
+             "content": extracted_content["text"],
+             "metadata": {**metadata, **extracted_content.get("metadata", {})},
+             "provider": provider.name,
+         }
+
+         # Preserve schema-specific fields if present (from SchemaProvider)
+         if "is_schema" in extracted_content:
+             result["is_schema"] = extracted_content["is_schema"]
+         if "schema_data" in extracted_content:
+             result["schema_data"] = extracted_content["schema_data"]
+
+         return result
+
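One nuance in the scheme stripping above: str.replace() removes every occurrence of "file://", not just the leading one. Because the code guards with startswith(), well-formed URIs behave identically, but Python's removeprefix() states the intent exactly; a one-line sketch:

path = "file:///Users/alice/file.pdf"  # hypothetical URI from an MCP tool
path = path.removeprefix("file://")    # "/Users/alice/file.pdf" (Python 3.9+)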
+     def _get_provider(self, suffix: str) -> ContentProvider:
+         """Get content provider for file extension."""
+         suffix_lower = suffix.lower()
+
+         if suffix_lower not in self.providers:
+             raise RuntimeError(
+                 f"No provider available for file type: {suffix}. "
+                 f"Supported: {', '.join(self.providers.keys())}"
+             )
+
+         return self.providers[suffix_lower]
+
+     def register_provider(self, extensions: list[str], provider: ContentProvider):
+         """
+         Register a custom content provider.
+
+         Args:
+             extensions: List of file extensions (e.g., ['.pdf', '.docx'])
+             provider: ContentProvider instance
+         """
+         for ext in extensions:
+             ext_lower = ext.lower() if ext.startswith(".") else f".{ext.lower()}"
+             self.providers[ext_lower] = provider
+             logger.debug(f"Registered provider '{provider.name}' for {ext_lower}")
+
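A hedged sketch of plugging in a custom provider. The real ContentProvider base class lives in providers.py and is not part of this diff; judging from the call sites here, a provider needs a name attribute and an extract(content_bytes, metadata) method returning at least a "text" key. CsvProvider and its fields are illustrative only.

from typing import Any

from rem.services.content.service import ContentService  # assumed import path

class CsvProvider:
    """Illustrative provider; the real base class is defined in providers.py."""

    name = "csv"

    def extract(self, content_bytes: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        text = content_bytes.decode("utf-8", errors="replace")
        return {"text": text, "metadata": {"line_count": text.count("\n") + 1}}

service = ContentService()
service.register_provider([".csv", "tsv"], CsvProvider())  # "tsv" is normalized to ".tsv"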
+     async def ingest_file(
+         self,
+         file_uri: str,
+         user_id: str | None = None,
+         category: str | None = None,
+         tags: list[str] | None = None,
+         is_local_server: bool = False,
+         resource_type: str | None = None,
+     ) -> dict[str, Any]:
+         """
+         Complete file ingestion pipeline: read → store → parse → chunk → embed.
+
+         **IMPORTANT: Data is PUBLIC by default (user_id=None).**
+         This is correct for shared knowledge bases (ontologies, procedures, reference data).
+         Private user-scoped data is rarely needed - only set user_id for truly personal content.
+
+         **CENTRALIZED INGESTION**: This is the single entry point for all file ingestion
+         in REM. It handles:
+
+         1. **File Reading**: From local/S3/HTTP sources via FileSystemService
+         2. **Storage**: Writes to user-scoped internal storage (~/.rem/fs/ or S3)
+         3. **Parsing**: Extracts content, metadata, tables, images (parsing state)
+         4. **Chunking**: Splits content into semantic chunks for embedding
+         5. **Database**: Creates File entity + Resource chunks with embeddings
+
+         **PARSING STATE - The Innovation**:
+         Files (PDF, WAV, DOCX, etc.) are converted to rich parsing state:
+         - **Content**: Markdown-formatted text (preserves structure)
+         - **Metadata**: File info, extraction details, timestamps
+         - **Tables**: Structured data extracted from documents (CSV format)
+         - **Images**: Extracted images saved to storage (for multimodal RAG)
+         - **Provider Info**: Which parser was used, version, settings
+
+         This parsing state enables agents to deeply understand documents:
+         - Query tables directly (structured data)
+         - Reference images (multimodal context)
+         - Understand document structure (markdown hierarchy)
+         - Track provenance (metadata lineage)
+
+         **CLIENT ABSTRACTION**: Clients (MCP tools, CLI, workers) don't worry about:
+         - Where files are stored (S3 vs local) - automatically selected
+         - How files are parsed (PDF vs DOCX) - provider auto-selected
+         - How chunks are created - semantic chunking with tiktoken
+         - How embeddings work - async worker with batching
+
+         Clients just call `ingest_file()` and get searchable resources.
+
+         **PERMISSION CHECK**: Remote MCP servers cannot read local files (security).
+         Only local/stdio MCP servers can access local filesystem paths.
+
+         Args:
+             file_uri: Source file location (local path, s3://, or https://)
+             user_id: User identifier for PRIVATE data only. Default None = PUBLIC/shared.
+                 Leave as None for shared knowledge bases, ontologies, reference data.
+                 Only set for truly private user-specific content.
+             category: Optional category tag (document, code, audio, etc.)
+             tags: Optional list of tags
+             is_local_server: True if running as local/stdio MCP server
+             resource_type: Optional resource type (case-insensitive). Supports:
+                 - "resource", "resources", "Resource" → Resource (default)
+                 - "domain-resource", "domain_resource", "DomainResource" → DomainResource
+
+         Returns:
+             dict with:
+             - file_id: UUID of created File entity
+             - file_name: Original filename
+             - storage_uri: Internal storage location
+             - internal_key: S3 key or local path
+             - size_bytes: File size
+             - content_type: MIME type
+             - processing_status: "completed" or "failed"
+             - resources_created: Number of Resource chunks created
+             - parsing_metadata: Rich parsing state (content, tables, images)
+             - content: Parsed file content (markdown format) if status is "completed"
+
+         Raises:
+             PermissionError: If remote server tries to read local file
+             FileNotFoundError: If source file doesn't exist
+             RuntimeError: If storage or processing fails
+
+         Example:
+             >>> service = ContentService()
+             >>> # PUBLIC data (default) - visible to all users
+             >>> result = await service.ingest_file(
+             ...     file_uri="s3://bucket/procedure.pdf",
+             ...     category="medical"
+             ... )
+             >>> print(f"Created {result['resources_created']} searchable chunks")
+             >>>
+             >>> # PRIVATE data (rare) - only for user-specific content
+             >>> result = await service.ingest_file(
+             ...     file_uri="s3://bucket/personal-notes.pdf",
+             ...     user_id="user-123",  # Only this user can access
+             ...     category="personal"
+             ... )
+         """
+         from pathlib import Path
+         from uuid import uuid4
+         import mimetypes
+
+         from ...models.entities import File
+         from ...services.fs import FileSystemService
+         from ...services.postgres import PostgresService
+
+         # Step 1: Read file from source using FileSystemService
+         fs_service = FileSystemService()
+         file_content, file_name, source_type = await fs_service.read_uri(
+             file_uri, is_local_server=is_local_server
+         )
+         file_size = len(file_content)
+         logger.info(f"Read {file_size} bytes from {file_uri} (source: {source_type})")
+
+         # Step 1.5: Early schema detection for YAML/JSON files
+         # Skip File entity creation for schemas (agents/evaluators)
+         file_suffix = Path(file_name).suffix.lower()
+         if file_suffix in ['.yaml', '.yml', '.json']:
+             import yaml
+             import json
+             try:
+                 content_text = file_content.decode('utf-8') if isinstance(file_content, bytes) else file_content
+                 data = yaml.safe_load(content_text) if file_suffix in ['.yaml', '.yml'] else json.loads(content_text)
+                 if isinstance(data, dict):
+                     json_schema_extra = data.get('json_schema_extra', {})
+                     kind = json_schema_extra.get('kind', '')
+                     if kind in ['agent', 'evaluator']:
+                         # Route directly to schema processing, skip File entity
+                         logger.info(f"Detected {kind} schema: {file_name}, routing to _process_schema")
+                         result = self.process_uri(file_uri)
+                         return await self._process_schema(result, file_uri, user_id)
+             except Exception as e:
+                 logger.debug(f"Early schema detection failed for {file_name}: {e}")
+                 # Fall through to standard file processing
+
+         # Step 2: Write to internal storage (public or user-scoped)
+         file_id = str(uuid4())
+         storage_uri, internal_key, content_type, _ = await fs_service.write_to_internal_storage(
+             content=file_content,
+             tenant_id=user_id or "public",  # Storage path: public/ or user_id/
+             file_name=file_name,
+             file_id=file_id,
+         )
+         logger.info(f"Stored to internal storage: {storage_uri}")
+
+         # Step 3: Create File entity
+         file_entity = File(
+             id=file_id,
+             tenant_id=user_id,  # None = public/shared
+             user_id=user_id,
+             name=file_name,
+             uri=storage_uri,
+             mime_type=content_type,
+             size_bytes=file_size,
+             metadata={
+                 "source_uri": file_uri,
+                 "source_type": source_type,
+                 "category": category,
+                 "storage_uri": storage_uri,
+                 "s3_key": internal_key,
+                 "s3_bucket": (
+                     storage_uri.split("/")[2] if storage_uri.startswith("s3://") else "local"
+                 ),
+             },
+             tags=tags or [],
+         )
+
+         # Step 4: Save File entity to database
+         from rem.services.postgres import get_postgres_service
+         from rem.services.postgres.repository import Repository
+
+         postgres_service = get_postgres_service()
+         if not postgres_service:
+             raise RuntimeError("PostgreSQL is disabled. Cannot save File entity to database.")
+
+         await postgres_service.connect()
+         try:
+             repo = Repository(File, "files", db=postgres_service)
+             await repo.upsert(file_entity)
+         finally:
+             await postgres_service.disconnect()
+
+         # Step 5: Process file to create Resource chunks
+         try:
+             processing_result = await self.process_and_save(
+                 uri=storage_uri,
+                 user_id=user_id,
+                 resource_type=resource_type,
+             )
+             processing_status = processing_result.get("status", "completed")
+             resources_created = processing_result.get("chunk_count", 0)
+             parsing_metadata = {
+                 "content_extracted": bool(processing_result.get("content")),
+                 "markdown_generated": bool(processing_result.get("markdown")),
+                 "chunks_created": resources_created,
+             }
+         except Exception as e:
+             logger.error(f"File processing failed: {e}", exc_info=True)
+             processing_status = "failed"
+             resources_created = 0
+             parsing_metadata = {"error": str(e)}
+
+         logger.info(
+             f"File ingestion complete: {file_name} "
+             f"(user: {user_id}, status: {processing_status}, "
+             f"resources: {resources_created})"
+         )
+
+         # Extract content if available
+         content = None
+         if processing_status == "completed" and processing_result:
+             content = processing_result.get("content")
+
+         return {
+             "file_id": file_id,
+             "file_name": file_name,
+             "storage_uri": storage_uri,
+             "internal_key": internal_key,
+             "size_bytes": file_size,
+             "content_type": content_type,
+             "source_uri": file_uri,
+             "source_type": source_type,
+             "processing_status": processing_status,
+             "resources_created": resources_created,
+             "parsing_metadata": parsing_metadata,
+             "content": content,  # Include parsed content when available
+             "message": f"File ingested and {processing_status}. Created {resources_created} resources.",
+         }
+
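A hedged end-to-end sketch of this public entry point, based on the docstring above; the file path, tags, and category are placeholders, and "domain-resource" exercises the resource_type routing described in the Args section.

import asyncio

from rem.services.content.service import ContentService  # assumed import path

async def main() -> None:
    service = ContentService()
    result = await service.ingest_file(
        file_uri="/tmp/contract.pdf",     # placeholder path
        category="legal",
        tags=["contracts"],
        is_local_server=True,             # local paths require a local/stdio server
        resource_type="domain-resource",  # routed to DomainResource per the docstring
    )
    print(result["processing_status"], result["resources_created"])

asyncio.run(main())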
+     async def process_and_save(
+         self,
+         uri: str,
+         user_id: str | None = None,
+         resource_type: str | None = None,
+     ) -> dict[str, Any]:
+         """
+         Process file end-to-end: extract → markdown → chunk → save.
+
+         **INTERNAL METHOD**: This is called by ingest_file() after storage.
+         Clients should use ingest_file() instead for the full pipeline.
+
+         **KIND-BASED ROUTING**: For YAML/JSON files, checks for 'kind' field and routes to:
+         - kind=agent or kind=evaluator → Save to schemas table (not resources)
+         - kind=engram → Process via EngramProcessor (creates resources + moments)
+         - No kind → Standard resource processing (default)
+
+         Args:
+             uri: File URI (s3://bucket/key or local path)
+             user_id: Optional user ID for multi-tenancy
+             resource_type: Optional resource type (case-insensitive). Defaults to "Resource".
+                 Supports: resource, domain-resource, domain_resource, DomainResource, etc.
+
+         Returns:
+             dict with file metadata and chunk count
+         """
+         logger.info(f"Processing and saving: {uri}")
+
+         # Extract content
+         result = self.process_uri(uri)
+         filename = Path(uri).name
+
+         # Check for custom kind-based processing (YAML/JSON only)
+         file_suffix = Path(uri).suffix.lower()
+         if file_suffix in ['.yaml', '.yml', '.json']:
+             # Check if schema provider detected a valid schema
+             # is_schema flag is at top level of result (preserved from SchemaProvider)
+             if result.get('is_schema'):
+                 logger.info(f"🔧 Custom provider flow initiated: kind={result.get('metadata', {}).get('kind')} for {filename}")
+                 return await self._process_schema(result, uri, user_id)
+
+             # Check for engram kind in raw data
+             import yaml
+             import json
+             try:
+                 # Parse the content to check for kind
+                 content_text = result.get('content', '')
+                 if file_suffix == '.json':
+                     data = json.loads(content_text)
+                 else:
+                     data = yaml.safe_load(content_text)
+
+                 if isinstance(data, dict) and data.get('kind') == 'engram':
+                     logger.info(f"🔧 Custom provider flow initiated: kind=engram for {filename}")
+                     return await self._process_engram(data, uri, user_id)
+             except Exception as e:
+                 logger.debug(f"Could not parse {filename} for kind check: {e}")
+                 # Fall through to standard processing
+
+         # Convert to markdown
+         markdown = to_markdown(result["content"], filename)
+
+         # Chunk markdown
+         chunks = chunk_text(markdown)
+         logger.info(f"Created {len(chunks)} chunks from {filename}")
+
+         # Save File entity
+         file = File(
+             name=filename,
+             uri=uri,
+             content=result["content"],
+             size_bytes=result["metadata"].get("size"),
+             mime_type=result["metadata"].get("content_type"),
+             processing_status="completed",
+             tenant_id=user_id,  # None = public/shared
+             user_id=user_id,
+         )
+
+         if self.file_repo:
+             await self.file_repo.upsert(file)
+             logger.info(f"Saved File: {filename}")
+
+         # Resolve resource model class from type parameter (case-insensitive)
+         from typing import cast, Type
+         from pydantic import BaseModel
+         from rem.utils.model_helpers import model_from_arbitrary_casing, get_table_name
+
+         resource_model: Type[BaseModel] = Resource  # Default
+         if resource_type:
+             try:
+                 resource_model = model_from_arbitrary_casing(resource_type)
+                 logger.info(f"Using resource model: {resource_model.__name__}")
+             except ValueError as e:
+                 logger.warning(f"Invalid resource_type '{resource_type}', using default Resource: {e}")
+                 resource_model = Resource
+
+         # Get table name for the resolved model
+         table_name = get_table_name(resource_model)
+
+         # Create resource entities for each chunk
+         resources: list[BaseModel] = [
+             resource_model(
+                 name=f"{filename}#chunk-{i}",
+                 uri=f"{uri}#chunk-{i}",
+                 ordinal=i,
+                 content=chunk,
+                 category="document",
+                 tenant_id=user_id,  # None = public/shared
+                 user_id=user_id,
+             )
+             for i, chunk in enumerate(chunks)
+         ]
+
+         # Save resources to the appropriate table
+         if resources:
+             from rem.services.postgres import get_postgres_service
+
+             postgres = get_postgres_service()
+             if postgres:
+                 await postgres.connect()
+                 try:
+                     await postgres.batch_upsert(
+                         records=cast(list[BaseModel | dict], resources),
+                         model=resource_model,
+                         table_name=table_name,
+                         entity_key_field="name",
+                         embeddable_fields=["content"],
+                         generate_embeddings=True,
+                     )
+                     logger.info(f"Saved {len(resources)} {resource_model.__name__} chunks to {table_name}")
+                     logger.info(f"Queued {len(resources)} embedding generation tasks for content field")
+                 finally:
+                     await postgres.disconnect()
+             elif self.resource_repo:
+                 # Fallback to injected repo (only works for default Resource)
+                 await self.resource_repo.upsert(
+                     resources,
+                     embeddable_fields=["content"],
+                     generate_embeddings=True,
+                 )
+                 logger.info(f"Saved {len(resources)} Resource chunks")
+                 logger.info(f"Queued {len(resources)} embedding generation tasks for content field")
+
+         return {
+             "file": file.model_dump(),
+             "chunk_count": len(chunks),
+             "content": result["content"],
+             "markdown": markdown,
+             "status": "completed",
+         }
+
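To make the kind-based routing concrete, a sketch of the two markers this method actually checks: json_schema_extra.kind for agent/evaluator schemas and a top-level kind for engrams. Fields in the sample documents beyond the kind markers are hypothetical.

import yaml

agent_yaml = """
name: hello-world          # hypothetical; only `kind` matters for routing
json_schema_extra:
  kind: agent
"""

engram_yaml = """
kind: engram
name: example-engram
"""

for doc in (agent_yaml, engram_yaml):
    data = yaml.safe_load(doc)
    kind = data.get("json_schema_extra", {}).get("kind") or data.get("kind")
    print(kind)  # "agent", then "engram"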
+     async def _process_schema(
+         self, result: dict[str, Any], uri: str, user_id: str | None = None
+     ) -> dict[str, Any]:
+         """
+         Process agent/evaluator schema and save to schemas table.
+
+         Args:
+             result: Extraction result from SchemaProvider with schema_data
+             uri: File URI
+             user_id: Optional user ID for multi-tenancy
+
+         Returns:
+             dict with schema save result
+         """
+         from rem.models.entities import Schema
+         from rem.services.postgres import PostgresService
+
+         metadata = result.get("metadata", {})
+         schema_data = result.get("schema_data", {})
+
+         kind = metadata.get("kind")
+         name = metadata.get("name")
+         version = metadata.get("version", "1.0.0")
+
+         logger.info(f"Saving schema to schemas table: kind={kind}, name={name}, version={version}")
+
+         # Create Schema entity
+         # IMPORTANT: category field distinguishes agents from evaluators
+         # - kind=agent → category="agent" (AI agents with tools/resources)
+         # - kind=evaluator → category="evaluator" (LLM-as-a-Judge evaluators)
+         # User-scoped schemas: if user_id provided, scope to user's tenant
+         # System schemas: if no user_id, use "system" tenant for shared access
+         schema_entity = Schema(
+             tenant_id=user_id or "system",
+             user_id=user_id,
+             name=name,
+             spec=schema_data,
+             category=kind,  # Maps kind → category for database filtering
+             provider_configs=metadata.get("provider_configs", []),
+             embedding_fields=metadata.get("embedding_fields", []),
+             metadata={
+                 "uri": uri,
+                 "version": version,
+                 "tags": metadata.get("tags", []),
+             },
+         )
+
+         # Save to schemas table
+         from rem.services.postgres import get_postgres_service
+         postgres = get_postgres_service()
+         if not postgres:
+             raise RuntimeError("PostgreSQL is disabled. Cannot save Schema entity to database.")
+
+         await postgres.connect()
+         try:
+             from rem.models.entities import Schema as SchemaModel
+             await postgres.batch_upsert(
+                 records=[schema_entity],
+                 model=SchemaModel,
+                 table_name="schemas",
+                 entity_key_field="name",
+                 generate_embeddings=False,
+             )
+             logger.info(f"✅ Schema saved: {name} (kind={kind})")
+         finally:
+             await postgres.disconnect()
+
+         return {
+             "schema_name": name,
+             "kind": kind,
+             "version": version,
+             "status": "completed",
+             "message": f"Schema '{name}' saved to schemas table",
+         }
+
+     async def _process_engram(
+         self, data: dict[str, Any], uri: str, user_id: str | None = None
+     ) -> dict[str, Any]:
+         """
+         Process engram and save to resources + moments tables.
+
+         Args:
+             data: Parsed engram data with kind=engram
+             uri: File URI
+             user_id: Optional user ID for multi-tenancy
+
+         Returns:
+             dict with engram processing result
+         """
+         from rem.workers.engram_processor import EngramProcessor
+         from rem.services.postgres import PostgresService
+
+         logger.info(f"Processing engram: {data.get('name')}")
+
+         from rem.services.postgres import get_postgres_service
+         postgres = get_postgres_service()
+         if not postgres:
+             raise RuntimeError("PostgreSQL is disabled. Cannot process engram.")
+
+         await postgres.connect()
+         try:
+             processor = EngramProcessor(postgres)
+             result = await processor.process_engram(
+                 data=data,
+                 tenant_id=user_id,  # None = public/shared
+                 user_id=user_id,
+             )
+             logger.info(f"✅ Engram processed: {result.get('resource_id')} with {len(result.get('moment_ids', []))} moments")
+             return result
+         finally:
+             await postgres.disconnect()
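Finally, a hedged sketch of the engram path. process_and_save() is documented above as internal, so this is only to illustrate the routing; the result fields echoed here (resource_id, moment_ids) are inferred from the log line above, and the file path is a placeholder.

import asyncio

from rem.services.content.service import ContentService  # assumed import path

async def main() -> None:
    service = ContentService()
    # Placeholder file whose top-level `kind: engram` triggers the engram branch.
    result = await service.process_and_save("/tmp/example-engram.yaml")
    print(result.get("resource_id"), result.get("moment_ids"))

asyncio.run(main())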