remdb-0.2.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of remdb might be problematic.

Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +565 -0
  44. rem/cli/commands/configure.py +423 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1124 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +88 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +657 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +229 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.2.6.dist-info/METADATA +1191 -0
  185. remdb-0.2.6.dist-info/RECORD +187 -0
  186. remdb-0.2.6.dist-info/WHEEL +4 -0
  187. remdb-0.2.6.dist-info/entry_points.txt +2 -0
rem/workers/engram_processor.py
@@ -0,0 +1,312 @@
+ """
+ Engram Processor for REM.
+
+ Processes engram YAML/JSON files into Resources and Moments following
+ the p8fs-modules engram specification.
+
+ Key Design Principles:
+ - Engrams ARE Resources (category="engram")
+ - Human-friendly labels in graph edges (not UUIDs)
+ - Upsert with JSON merge behavior (never overwrite)
+ - Dual indexing handled by repository (SQL + embeddings + KV)
+ - Moment attachment via part_of relationship
+
+ Processing Flow:
+ 1. Parse YAML/JSON engram
+ 2. Create Resource from top-level engram fields
+ 3. Upsert Resource (triggers embeddings + KV store population)
+ 4. Create Moments from moments array
+ 5. Link moments to parent engram via graph edges
+
+ See: /Users/sirsh/code/p8fs-modules/p8fs/docs/04 engram-specification.md
+ """
+
+ import json
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Optional
+
+ import yaml
+
+ from rem.models.core import InlineEdge
+ from rem.models.entities import Moment, Resource
+ from rem.services.postgres import PostgresService
+
+ logger = logging.getLogger(__name__)
+
+
+ class EngramProcessor:
+     """
+     Process engram files into REM Resources and Moments.
+
+     Example usage:
+         processor = EngramProcessor(postgres_service)
+         result = await processor.process_file(
+             file_path="/path/to/engram.yaml",
+             tenant_id="acme-corp",
+             user_id="sarah-chen",
+         )
+     """
+
+     def __init__(self, postgres_service: PostgresService):
+         """
+         Initialize engram processor.
+
+         Args:
+             postgres_service: PostgreSQL service for upsert operations
+         """
+         self.postgres = postgres_service
+
+     async def process_file(
+         self,
+         file_path: Path | str,
+         tenant_id: str,
+         user_id: Optional[str] = None,
+     ) -> dict[str, Any]:
+         """
+         Process an engram file (YAML or JSON).
+
+         Args:
+             file_path: Path to engram file
+             tenant_id: Tenant ID for multi-tenancy
+             user_id: Optional user ID (owner)
+
+         Returns:
+             Result dict with resource_id, moment_ids, chunks_created, etc.
+         """
+         file_path = Path(file_path)
+
+         # Parse file
+         with open(file_path) as f:
+             if file_path.suffix in (".yaml", ".yml"):
+                 data = yaml.safe_load(f)
+             elif file_path.suffix == ".json":
+                 data = json.load(f)
+             else:
+                 raise ValueError(f"Unsupported file format: {file_path.suffix}")
+
+         return await self.process_engram(data, tenant_id, user_id)
+
+     async def process_engram(
+         self,
+         data: dict[str, Any],
+         tenant_id: str,
+         user_id: Optional[str] = None,
+     ) -> dict[str, Any]:
+         """
+         Process engram data into REM entities.
+
+         Args:
+             data: Parsed engram data (dict from YAML/JSON)
+             tenant_id: Tenant ID
+             user_id: Optional user ID
+
+         Returns:
+             Result dict with resource_id, moment_ids, chunks_created, etc.
+         """
+         # Validate kind
+         if data.get("kind") != "engram":
+             raise ValueError(f"Expected kind='engram', got: {data.get('kind')}")
+
+         # Extract top-level engram fields
+         name = data["name"]
+         category = data.get("category", "engram")
+         summary = data.get("summary")
+         content = data.get("content", "")
+         uri = data.get("uri")
+         resource_timestamp = data.get("resource_timestamp")
+         if resource_timestamp:
+             # YAML may already parse timestamps into datetime objects
+             if isinstance(resource_timestamp, str):
+                 resource_timestamp = datetime.fromisoformat(resource_timestamp)
+         else:
+             resource_timestamp = datetime.utcnow()
+
+         metadata = data.get("metadata", {})
+         graph_edges_data = data.get("graph_edges", [])
+         moments_data = data.get("moments", [])
+
+         # Convert graph edges to InlineEdge objects
+         graph_edges = []
+         for edge_data in graph_edges_data:
+             edge = InlineEdge(
+                 dst=edge_data["dst"],
+                 rel_type=edge_data["rel_type"],
+                 weight=edge_data.get("weight", 0.5),
+                 properties=edge_data.get("properties", {}),
+             )
+             graph_edges.append(edge)
+
+         # Create Resource (engram)
+         resource = Resource(
+             tenant_id=tenant_id,
+             user_id=user_id,
+             name=name,
+             category=category,
+             uri=uri,
+             content=content,
+             timestamp=resource_timestamp,
+             metadata=metadata,
+             graph_edges=[edge.model_dump() for edge in graph_edges],
+         )
+
+         # Upsert resource (triggers embeddings + KV store population)
+         logger.info(f"Upserting engram resource: {name}")
+         upsert_result = await self.postgres.batch_upsert(
+             records=[resource.model_dump(mode="json")],
+             model=Resource,
+             table_name="resources",
+             entity_key_field="name",  # Explicit entity_key for KV store
+         )
+         resource_id = upsert_result["ids"][0]
+         logger.info(f"Upserted resource: {resource_id}")
+
+         # Process attached moments
+         moment_ids = []
+         if moments_data:
+             logger.info(f"Processing {len(moments_data)} moments for engram: {name}")
+             moment_ids = await self._process_moments(
+                 moments_data=moments_data,
+                 parent_resource_name=name,
+                 parent_resource_id=resource_id,
+                 tenant_id=tenant_id,
+                 user_id=user_id,
+             )
+
+         return {
+             "resource_id": resource_id,
+             "moment_ids": moment_ids,
+             "chunks_created": 1 + len(moment_ids),  # Resource + moments
+             "embeddings_generated": 0,  # Handled by embedding worker
+         }
+
+     async def _process_moments(
+         self,
+         moments_data: list[dict],
+         parent_resource_name: str,
+         parent_resource_id: str,
+         tenant_id: str,
+         user_id: Optional[str] = None,
+     ) -> list[str]:
+         """
+         Create Moments from moments array in engram.
+
+         Args:
+             moments_data: List of moment dicts from engram
+             parent_resource_name: Parent engram name
+             parent_resource_id: Parent engram resource ID
+             tenant_id: Tenant ID
+             user_id: Optional user ID
+
+         Returns:
+             List of moment IDs
+         """
+         moments = []
+         for moment_data in moments_data:
+             # Extract moment fields
+             name = moment_data["name"]
+             content = moment_data["content"]
+             summary = moment_data.get("summary")
+             moment_type = moment_data.get("moment_type")
+             category = moment_data.get("category")
+             uri = moment_data.get("uri")
+
+             # Parse timestamps
+             starts_timestamp = moment_data.get("starts_timestamp") or moment_data.get("resource_timestamp")
+             if starts_timestamp:
+                 if isinstance(starts_timestamp, str):
+                     starts_timestamp = datetime.fromisoformat(starts_timestamp)
+             else:
+                 starts_timestamp = datetime.utcnow()
+
+             ends_timestamp = moment_data.get("ends_timestamp") or moment_data.get("resource_ends_timestamp")
+             if ends_timestamp and isinstance(ends_timestamp, str):
+                 ends_timestamp = datetime.fromisoformat(ends_timestamp)
+
+             emotion_tags = moment_data.get("emotion_tags", [])
+             topic_tags = moment_data.get("topic_tags", [])
+             present_persons_data = moment_data.get("present_persons", [])
+             metadata = moment_data.get("metadata", {})
+             graph_edges_data = moment_data.get("graph_edges", [])
+
+             # Create link to parent engram
+             parent_edge = InlineEdge(
+                 dst=parent_resource_name,
+                 rel_type="part_of",
+                 weight=1.0,
+                 properties={
+                     "dst_name": parent_resource_name,
+                     "dst_id": parent_resource_id,
+                     "dst_entity_type": "resource/engram",
+                     "match_type": "parent_child",
+                     "confidence": 1.0,
+                 },
+             )
+
+             # Combine moment edges with parent edge
+             all_edges = [parent_edge]
+             for edge_data in graph_edges_data:
+                 edge = InlineEdge(
+                     dst=edge_data["dst"],
+                     rel_type=edge_data["rel_type"],
+                     weight=edge_data.get("weight", 0.5),
+                     properties=edge_data.get("properties", {}),
+                 )
+                 all_edges.append(edge)
+
+             # Create Moment entity
+             moment = Moment(
+                 tenant_id=tenant_id,
+                 user_id=user_id,
+                 name=name,
+                 moment_type=moment_type,
+                 category=category,
+                 starts_timestamp=starts_timestamp,
+                 ends_timestamp=ends_timestamp,
+                 present_persons=present_persons_data,  # Validated by Pydantic
+                 emotion_tags=emotion_tags,
+                 topic_tags=topic_tags,
+                 summary=summary or content[:200],  # Fallback to content prefix
+                 source_resource_ids=[parent_resource_id],
+                 metadata=metadata,
+                 graph_edges=[edge.model_dump() for edge in all_edges],
+             )
+             moments.append(moment)
+
+         # Batch upsert all moments
+         if moments:
+             logger.info(f"Batch upserting {len(moments)} moments")
+             upsert_result = await self.postgres.batch_upsert(
+                 records=[m.model_dump(mode="json") for m in moments],
+                 model=Moment,
+                 table_name="moments",
+                 entity_key_field="name",  # Explicit entity_key for KV store
+             )
+             moment_ids = upsert_result["ids"]
+             logger.info(f"Upserted {len(moment_ids)} moments")
+             return moment_ids
+
+         return []
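
For orientation, here is a minimal, hypothetical driver for the processor above. The engram dict mirrors the fields that process_engram reads (kind, name, content, graph_edges, moments); the PostgresService constructor arguments and all field values are illustrative assumptions, not part of the package.

# Hypothetical usage sketch for EngramProcessor; values are illustrative.
import asyncio

from rem.services.postgres import PostgresService
from rem.workers.engram_processor import EngramProcessor

engram = {
    "kind": "engram",  # required; anything else raises ValueError
    "name": "standup-2024-06-01",
    "summary": "Daily standup notes",
    "content": "Discussed the release plan and open bugs.",
    "graph_edges": [
        {"dst": "release-plan", "rel_type": "references", "weight": 0.8},
    ],
    "moments": [
        {
            "name": "release-decision",
            "content": "Agreed to ship 0.2.6 on Friday.",
            "topic_tags": ["release"],
        }
    ],
}

async def main() -> None:
    # Assumes PostgresService picks up connection settings from the environment.
    processor = EngramProcessor(PostgresService())
    result = await processor.process_engram(engram, tenant_id="acme-corp", user_id="sarah-chen")
    print(result["resource_id"], result["moment_ids"])

asyncio.run(main())

Note that the moment gets a part_of edge back to "standup-2024-06-01" automatically; only the extra graph_edges need to be supplied in the input.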
rem/workers/sqs_file_processor.py
@@ -0,0 +1,193 @@
+ """
+ SQS File Processor Worker.
+
+ Consumes S3 ObjectCreated events from SQS queue and processes files.
+ Designed to run as a K8s deployment scaled by KEDA based on queue depth.
+ """
+
+ import json
+ import signal
+ import sys
+
+ import boto3
+ from botocore.exceptions import ClientError
+ from loguru import logger
+
+ from rem.services.content import ContentService
+ from rem.settings import settings
+
+
+ class SQSFileProcessor:
+     """
+     SQS-based file processor worker.
+
+     Polls SQS queue for S3 ObjectCreated events and processes files
+     using ContentService.
+
+     Gracefully handles:
+     - SIGTERM (K8s pod termination)
+     - SIGINT (Ctrl+C for local testing)
+     - SQS visibility timeout (message redelivery)
+     - DLQ (failed messages after retries)
+     """
+
+     def __init__(self):
+         self.sqs_client = self._create_sqs_client()
+         self.content_service = ContentService()
+         self.running = True
+         self.processing_message = False
+
+         # Register signal handlers
+         signal.signal(signal.SIGTERM, self._handle_shutdown)
+         signal.signal(signal.SIGINT, self._handle_shutdown)
+
+     def _create_sqs_client(self):
+         """Create SQS client with IRSA or configured credentials."""
+         return boto3.client("sqs", region_name=settings.sqs.region)
+
+     def _handle_shutdown(self, signum, frame):
+         """Handle graceful shutdown signals."""
+         logger.info(f"Received shutdown signal ({signum}), finishing current message...")
+         self.running = False
+
+     def run(self):
+         """
+         Main worker loop.
+
+         Long-polls the SQS queue, processes messages, and deletes them on success.
+         Exits gracefully on SIGTERM/SIGINT after completing the current message.
+         """
+         if not settings.sqs.queue_url:
+             logger.error("SQS_QUEUE_URL not configured")
+             sys.exit(1)
+
+         logger.info("Starting file processor worker")
+         logger.info(f"Queue: {settings.sqs.queue_url}")
+         logger.info(f"Polling interval: {settings.sqs.wait_time_seconds}s (long polling)")
+
+         while self.running:
+             try:
+                 # Long poll for messages
+                 response = self.sqs_client.receive_message(
+                     QueueUrl=settings.sqs.queue_url,
+                     MaxNumberOfMessages=settings.sqs.max_messages,
+                     WaitTimeSeconds=settings.sqs.wait_time_seconds,
+                     AttributeNames=["All"],
+                     MessageAttributeNames=["All"],
+                 )
+
+                 messages = response.get("Messages", [])
+                 if not messages:
+                     continue
+
+                 logger.info(f"Received {len(messages)} message(s)")
+
+                 # Process each message
+                 for message in messages:
+                     if not self.running:
+                         logger.info("Shutdown requested, stopping message processing")
+                         break
+
+                     self.processing_message = True
+                     try:
+                         self._process_message(message)
+                         self._delete_message(message)
+                     except Exception:
+                         # loguru has no exc_info kwarg; exception() records the traceback
+                         logger.exception("Failed to process message")
+                         # Message will be redelivered after visibility timeout
+                     finally:
+                         self.processing_message = False
+
+             except KeyboardInterrupt:
+                 logger.info("Keyboard interrupt, shutting down...")
+                 break
+             except Exception:
+                 logger.exception("Error in worker loop")
+
+         logger.info("Worker stopped")
+
+     def _process_message(self, message: dict):
+         """
+         Process a single SQS message containing S3 event(s).
+
+         S3 notification format:
+             {
+                 "Records": [{
+                     "eventName": "ObjectCreated:Put",
+                     "s3": {
+                         "bucket": {"name": "rem"},
+                         "object": {"key": "uploads/file.md", "size": 12345}
+                     }
+                 }]
+             }
+         """
+         body = json.loads(message["Body"])
+
+         for record in body.get("Records", []):
+             event_name = record.get("eventName", "")
+
+             if not event_name.startswith("ObjectCreated:"):
+                 logger.debug(f"Skipping non-create event: {event_name}")
+                 continue
+
+             bucket = record["s3"]["bucket"]["name"]
+             key = record["s3"]["object"]["key"]
+             size = record["s3"]["object"].get("size", 0)
+
+             logger.info(f"Processing {event_name}: s3://{bucket}/{key} ({size} bytes)")
+
+             try:
+                 # Process file using ContentService
+                 uri = f"s3://{bucket}/{key}"
+                 result = self.content_service.process_uri(uri)
+
+                 # TODO: Store in PostgreSQL with pgvector
+                 # TODO: Generate embeddings
+                 # TODO: Create graph edges
+                 # TODO: Update REM entities
+
+                 logger.info(
+                     f"Successfully processed s3://{bucket}/{key} "
+                     f"({len(result['content'])} chars, provider={result['provider']})"
+                 )
+
+                 # Log extraction metadata
+                 logger.debug(f"Metadata: {json.dumps(result['metadata'], default=str)}")
+
+             except FileNotFoundError as e:
+                 logger.error(f"File not found: {e}")
+                 # File is gone; re-raising skips deletion, so the message is
+                 # redelivered until it lands in the DLQ
+                 raise
+             except RuntimeError as e:
+                 logger.error(f"Processing error: {e}")
+                 # Retry on next visibility timeout
+                 raise
+             except Exception as e:
+                 logger.exception(f"Unexpected error processing s3://{bucket}/{key}: {e}")
+                 raise
+
+     def _delete_message(self, message: dict):
+         """Delete message from queue after successful processing."""
+         try:
+             self.sqs_client.delete_message(
+                 QueueUrl=settings.sqs.queue_url,
+                 ReceiptHandle=message["ReceiptHandle"],
+             )
+             logger.debug("Message deleted from queue")
+         except ClientError as e:
+             logger.error(f"Failed to delete message: {e}")
+             # Message will be redelivered, but processing was successful
+
+
+ def main():
+     """Entry point for containerized deployment."""
+     logger.info("REM File Processor Worker")
+     logger.info(f"Environment: {settings.environment}")
+
+     processor = SQSFileProcessor()
+     processor.run()
+
+
+ if __name__ == "__main__":
+     main()
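
For local testing, a hand-built S3 ObjectCreated event can be pushed through _process_message directly, bypassing SQS entirely. This is a sketch under stated assumptions: the event shape follows the notification format documented in the docstring above, the bucket and key are illustrative, the worker's SQS settings must be configured (the constructor builds a boto3 client), and ContentService must be able to resolve the s3:// URI with ambient credentials.

# Hypothetical local test: drive the worker with a fake S3 event, no queue needed.
import json

from rem.workers.sqs_file_processor import SQSFileProcessor

event = {
    "Records": [
        {
            "eventName": "ObjectCreated:Put",
            "s3": {
                "bucket": {"name": "rem"},                        # illustrative bucket
                "object": {"key": "uploads/file.md", "size": 12345},
            },
        }
    ]
}

processor = SQSFileProcessor()
# _process_message expects the raw SQS message dict with a JSON string Body
processor._process_message({"Body": json.dumps(event)})

Calling the private method is fine for a smoke test; in production the same path is exercised by run(), which also handles deletion and redelivery semantics.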