remdb-0.3.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +801 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.7.dist-info/METADATA +1473 -0
- remdb-0.3.7.dist-info/RECORD +187 -0
- remdb-0.3.7.dist-info/WHEEL +4 -0
- remdb-0.3.7.dist-info/entry_points.txt +2 -0

--- /dev/null
+++ b/rem/workers/engram_processor.py
@@ -0,0 +1,289 @@
+"""
+Engram Processor for REM.
+
+Processes engram YAML/JSON files into Resources and Moments following
+the p8fs-modules engram specification.
+
+Key Design Principles:
+- Engrams ARE Resources (category="engram")
+- Human-friendly labels in graph edges (not UUIDs)
+- Upsert with JSON merge behavior (never overwrite)
+- Dual indexing handled by repository (SQL + embeddings + KV)
+- Moment attachment via part_of relationship
+
+Processing Flow:
+1. Parse YAML/JSON engram
+2. Create Resource from top-level engram fields
+3. Upsert Resource (triggers embeddings + KV store population)
+4. Create Moments from moments array
+5. Link moments to parent engram via graph edges
+
+See: /Users/sirsh/code/p8fs-modules/p8fs/docs/04 engram-specification.md
+"""
+
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Optional
+
+import yaml
+
+from rem.models.core import InlineEdge
+from rem.models.entities import Moment, Resource
+from rem.services.postgres import PostgresService
+
+logger = logging.getLogger(__name__)
+
+
+class EngramProcessor:
+    """
+    Process engram files into REM Resources and Moments.
+
+    Example usage:
+        processor = EngramProcessor(postgres_service)
+        result = await processor.process_file(
+            file_path="/path/to/engram.yaml",
+            tenant_id="acme-corp",
+            user_id="sarah-chen"
+        )
+    """
+
+    def __init__(self, postgres_service: PostgresService):
+        """
+        Initialize engram processor.
+
+        Args:
+            postgres_service: PostgreSQL service for upsert operations
+        """
+        self.postgres = postgres_service
+
+    async def process_file(
+        self,
+        file_path: Path | str,
+        tenant_id: str,
+        user_id: Optional[str] = None,
+    ) -> dict[str, Any]:
+        """
+        Process an engram file (YAML or JSON).
+
+        Args:
+            file_path: Path to engram file
+            tenant_id: Tenant ID for multi-tenancy
+            user_id: Optional user ID (owner)
+
+        Returns:
+            Result dict with resource_id, moment_ids, chunks_created, etc.
+        """
+        file_path = Path(file_path)
+
+        # Parse file
+        with open(file_path) as f:
+            if file_path.suffix in (".yaml", ".yml"):
+                data = yaml.safe_load(f)
+            elif file_path.suffix == ".json":
+                import json
+
+                data = json.load(f)
+            else:
+                raise ValueError(f"Unsupported file format: {file_path.suffix}")
+
+        return await self.process_engram(data, tenant_id, user_id)
+
+    async def process_engram(
+        self,
+        data: dict[str, Any],
+        tenant_id: str,
+        user_id: Optional[str] = None,
+    ) -> dict[str, Any]:
+        """
+        Process engram data into REM entities.
+
+        Args:
+            data: Parsed engram data (dict from YAML/JSON)
+            tenant_id: Tenant ID
+            user_id: Optional user ID
+
+        Returns:
+            Result dict with resource_id, moment_ids, chunks_created, etc.
+        """
+        # Validate kind
+        if data.get("kind") != "engram":
+            raise ValueError(f"Expected kind='engram', got: {data.get('kind')}")
+
+        # Extract top-level engram fields
+        name = data["name"]
+        category = data.get("category", "engram")
+        summary = data.get("summary")
+        content = data.get("content", "")
+        uri = data.get("uri")
+        resource_timestamp = data.get("resource_timestamp")
+        if resource_timestamp:
+            resource_timestamp = datetime.fromisoformat(resource_timestamp)
+        else:
+            resource_timestamp = datetime.utcnow()
+
+        metadata = data.get("metadata", {})
+        graph_edges_data = data.get("graph_edges", [])
+        moments_data = data.get("moments", [])
+
+        # Convert graph edges to InlineEdge objects
+        graph_edges = []
+        for edge_data in graph_edges_data:
+            edge = InlineEdge(
+                dst=edge_data["dst"],
+                rel_type=edge_data["rel_type"],
+                weight=edge_data.get("weight", 0.5),
+                properties=edge_data.get("properties", {}),
+            )
+            graph_edges.append(edge)
+
+        # Create Resource (engram)
+        resource = Resource(
+            tenant_id=tenant_id,
+            user_id=user_id,
+            name=name,
+            category=category,
+            uri=uri,
+            content=content,
+            timestamp=resource_timestamp,
+            metadata=metadata,
+            graph_edges=[edge.model_dump() for edge in graph_edges],
+        )
+
+        # Upsert resource (triggers embeddings + KV store population)
+        logger.info(f"Upserting engram resource: {name}")
+        upsert_result = await self.postgres.batch_upsert(
+            records=[resource.model_dump(mode="json")],
+            model=Resource,
+            table_name="resources",
+            entity_key_field="name",  # Explicit entity_key for KV store
+        )
+        resource_id = upsert_result["ids"][0]
+        logger.info(f"Upserted resource: {resource_id}")
+
+        # Process attached moments
+        moment_ids = []
+        if moments_data:
+            logger.info(f"Processing {len(moments_data)} moments for engram: {name}")
+            moment_ids = await self._process_moments(
+                moments_data=moments_data,
+                parent_resource_name=name,
+                parent_resource_id=resource_id,
+                tenant_id=tenant_id,
+                user_id=user_id,
+            )
+
+        return {
+            "resource_id": resource_id,
+            "moment_ids": moment_ids,
+            "chunks_created": 1 + len(moment_ids),  # Resource + moments
+            "embeddings_generated": 0,  # Handled by embedding worker
+        }
+
+    async def _process_moments(
+        self,
+        moments_data: list[dict],
+        parent_resource_name: str,
+        parent_resource_id: str,
+        tenant_id: str,
+        user_id: Optional[str] = None,
+    ) -> list[str]:
+        """
+        Create Moments from moments array in engram.
+
+        Args:
+            moments_data: List of moment dicts from engram
+            parent_resource_name: Parent engram name
+            parent_resource_id: Parent engram resource ID
+            tenant_id: Tenant ID
+            user_id: Optional user ID
+
+        Returns:
+            List of moment IDs
+        """
+        moments = []
+        for moment_data in moments_data:
+            # Extract moment fields
+            name = moment_data["name"]
+            content = moment_data["content"]
+            summary = moment_data.get("summary")
+            moment_type = moment_data.get("moment_type")
+            category = moment_data.get("category")
+            uri = moment_data.get("uri")
+
+            # Parse timestamps
+            starts_timestamp = moment_data.get("starts_timestamp") or moment_data.get("resource_timestamp")
+            if starts_timestamp:
+                if isinstance(starts_timestamp, str):
+                    starts_timestamp = datetime.fromisoformat(starts_timestamp)
+            else:
+                starts_timestamp = datetime.utcnow()
+
+            ends_timestamp = moment_data.get("ends_timestamp") or moment_data.get("resource_ends_timestamp")
+            if ends_timestamp and isinstance(ends_timestamp, str):
+                ends_timestamp = datetime.fromisoformat(ends_timestamp)
+
+            emotion_tags = moment_data.get("emotion_tags", [])
+            topic_tags = moment_data.get("topic_tags", [])
+            present_persons_data = moment_data.get("present_persons", [])
+            metadata = moment_data.get("metadata", {})
+            graph_edges_data = moment_data.get("graph_edges", [])
+
+            # Create link to parent engram
+            parent_edge = InlineEdge(
+                dst=parent_resource_name,
+                rel_type="part_of",
+                weight=1.0,
+                properties={
+                    "dst_name": parent_resource_name,
+                    "dst_id": parent_resource_id,
+                    "dst_entity_type": "resource/engram",
+                    "match_type": "parent_child",
+                    "confidence": 1.0,
+                },
+            )
+
+            # Combine moment edges with parent edge
+            all_edges = [parent_edge]
+            for edge_data in graph_edges_data:
+                edge = InlineEdge(
+                    dst=edge_data["dst"],
+                    rel_type=edge_data["rel_type"],
+                    weight=edge_data.get("weight", 0.5),
+                    properties=edge_data.get("properties", {}),
+                )
+                all_edges.append(edge)
+
+            # Create Moment entity
+            moment = Moment(
+                tenant_id=tenant_id,
+                user_id=user_id,
+                name=name,
+                moment_type=moment_type,
+                category=category,
+                starts_timestamp=starts_timestamp,
+                ends_timestamp=ends_timestamp,
+                present_persons=present_persons_data,  # Will be validated by Pydantic
+                emotion_tags=emotion_tags,
+                topic_tags=topic_tags,
+                summary=summary or content[:200],  # Fallback to content prefix
+                source_resource_ids=[parent_resource_id],
+                metadata=metadata,
+                graph_edges=[edge.model_dump() for edge in all_edges],
+            )
+            moments.append(moment)
+
+        # Batch upsert all moments
+        if moments:
+            logger.info(f"Batch upserting {len(moments)} moments")
+            upsert_result = await self.postgres.batch_upsert(
+                records=[m.model_dump(mode="json") for m in moments],
+                model=Moment,
+                table_name="moments",
+                entity_key_field="name",  # Explicit entity_key for KV store
+            )
+            moment_ids = upsert_result["ids"]
+            logger.info(f"Upserted {len(moment_ids)} moments")
+            return moment_ids
+
+        return []
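For orientation, here is a minimal sketch of driving the processor above with an in-memory engram dict. The payload fields mirror exactly what `process_engram` reads; the no-argument `PostgresService()` construction and the example names (`acme-corp`, `2024-06-01-standup`, `project-apollo`) are illustrative assumptions, not part of the package.

```python
# Hypothetical driver for EngramProcessor. The engram dict below mirrors
# the fields process_engram() reads; PostgresService() construction is an
# assumption (see rem/services/postgres/service.py for the real signature).
import asyncio

from rem.services.postgres import PostgresService
from rem.workers.engram_processor import EngramProcessor

engram = {
    "kind": "engram",  # required; validated by process_engram
    "name": "2024-06-01-standup",  # human-friendly label, used as entity key
    "category": "engram",  # default when omitted
    "content": "Team standup notes ...",
    "resource_timestamp": "2024-06-01T09:30:00",
    "graph_edges": [
        {"dst": "project-apollo", "rel_type": "references", "weight": 0.8},
    ],
    "moments": [
        {
            "name": "standup-decision",
            "content": "Agreed to ship the search fix this week.",
            "topic_tags": ["search", "release"],
            "starts_timestamp": "2024-06-01T09:35:00",
        },
    ],
}


async def main() -> None:
    processor = EngramProcessor(PostgresService())  # assumed construction
    result = await processor.process_engram(engram, tenant_id="acme-corp")
    # Each moment was upserted with a part_of edge back to the engram.
    print(result["resource_id"], result["moment_ids"])


asyncio.run(main())
```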
--- /dev/null
+++ b/rem/workers/sqs_file_processor.py
@@ -0,0 +1,192 @@
+"""
+SQS File Processor Worker.
+
+Consumes S3 ObjectCreated events from SQS queue and processes files.
+Designed to run as a K8s deployment scaled by KEDA based on queue depth.
+"""
+
+import json
+import signal
+import sys
+
+import boto3
+from botocore.exceptions import ClientError
+from loguru import logger
+
+from rem.services.content import ContentService
+from rem.settings import settings
+
+
+class SQSFileProcessor:
+    """
+    SQS-based file processor worker.
+
+    Polls SQS queue for S3 ObjectCreated events and processes files
+    using ContentService.
+
+    Gracefully handles:
+    - SIGTERM (K8s pod termination)
+    - SIGINT (Ctrl+C for local testing)
+    - SQS visibility timeout (message redelivery)
+    - DLQ (failed messages after retries)
+    """
+
+    def __init__(self):
+        self.sqs_client = self._create_sqs_client()
+        self.content_service = ContentService()
+        self.running = True
+        self.processing_message = False
+
+        # Register signal handlers
+        signal.signal(signal.SIGTERM, self._handle_shutdown)
+        signal.signal(signal.SIGINT, self._handle_shutdown)
+
+    def _create_sqs_client(self):
+        """Create SQS client with IRSA or configured credentials."""
+        return boto3.client("sqs", region_name=settings.sqs.region)
+
+    def _handle_shutdown(self, signum, frame):
+        """Handle graceful shutdown signals."""
+        logger.info(f"Received shutdown signal ({signum}), finishing current message...")
+        self.running = False
+
+    def run(self):
+        """
+        Main worker loop.
+
+        Long polls SQS queue, processes messages, deletes on success.
+        Exits gracefully on SIGTERM/SIGINT after completing current message.
+        """
+        if not settings.sqs.queue_url:
+            logger.error("SQS_QUEUE_URL not configured")
+            sys.exit(1)
+
+        logger.info("Starting file processor worker")
+        logger.info(f"Queue: {settings.sqs.queue_url}")
+        logger.info(f"Polling interval: {settings.sqs.wait_time_seconds}s (long polling)")
+
+        while self.running:
+            try:
+                # Long poll for messages
+                response = self.sqs_client.receive_message(
+                    QueueUrl=settings.sqs.queue_url,
+                    MaxNumberOfMessages=settings.sqs.max_messages,
+                    WaitTimeSeconds=settings.sqs.wait_time_seconds,
+                    AttributeNames=["All"],
+                    MessageAttributeNames=["All"],
+                )
+
+                messages = response.get("Messages", [])
+                if not messages:
+                    continue
+
+                logger.info(f"Received {len(messages)} message(s)")
+
+                # Process each message
+                for message in messages:
+                    if not self.running:
+                        logger.info("Shutdown requested, stopping message processing")
+                        break
+
+                    self.processing_message = True
+                    try:
+                        self._process_message(message)
+                        self._delete_message(message)
+                    except Exception as e:
+                        logger.exception(f"Failed to process message: {e}")
+                        # Message will be redelivered after visibility timeout
+                    finally:
+                        self.processing_message = False
+
+            except KeyboardInterrupt:
+                logger.info("Keyboard interrupt, shutting down...")
+                break
+            except Exception as e:
+                logger.exception(f"Error in worker loop: {e}")
+
+        logger.info("Worker stopped")
+
+    def _process_message(self, message: dict):
+        """
+        Process a single SQS message containing S3 event(s).
+
+        S3 notification format:
+        {
+            "Records": [{
+                "eventName": "ObjectCreated:Put",
+                "s3": {
+                    "bucket": {"name": "rem"},
+                    "object": {"key": "uploads/file.md", "size": 12345}
+                }
+            }]
+        }
+        """
+        body = json.loads(message["Body"])
+
+        for record in body.get("Records", []):
+            event_name = record.get("eventName", "")
+
+            if not event_name.startswith("ObjectCreated:"):
+                logger.debug(f"Skipping non-create event: {event_name}")
+                continue
+
+            bucket = record["s3"]["bucket"]["name"]
+            key = record["s3"]["object"]["key"]
+            size = record["s3"]["object"].get("size", 0)
+
+            logger.info(f"Processing {event_name}: s3://{bucket}/{key} ({size} bytes)")
+
+            try:
+                # Process file using ContentService
+                uri = f"s3://{bucket}/{key}"
+                result = self.content_service.process_uri(uri)
+
+                # TODO: Store in PostgreSQL with pgvector
+                # TODO: Generate embeddings
+                # TODO: Create graph edges
+                # TODO: Update REM entities
+
+                logger.info(
+                    f"Successfully processed s3://{bucket}/{key} "
+                    f"({len(result['content'])} chars, provider={result['provider']})"
+                )
+
+                # Log extraction metadata
+                logger.debug(f"Metadata: {json.dumps(result['metadata'], default=str)}")
+
+            except FileNotFoundError as e:
+                logger.error(f"File not found: {e}")
+                # Don't retry - file is gone
+                raise
+            except RuntimeError as e:
+                logger.error(f"Processing error: {e}")
+                # Retry on next visibility timeout
+                raise
+            except Exception as e:
+                logger.exception(f"Unexpected error processing s3://{bucket}/{key}: {e}")
+                raise
+
+    def _delete_message(self, message: dict):
+        """Delete message from queue after successful processing."""
+        try:
+            self.sqs_client.delete_message(
+                QueueUrl=settings.sqs.queue_url,
+                ReceiptHandle=message["ReceiptHandle"],
+            )
+            logger.debug("Message deleted from queue")
+        except ClientError as e:
+            logger.error(f"Failed to delete message: {e}")
+            # Message will be redelivered, but processing was successful
+
+
+def main():
+    """Entry point for containerized deployment."""
+    logger.info("REM File Processor Worker")
+    logger.info(f"Environment: {settings.environment}")
+
+    processor = SQSFileProcessor()
+    processor.run()
+
+
+if __name__ == "__main__":
+    main()
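Because `_process_message` only reads the message `Body`, the S3 event path can be smoke-tested locally without a real queue. The sketch below is a hedged example: it assumes the SQS region setting is configured (the constructor creates a boto3 client) and that `ContentService` can actually fetch the referenced object; the bucket and key come from the notification format documented in the docstring above.

```python
# Minimal local smoke test for SQSFileProcessor._process_message.
# Assumptions: settings.sqs.region is set, AWS credentials are available,
# and s3://rem/uploads/file.md is readable by ContentService.
import json

from rem.workers.sqs_file_processor import SQSFileProcessor

event = {
    "Records": [
        {
            "eventName": "ObjectCreated:Put",
            "s3": {
                "bucket": {"name": "rem"},
                "object": {"key": "uploads/file.md", "size": 12345},
            },
        }
    ]
}

processor = SQSFileProcessor()
# Feed one synthetic message. Real SQS messages also carry a
# ReceiptHandle, but that is only needed by _delete_message.
processor._process_message({"Body": json.dumps(event)})
```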