remdb 0.3.163__py3-none-any.whl → 0.3.181__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of remdb has been flagged as potentially problematic.

@@ -23,6 +23,8 @@ Future:
  import asyncio
  import os
  from typing import Any, Optional
+ import hashlib
+ import uuid
  from uuid import uuid4

  import httpx
@@ -108,6 +110,7 @@ class EmbeddingWorker:
  self.task_queue: asyncio.Queue = asyncio.Queue()
  self.workers: list[asyncio.Task] = []
  self.running = False
+ self._in_flight_count = 0 # Track tasks being processed (not just in queue)

  # Store API key for direct HTTP requests
  from ...settings import settings
@@ -143,17 +146,18 @@ class EmbeddingWorker:
  return

  queue_size = self.task_queue.qsize()
- logger.debug(f"Stopping EmbeddingWorker (processing {queue_size} queued tasks first)")
+ in_flight = self._in_flight_count
+ logger.debug(f"Stopping EmbeddingWorker (queue={queue_size}, in_flight={in_flight})")

- # Wait for queue to drain (with timeout)
+ # Wait for both queue to drain AND in-flight tasks to complete
  max_wait = 30 # 30 seconds max
  waited = 0.0
- while not self.task_queue.empty() and waited < max_wait:
+ while (not self.task_queue.empty() or self._in_flight_count > 0) and waited < max_wait:
  await asyncio.sleep(0.5)
  waited += 0.5

- if not self.task_queue.empty():
- remaining = self.task_queue.qsize()
+ if not self.task_queue.empty() or self._in_flight_count > 0:
+ remaining = self.task_queue.qsize() + self._in_flight_count
  logger.warning(
  f"EmbeddingWorker timeout: {remaining} tasks remaining after {max_wait}s"
  )
@@ -205,12 +209,18 @@ class EmbeddingWorker:
  if not batch:
  continue

- logger.debug(f"Worker {worker_id} processing batch of {len(batch)} tasks")
+ # Track in-flight tasks
+ self._in_flight_count += len(batch)

- # Generate embeddings for batch
- await self._process_batch(batch)
+ logger.debug(f"Worker {worker_id} processing batch of {len(batch)} tasks")

- logger.debug(f"Worker {worker_id} completed batch")
+ try:
+ # Generate embeddings for batch
+ await self._process_batch(batch)
+ logger.debug(f"Worker {worker_id} completed batch")
+ finally:
+ # Always decrement in-flight count, even on error
+ self._in_flight_count -= len(batch)

  except asyncio.CancelledError:
  logger.debug(f"Worker {worker_id} cancelled")
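The worker loop now bumps _in_flight_count before a batch is processed and decrements it in a finally block, so the stop() change above can wait for work that has already left the queue, not just for the queue to empty. A minimal standalone sketch of the same drain-then-stop pattern (a toy worker for illustration, not the actual EmbeddingWorker API):

import asyncio

class DrainingWorker:
    # Toy illustration of the queue + in-flight counter pattern.
    def __init__(self) -> None:
        self.queue: asyncio.Queue = asyncio.Queue()
        self.in_flight = 0

    async def _worker(self) -> None:
        while True:
            item = await self.queue.get()
            self.in_flight += 1              # count before processing starts
            try:
                await asyncio.sleep(0.1)     # simulate work on `item`
            finally:
                self.in_flight -= 1          # always decrement, even on error
                self.queue.task_done()

    async def stop(self, max_wait: float = 30.0) -> None:
        waited = 0.0
        # Drain the queue AND anything already being processed.
        while (not self.queue.empty() or self.in_flight > 0) and waited < max_wait:
            await asyncio.sleep(0.5)
            waited += 0.5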
@@ -373,7 +383,11 @@ class EmbeddingWorker:
  for task, embedding in zip(tasks, embeddings):
  table_name = f"embeddings_{task.table_name}"

- # Build upsert SQL
+ # Generate deterministic ID from key fields (entity_id, field_name, provider)
+ key_string = f"{task.entity_id}:{task.field_name}:{task.provider}"
+ embedding_id = str(uuid.UUID(hashlib.md5(key_string.encode()).hexdigest()))
+
+ # Build upsert SQL - conflict on deterministic ID
  sql = f"""
  INSERT INTO {table_name} (
  id,
@@ -386,7 +400,7 @@ class EmbeddingWorker:
  updated_at
  )
  VALUES ($1, $2, $3, $4, $5, $6, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
- ON CONFLICT (entity_id, field_name, provider)
+ ON CONFLICT (id)
  DO UPDATE SET
  model = EXCLUDED.model,
  embedding = EXCLUDED.embedding,
@@ -400,7 +414,7 @@ class EmbeddingWorker:
  await self.postgres_service.execute(
  sql,
  (
- str(uuid4()),
+ embedding_id,
  task.entity_id,
  task.field_name,
  task.provider,
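Because the row id is now derived from (entity_id, field_name, provider) rather than a random uuid4, re-embedding the same field produces the same id and the ON CONFLICT (id) clause above turns the insert into an update. A quick sketch of the derivation and its stability:

import hashlib
import uuid

def embedding_row_id(entity_id: str, field_name: str, provider: str) -> str:
    # Same key fields the diff hashes, joined as "entity_id:field_name:provider".
    key_string = f"{entity_id}:{field_name}:{provider}"
    # An MD5 digest is 128 bits, so it maps directly onto a UUID.
    return str(uuid.UUID(hashlib.md5(key_string.encode()).hexdigest()))

# Identical inputs always yield the identical id, so repeated upserts collide on (id).
assert embedding_row_id("42", "content", "openai") == embedding_row_id("42", "content", "openai")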
@@ -268,7 +268,7 @@ BEGIN
  graph_edges,
  updated_at
  ) VALUES (
- NEW.{entity_key_field}::VARCHAR,
+ normalize_key(NEW.{entity_key_field}::VARCHAR),
  '{table_name}',
  NEW.id,
  NEW.tenant_id,
@@ -74,7 +74,7 @@ class Repository(Generic[T]):
  self,
  records: T | list[T],
  embeddable_fields: list[str] | None = None,
- generate_embeddings: bool = False,
+ generate_embeddings: bool = True,
  ) -> T | list[T]:
  """
  Upsert single record or list of records (create or update on ID conflict).
@@ -84,8 +84,9 @@ class Repository(Generic[T]):

  Args:
  records: Single model instance or list of model instances
- embeddable_fields: Optional list of fields to generate embeddings for
- generate_embeddings: Whether to queue embedding generation tasks
+ embeddable_fields: Optional list of fields to generate embeddings for.
+ If None, auto-detects 'content' field if present.
+ generate_embeddings: Whether to queue embedding generation tasks (default: True)

  Returns:
  Single record or list of records with generated IDs (matches input type)
@@ -118,25 +119,35 @@ class Repository(Generic[T]):
  record.id = row["id"] # type: ignore[attr-defined]

  # Queue embedding generation if requested and worker is available
- if generate_embeddings and embeddable_fields and self.db.embedding_worker:
+ if generate_embeddings and self.db.embedding_worker:
  from rem.services.embeddings import EmbeddingTask
-
- for record in records_list:
- for field_name in embeddable_fields:
- content = getattr(record, field_name, None)
- if content and isinstance(content, str):
- task = EmbeddingTask(
- task_id=f"{record.id}-{field_name}", # type: ignore[attr-defined]
- entity_id=str(record.id), # type: ignore[attr-defined]
- table_name=self.table_name,
- field_name=field_name,
- content=content,
- provider="openai", # Default provider
- model="text-embedding-3-small", # Default model
- )
- await self.db.embedding_worker.queue_task(task)
-
- logger.debug(f"Queued {len(records_list) * len(embeddable_fields)} embedding tasks")
+ from .register_type import should_embed_field
+
+ # Auto-detect embeddable fields if not specified
+ if embeddable_fields is None:
+ embeddable_fields = [
+ field_name
+ for field_name, field_info in self.model_class.model_fields.items()
+ if should_embed_field(field_name, field_info)
+ ]
+
+ if embeddable_fields:
+ for record in records_list:
+ for field_name in embeddable_fields:
+ content = getattr(record, field_name, None)
+ if content and isinstance(content, str):
+ task = EmbeddingTask(
+ task_id=f"{record.id}-{field_name}", # type: ignore[attr-defined]
+ entity_id=str(record.id), # type: ignore[attr-defined]
+ table_name=self.table_name,
+ field_name=field_name,
+ content=content,
+ provider="openai", # Default provider
+ model="text-embedding-3-small", # Default model
+ )
+ await self.db.embedding_worker.queue_task(task)
+
+ logger.debug(f"Queued {len(records_list) * len(embeddable_fields)} embedding tasks")

  # Return single item or list to match input type
  return records_list[0] if is_single else records_list
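With generate_embeddings defaulting to True and embeddable fields auto-detected through should_embed_field, callers get embedding tasks queued without passing a field list. A hypothetical usage sketch (the moment record and the repo variable are illustrative assumptions, not code from this diff):

async def save_moment(repo, moment) -> None:
    # Default path: embedding tasks are queued for auto-detected fields
    # (e.g. a 'content' field, per should_embed_field), since
    # generate_embeddings now defaults to True.
    await repo.upsert(moment)

    # Opting out, or pinning the fields explicitly, still works:
    await repo.upsert(moment, generate_embeddings=False)
    await repo.upsert(moment, embeddable_fields=["content"])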
@@ -351,10 +351,10 @@ class SchemaGenerator:

  Priority:
  1. Field with json_schema_extra={"entity_key": True}
- 2. Field named "name"
+ 2. Field named "name" (human-readable identifier)
  3. Field named "key"
- 4. Field named "label"
- 5. First string field
+ 4. Field named "uri"
+ 5. Field named "id" (fallback)

  Args:
  model: Pydantic model class
@@ -369,9 +369,9 @@
  if json_extra.get("entity_key"):
  return field_name

- # Check for key fields in priority order: id -> uri -> key -> name
+ # Check for key fields in priority order: name -> key -> uri -> id
  # (matching sql_builder.get_entity_key convention)
- for candidate in ["id", "uri", "key", "name"]:
+ for candidate in ["name", "key", "uri", "id"]:
  if candidate in model.model_fields:
  return candidate

@@ -35,10 +35,11 @@ def get_natural_key(model: BaseModel) -> str | None:

  def get_entity_key(model: BaseModel) -> str:
  """
- Get entity key for KV store following precedence: id -> uri -> key -> name.
+ Get entity key for KV store following precedence: name -> key -> uri -> id.

- For KV store lookups, we prefer globally unique identifiers first (id),
- then natural keys (uri/key/name). Always returns a value (id as fallback).
+ For KV store lookups, we prefer human-readable identifiers first (name/key),
+ then URIs, with id as the fallback. This allows users to lookup entities
+ by their natural names like "panic-disorder" instead of UUIDs.

  Args:
  model: Pydantic model instance
@@ -46,13 +47,13 @@ def get_entity_key(model: BaseModel) -> str:
  Returns:
  Entity key string (guaranteed to exist)
  """
- for field in ["id", "uri", "key", "name"]:
+ for field in ["name", "key", "uri", "id"]:
  if hasattr(model, field):
  value = getattr(model, field)
  if value:
  return str(value)
  # Should never reach here since id always exists in CoreModel
- raise ValueError(f"Model {type(model)} has no id, uri, key, or name field")
+ raise ValueError(f"Model {type(model)} has no name, key, uri, or id field")


  def generate_deterministic_id(user_id: str | None, entity_key: str) -> uuid.UUID:
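The reordered precedence means a model carrying both a human-readable name and a UUID id now resolves to the name as its KV entity key. A small illustration with a stand-in Pydantic model (not a remdb model):

import uuid
from pydantic import BaseModel

class Disorder(BaseModel):  # stand-in model for illustration only
    id: uuid.UUID
    name: str

d = Disorder(id=uuid.uuid4(), name="panic-disorder")

# Old precedence (id -> uri -> key -> name) would have picked the UUID;
# the new precedence (name -> key -> uri -> id) picks the readable key.
for field in ["name", "key", "uri", "id"]:
    if hasattr(d, field) and getattr(d, field):
        print(field, "->", getattr(d, field))  # prints: name -> panic-disorder
        break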
@@ -4,7 +4,8 @@ User Service - User account management.
  Handles user creation, profile updates, and session linking.
  """

- from datetime import datetime
+ from rem.utils.date_utils import utc_now
+ from rem.utils.user_id import email_to_user_id
  from typing import Optional

  from loguru import logger
@@ -51,22 +52,24 @@ class UserService:
  updated = True

  if updated:
- user.updated_at = datetime.utcnow()
+ user.updated_at = utc_now()
  await self.repo.upsert(user)

  return user

  # Create new user
+ # id and user_id = UUID5 hash of email (deterministic bijection)
+ # name = email (entity_key for LOOKUP by email in KV store)
+ hashed_id = email_to_user_id(email)
  user = User(
+ id=hashed_id, # Database id = hash of email
  tenant_id=tenant_id,
- user_id=email, # Use email as user_id for now? Or UUID?
- # The User model has 'user_id' field but also 'id' UUID.
- # Usually user_id is the external ID or email.
- name=name,
+ user_id=hashed_id, # user_id = hash of email (same as id)
+ name=email, # Email as entity_key for REM LOOKUP
  email=email,
  tier=UserTier.FREE,
- created_at=datetime.utcnow(),
- updated_at=datetime.utcnow(),
+ created_at=utc_now(),
+ updated_at=utc_now(),
  metadata={"avatar_url": avatar_url} if avatar_url else {},
  )
  await self.repo.upsert(user)
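The comments above describe id and user_id as a deterministic UUID5 hash of the email, with the email itself stored in name so REM LOOKUP can find the user by address. email_to_user_id is not shown in this diff; the following is only a plausible sketch of such a helper, and both the namespace and the normalization step are assumptions:

import uuid

def email_to_user_id(email: str) -> str:
    # Assumed implementation -- rem.utils.user_id.email_to_user_id is not part of this diff.
    normalized = email.strip().lower()  # normalization step is an assumption
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, normalized))  # namespace choice is an assumption

# Deterministic: the same email always maps to the same user id.
assert email_to_user_id("ada@example.com") == email_to_user_id("ada@example.com")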
@@ -117,7 +120,7 @@ class UserService:

  # Add to list
  user.anonymous_ids.append(anon_id)
- user.updated_at = datetime.utcnow()
+ user.updated_at = utc_now()

  # Save
  await self.repo.upsert(user)
rem/settings.py CHANGED
@@ -77,6 +77,7 @@ class LLMSettings(BaseSettings):
  LLM__ANTHROPIC_API_KEY or ANTHROPIC_API_KEY - Anthropic API key
  LLM__EMBEDDING_PROVIDER or EMBEDDING_PROVIDER - Default embedding provider (openai)
  LLM__EMBEDDING_MODEL or EMBEDDING_MODEL - Default embedding model name
+ LLM__DEFAULT_STRUCTURED_OUTPUT - Default structured output mode (False = streaming text)
  """

  model_config = SettingsConfigDict(
@@ -138,6 +139,11 @@ class LLMSettings(BaseSettings):
  description="Default embedding model (provider-specific model name)",
  )

+ default_structured_output: bool = Field(
+ default=False,
+ description="Default structured output mode for agents. False = streaming text (easier), True = JSON schema validation",
+ )
+
  @field_validator("openai_api_key", mode="before")
  @classmethod
  def validate_openai_api_key(cls, v):
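Like the other LLM__ variables documented in the class docstring, the new flag can be flipped through the environment without code changes; how agents consume it is outside this diff. A small sketch of the env mapping (assuming the same nested-delimiter convention as the existing LLM__ settings):

import os

# "LLM__DEFAULT_STRUCTURED_OUTPUT" maps onto LLMSettings.default_structured_output
# via the double-underscore nesting used by the other LLM__ variables.
os.environ["LLM__DEFAULT_STRUCTURED_OUTPUT"] = "true"
# Instantiating LLMSettings() would then report default_structured_output=True;
# left unset, it stays False and agents default to streaming text output.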
@@ -1028,7 +1034,7 @@ class ChatSettings(BaseSettings):
  - Prevents context window bloat while maintaining conversation continuity

  User Context (on-demand by default):
- - Agent system prompt includes: "User ID: {user_id}. To load user profile: Use REM LOOKUP users/{user_id}"
+ - Agent system prompt includes: "User: {email}. To load user profile: Use REM LOOKUP \"{email}\""
  - Agent decides whether to load profile based on query
  - More efficient for queries that don't need personalization

@@ -21,6 +21,11 @@ CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_embeddings_moments_vector_hnsw
  ON embeddings_moments
  USING hnsw (embedding vector_cosine_ops);

+ -- HNSW vector index for embeddings_ontologies
+ CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_embeddings_ontologies_vector_hnsw
+ ON embeddings_ontologies
+ USING hnsw (embedding vector_cosine_ops);
+
  -- HNSW vector index for embeddings_ontology_configs
  CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_embeddings_ontology_configs_vector_hnsw
  ON embeddings_ontology_configs
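The new index mirrors the existing per-table HNSW indexes, so cosine-distance searches against embeddings_ontologies can use an index scan instead of a sequential scan. A hedged sketch of the kind of query it serves, using pgvector's cosine-distance operator (the projected columns follow the embeddings insert above; the query text itself is illustrative, not from the package):

# Illustrative query text only; not a function from this package.
ONTOLOGY_SEMANTIC_SEARCH_SQL = """
SELECT id, entity_id, 1 - (embedding <=> $1) AS cosine_similarity
FROM embeddings_ontologies
ORDER BY embedding <=> $1   -- served by idx_embeddings_ontologies_vector_hnsw (vector_cosine_ops)
LIMIT $2
"""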
@@ -44,6 +44,33 @@ BEGIN
  RAISE NOTICE '✓ All required extensions installed successfully';
  END $$;

+ -- ============================================================================
+ -- NORMALIZATION HELPER
+ -- ============================================================================
+
+ -- Normalize entity keys to lower-kebab-case for consistent lookups
+ -- "Mood Disorder" -> "mood-disorder"
+ -- "mood_disorder" -> "mood-disorder"
+ -- "MoodDisorder" -> "mood-disorder"
+ CREATE OR REPLACE FUNCTION normalize_key(input TEXT)
+ RETURNS TEXT AS $$
+ BEGIN
+ RETURN lower(
+ regexp_replace(
+ regexp_replace(
+ regexp_replace(input, '([a-z])([A-Z])', '\1-\2', 'g'), -- camelCase -> kebab
+ '[_\s]+', '-', 'g' -- underscores/spaces -> hyphens
+ ),
+ '-+', '-', 'g' -- collapse multiple hyphens
+ )
+ );
+ END;
+ $$ LANGUAGE plpgsql IMMUTABLE;
+
+ COMMENT ON FUNCTION normalize_key IS
+ 'Normalizes entity keys to lower-kebab-case for consistent lookups.
+ Examples: "Mood Disorder" -> "mood-disorder", "mood_disorder" -> "mood-disorder"';
+
  -- ============================================================================
  -- MIGRATION TRACKING
  -- ============================================================================
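normalize_key canonicalizes keys both at write time (the trigger change earlier that wraps NEW.{entity_key_field}) and at read time (the lookup changes below), so "Mood Disorder", "mood_disorder", and "MoodDisorder" all resolve to the same KV entry. A Python equivalent of the same three regex passes, handy for predicting keys client-side (a sketch, not part of the package):

import re

def normalize_key(value: str) -> str:
    # Mirrors the SQL normalize_key(): lower-kebab-case.
    value = re.sub(r"([a-z])([A-Z])", r"\1-\2", value)  # camelCase -> camel-Case
    value = re.sub(r"[_\s]+", "-", value)               # underscores/spaces -> hyphens
    value = re.sub(r"-+", "-", value)                   # collapse repeated hyphens
    return value.lower()

assert normalize_key("Mood Disorder") == "mood-disorder"
assert normalize_key("mood_disorder") == "mood-disorder"
assert normalize_key("MoodDisorder") == "mood-disorder"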
@@ -237,10 +264,11 @@ BEGIN

  -- First lookup in KV store to get entity_type (table name)
  -- Include user-owned AND public (NULL user_id) entries
+ -- Normalize input key for consistent matching
  SELECT kv.entity_type INTO entity_table
  FROM kv_store kv
  WHERE (kv.user_id = effective_user_id OR kv.user_id IS NULL)
- AND kv.entity_key = p_entity_key
+ AND kv.entity_key = normalize_key(p_entity_key)
  LIMIT 1;

  -- If not found, return empty
@@ -414,6 +442,7 @@ BEGIN
  FOR graph_keys IN
  WITH RECURSIVE graph_traversal AS (
  -- Base case: Find starting entity (user-owned OR public)
+ -- Normalize input key for consistent matching
  SELECT
  0 AS depth,
  kv.entity_key,
@@ -424,7 +453,7 @@ BEGIN
  ARRAY[kv.entity_key]::TEXT[] AS path
  FROM kv_store kv
  WHERE (kv.user_id = effective_user_id OR kv.user_id IS NULL)
- AND kv.entity_key = p_entity_key
+ AND kv.entity_key = normalize_key(p_entity_key)

  UNION ALL

@@ -441,7 +470,7 @@ BEGIN
  JOIN kv_store source_kv ON source_kv.entity_key = gt.entity_key
  AND (source_kv.user_id = effective_user_id OR source_kv.user_id IS NULL)
  CROSS JOIN LATERAL jsonb_array_elements(COALESCE(source_kv.graph_edges, '[]'::jsonb)) AS edge
- JOIN kv_store target_kv ON target_kv.entity_key = (edge->>'dst')::VARCHAR(255)
+ JOIN kv_store target_kv ON target_kv.entity_key = normalize_key((edge->>'dst')::VARCHAR(255))
  AND (target_kv.user_id = effective_user_id OR target_kv.user_id IS NULL)
  WHERE gt.depth < p_max_depth
  AND (p_rel_type IS NULL OR (edge->>'rel_type')::VARCHAR(100) = p_rel_type)
@@ -657,7 +686,7 @@ BEGIN
  MIN(msg_counts.first_msg)::TIMESTAMP AS first_message_at,
  MAX(msg_counts.last_msg)::TIMESTAMP AS last_message_at
  FROM shared_sessions ss
- LEFT JOIN users u ON u.user_id = ss.owner_user_id AND u.tenant_id = ss.tenant_id
+ LEFT JOIN users u ON u.id::text = ss.owner_user_id AND u.tenant_id = ss.tenant_id
  LEFT JOIN (
  SELECT
  m.session_id,