remdb-0.3.7-py3-none-any.whl → remdb-0.3.14-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. rem/__init__.py +129 -2
  2. rem/agentic/context.py +7 -5
  3. rem/agentic/providers/phoenix.py +32 -43
  4. rem/api/README.md +23 -0
  5. rem/api/main.py +27 -2
  6. rem/api/middleware/tracking.py +172 -0
  7. rem/api/routers/auth.py +54 -0
  8. rem/api/routers/chat/completions.py +1 -1
  9. rem/cli/commands/ask.py +13 -10
  10. rem/cli/commands/configure.py +4 -3
  11. rem/cli/commands/db.py +17 -3
  12. rem/cli/commands/experiments.py +76 -72
  13. rem/cli/commands/process.py +8 -7
  14. rem/cli/commands/scaffold.py +47 -0
  15. rem/cli/main.py +2 -0
  16. rem/models/entities/user.py +10 -3
  17. rem/registry.py +367 -0
  18. rem/services/content/providers.py +92 -133
  19. rem/services/dreaming/affinity_service.py +2 -16
  20. rem/services/dreaming/moment_service.py +2 -15
  21. rem/services/embeddings/api.py +20 -13
  22. rem/services/phoenix/EXPERIMENT_DESIGN.md +3 -3
  23. rem/services/phoenix/client.py +148 -14
  24. rem/services/postgres/schema_generator.py +86 -5
  25. rem/services/rate_limit.py +113 -0
  26. rem/services/rem/README.md +14 -0
  27. rem/services/user_service.py +98 -0
  28. rem/settings.py +79 -10
  29. rem/sql/install_models.sql +13 -0
  30. rem/sql/migrations/003_seed_default_user.sql +48 -0
  31. rem/utils/constants.py +97 -0
  32. rem/utils/date_utils.py +228 -0
  33. rem/utils/embeddings.py +17 -4
  34. rem/utils/files.py +167 -0
  35. rem/utils/mime_types.py +158 -0
  36. rem/utils/schema_loader.py +63 -14
  37. rem/utils/vision.py +9 -14
  38. rem/workers/README.md +14 -14
  39. rem/workers/db_maintainer.py +74 -0
  40. {remdb-0.3.7.dist-info → remdb-0.3.14.dist-info}/METADATA +169 -121
  41. {remdb-0.3.7.dist-info → remdb-0.3.14.dist-info}/RECORD +43 -32
  42. {remdb-0.3.7.dist-info → remdb-0.3.14.dist-info}/WHEEL +0 -0
  43. {remdb-0.3.7.dist-info → remdb-0.3.14.dist-info}/entry_points.txt +0 -0
@@ -53,7 +53,7 @@ from datetime import datetime
  from pathlib import Path
  from typing import Any, Callable, TYPE_CHECKING, cast

- import pandas as pd
+ import polars as pl
  from loguru import logger

  from .config import PhoenixConfig
@@ -64,6 +64,95 @@ if TYPE_CHECKING:
      from phoenix.client.resources.experiments.types import RanExperiment


+ def dataframe_to_phoenix_dataset(
+     client: "PhoenixClient",
+     df: pl.DataFrame,
+     dataset_name: str,
+     input_keys: list[str] | None = None,
+     output_keys: list[str] | None = None,
+     metadata_keys: list[str] | None = None,
+     description: str | None = None,
+ ) -> "Dataset":
+     """Convert a Polars DataFrame to a Phoenix Dataset.
+
+     This function transforms a Polars DataFrame into a Phoenix Dataset by:
+     1. Extracting input columns (what agents receive)
+     2. Extracting output columns (ground truth/expected output)
+     3. Extracting metadata columns (optional labels, difficulty, etc.)
+
+     If column keys are not specified, uses smart defaults:
+     - input_keys: columns containing 'input', 'query', 'question', or 'prompt'
+     - output_keys: columns containing 'output', 'expected', 'answer', or 'response'
+     - metadata_keys: remaining columns
+
+     Args:
+         client: PhoenixClient instance
+         df: Polars DataFrame with experiment data
+         dataset_name: Name for the created Phoenix dataset
+         input_keys: Optional list of column names for inputs
+         output_keys: Optional list of column names for outputs (ground truth)
+         metadata_keys: Optional list of column names for metadata
+         description: Optional dataset description
+
+     Returns:
+         Phoenix Dataset instance
+
+     Example:
+         >>> df = pl.read_csv("golden_set.csv")
+         >>> dataset = dataframe_to_phoenix_dataset(
+         ...     client=phoenix_client,
+         ...     df=df,
+         ...     dataset_name="my-golden-set",
+         ...     input_keys=["query"],
+         ...     output_keys=["expected_output"],
+         ...     metadata_keys=["difficulty"]
+         ... )
+     """
+     columns = df.columns
+
+     # Smart defaults for column detection
+     if input_keys is None:
+         input_keys = [c for c in columns if any(
+             k in c.lower() for k in ["input", "query", "question", "prompt"]
+         )]
+         if not input_keys:
+             # Fallback: first column
+             input_keys = [columns[0]] if columns else []
+
+     if output_keys is None:
+         output_keys = [c for c in columns if any(
+             k in c.lower() for k in ["output", "expected", "answer", "response", "reference"]
+         )]
+         if not output_keys:
+             # Fallback: second column
+             output_keys = [columns[1]] if len(columns) > 1 else []
+
+     if metadata_keys is None:
+         used_keys = set(input_keys) | set(output_keys)
+         metadata_keys = [c for c in columns if c not in used_keys]
+
+     logger.debug(
+         f"DataFrame to Phoenix Dataset: inputs={input_keys}, "
+         f"outputs={output_keys}, metadata={metadata_keys}"
+     )
+
+     # Convert to list of dicts
+     records = df.to_dicts()
+
+     inputs = [{k: row.get(k) for k in input_keys} for row in records]
+     outputs = [{k: row.get(k) for k in output_keys} for row in records]
+     metadata = [{k: row.get(k) for k in metadata_keys} for row in records] if metadata_keys else None
+
+     # Create Phoenix dataset
+     return client.create_dataset_from_data(
+         name=dataset_name,
+         inputs=inputs,
+         outputs=outputs,
+         metadata=metadata,
+         description=description,
+     )
+
+
  class PhoenixClient:
      """High-level Phoenix client for REM evaluation workflows.

@@ -260,19 +349,22 @@ class PhoenixClient:
              "SEARCH semantic AI engineer",sarah-chen,person,medium,SEARCH
          """
          try:
-             # Load CSV
-             df = pd.read_csv(csv_file_path)
+             # Load CSV with Polars
+             df = pl.read_csv(csv_file_path)
+
+             # Convert to list of dicts
+             records = df.to_dicts()

              # Extract inputs
-             inputs = cast(list[dict[str, Any]], df[input_keys].to_dict("records"))
+             inputs = [{k: row.get(k) for k in input_keys} for row in records]

              # Extract outputs
-             outputs = cast(list[dict[str, Any]], df[output_keys].to_dict("records"))
+             outputs = [{k: row.get(k) for k in output_keys} for row in records]

              # Extract metadata if specified
              metadata = None
              if metadata_keys:
-                 metadata = cast(list[dict[str, Any]], df[metadata_keys].to_dict("records"))
+                 metadata = [{k: row.get(k) for k in metadata_keys} for row in records]

              return self.create_dataset_from_data(
                  name=name,
@@ -331,13 +423,16 @@ class PhoenixClient:

      def run_experiment(
          self,
-         dataset: "Dataset" | str,
+         dataset: "Dataset" | str | pl.DataFrame,
          task: Callable[[Any], Any] | None = None,
          evaluators: list[Callable[[Any], Any]] | None = None,
          experiment_name: str | None = None,
          experiment_description: str | None = None,
          experiment_metadata: dict[str, Any] | None = None,
          experiment_config: Any | None = None,
+         input_keys: list[str] | None = None,
+         output_keys: list[str] | None = None,
+         metadata_keys: list[str] | None = None,
      ) -> "RanExperiment":
          """Run an evaluation experiment.

@@ -346,14 +441,22 @@
          2. Agent run: Provide task function to execute agents on dataset
          3. Evaluator run: Provide evaluators to score existing outputs

+         Dataset can be:
+         - Phoenix Dataset instance
+         - Dataset name (string) - will be loaded from Phoenix
+         - Polars DataFrame - will be converted to Phoenix Dataset
+
          Args:
-             dataset: Dataset instance or name (required unless experiment_config provided)
+             dataset: Dataset instance, name, or Polars DataFrame
              task: Optional task function to run on each example (agent execution)
              evaluators: Optional list of evaluator functions
              experiment_name: Optional experiment name
              experiment_description: Optional description
              experiment_metadata: Optional metadata dict
              experiment_config: Optional ExperimentConfig instance (overrides other params)
+             input_keys: Column names for inputs (required if dataset is DataFrame)
+             output_keys: Column names for outputs (required if dataset is DataFrame)
+             metadata_keys: Optional column names for metadata

          Returns:
              RanExperiment with results
@@ -369,6 +472,16 @@
              ...     experiment_name="rem-v1-baseline"
              ... )

+         Example - With Polars DataFrame:
+             >>> df = pl.read_csv("golden_set.csv")
+             >>> experiment = client.run_experiment(
+             ...     dataset=df,
+             ...     task=run_agent,
+             ...     experiment_name="rem-v1-baseline",
+             ...     input_keys=["query"],
+             ...     output_keys=["expected_output"]
+             ... )
+
          Example - Evaluator Run (Phase 2b):
              >>> experiment = client.run_experiment(
              ...     dataset=agent_results,
@@ -407,6 +520,21 @@
          else:
              dataset = dataset_ref.path

+         # Convert Polars DataFrame to Phoenix Dataset
+         if isinstance(dataset, pl.DataFrame):
+             dataset_name_for_phoenix = f"{experiment_name or 'experiment'}-dataset-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+             logger.info(f"Converting Polars DataFrame to Phoenix Dataset: {dataset_name_for_phoenix}")
+             dataset = dataframe_to_phoenix_dataset(
+                 client=self,
+                 df=dataset,
+                 dataset_name=dataset_name_for_phoenix,
+                 input_keys=input_keys,
+                 output_keys=output_keys,
+                 metadata_keys=metadata_keys,
+                 description=f"Auto-created from DataFrame for experiment: {experiment_name}",
+             )
+             logger.info(f"✓ Created Phoenix Dataset: {dataset_name_for_phoenix}")
+
          # Load dataset if name provided
          if isinstance(dataset, str):
              dataset = self.get_dataset(dataset)
@@ -454,7 +582,7 @@
          root_spans_only: bool = True,
          trace_id: str | None = None,
          span_id: str | None = None,
-     ) -> pd.DataFrame:
+     ) -> pl.DataFrame:
          """Query traces from Phoenix.

          Args:
@@ -467,7 +595,7 @@
              span_id: Filter by specific span ID

          Returns:
-             DataFrame with trace data
+             Polars DataFrame with trace data

          Example:
              >>> traces = client.get_traces(
@@ -492,8 +620,11 @@
          if span_id:
              query_params["span_id"] = span_id

-         # Query traces
-         traces_df = self._client.query_spans(limit=limit, **query_params)  # type: ignore[attr-defined]
+         # Query traces (Phoenix returns pandas DataFrame)
+         pandas_df = self._client.query_spans(limit=limit, **query_params)  # type: ignore[attr-defined]
+
+         # Convert pandas to Polars
+         traces_df = pl.from_pandas(pandas_df)

          logger.debug(f"Retrieved {len(traces_df)} traces")
          return traces_df
@@ -535,7 +666,7 @@
              ... )
          """
          try:
-             # Query traces
+             # Query traces (returns Polars DataFrame)
              traces_df = self.get_traces(
                  project_name=project_name,
                  start_time=start_time,
@@ -547,12 +678,15 @@
              if len(traces_df) == 0:
                  raise ValueError("No traces found matching criteria")

+             # Convert to list of dicts for iteration
+             records = traces_df.to_dicts()
+
              # Extract inputs and outputs from traces
              inputs = []
              outputs = []
              metadata = []

-             for _, row in traces_df.iterrows():
+             for row in records:
                  # Extract input
                  span_input = row.get("attributes.input")
                  if span_input:
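
The hunks above migrate the Phoenix client from pandas to Polars and let `run_experiment` accept a DataFrame directly. As a minimal sketch of how the smart-default column detection in `dataframe_to_phoenix_dataset` behaves when no keys are passed (illustrative only: the sample data, the `rem.services.phoenix.client` import path, and the pre-configured `phoenix_client` instance are assumptions, not taken from the diff):

```python
# Illustrative sketch, not from the package.
# Assumptions: dataframe_to_phoenix_dataset is importable from
# rem.services.phoenix.client, and `phoenix_client` is an already-configured
# PhoenixClient instance connected to a Phoenix server.
import polars as pl

from rem.services.phoenix.client import dataframe_to_phoenix_dataset

df = pl.DataFrame(
    {
        "query": ["LOOKUP sarah-chen", "SEARCH semantic AI engineer"],
        "expected_output": ["person record", "ranked matches"],
        "difficulty": ["easy", "medium"],
    }
)

# With input_keys/output_keys/metadata_keys left as None:
#   "query"           -> matched by the input heuristics ("query" substring)
#   "expected_output" -> matched by the output heuristics ("expected"/"output")
#   "difficulty"      -> remaining column, so it lands in metadata
dataset = dataframe_to_phoenix_dataset(
    client=phoenix_client,  # assumed configured PhoenixClient
    df=df,
    dataset_name="golden-set-smoke-test",
    description="Smart-default column detection demo",
)
```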
@@ -1,7 +1,12 @@
  """
  Schema generation utility from Pydantic models.

- Scans a directory of Pydantic models and generates complete database schemas including:
+ Generates complete database schemas from:
+ 1. REM's core models (Resource, Moment, User, etc.)
+ 2. Models registered via rem.register_model() or rem.register_models()
+ 3. Models discovered from a directory scan
+
+ Output includes:
  - Primary tables
  - Embeddings tables
  - KV_STORE triggers
@@ -11,8 +16,12 @@ Scans a directory of Pydantic models and generates complete database schemas inc
  Usage:
      from rem.services.postgres.schema_generator import SchemaGenerator

+     # Generate from registry (includes core + registered models)
      generator = SchemaGenerator()
-     schema = generator.generate_from_directory("src/rem/models/entities")
+     schema = await generator.generate_from_registry()
+
+     # Or generate from directory (legacy)
+     schema = await generator.generate_from_directory("src/rem/models/entities")

      # Write to file
      with open("src/rem/sql/schema.sql", "w") as f:
@@ -228,12 +237,65 @@ class SchemaGenerator:
          self.schemas[table_name] = schema
          return schema

+     async def generate_from_registry(
+         self, output_file: str | None = None, include_core: bool = True
+     ) -> str:
+         """
+         Generate complete schema from the model registry.
+
+         Includes:
+         1. REM's core models (if include_core=True)
+         2. Models registered via rem.register_model() or rem.register_models()
+
+         Args:
+             output_file: Optional output file path (relative to output_dir)
+             include_core: If True, include REM's core models (default: True)
+
+         Returns:
+             Complete SQL schema as string
+
+         Example:
+             import rem
+             from rem.models.core import CoreModel
+
+             # Register custom model
+             @rem.register_model
+             class CustomEntity(CoreModel):
+                 name: str
+
+             # Generate schema (includes core + custom)
+             generator = SchemaGenerator()
+             schema = await generator.generate_from_registry()
+         """
+         from ...registry import get_model_registry
+
+         registry = get_model_registry()
+         models = registry.get_models(include_core=include_core)
+
+         logger.info(f"Generating schema from registry: {len(models)} models")
+
+         # Generate schemas for each model
+         for model_name, ext in models.items():
+             await self.generate_schema_for_model(
+                 ext.model,
+                 table_name=ext.table_name,
+                 entity_key_field=ext.entity_key_field,
+             )
+
+         return self._generate_sql_output(
+             source="model registry",
+             output_file=output_file,
+         )
+
      async def generate_from_directory(
          self, directory: str | Path, output_file: str | None = None
      ) -> str:
          """
          Generate complete schema from all models in a directory.

+         Note: For most use cases, prefer generate_from_registry() which uses
+         the model registry pattern.
+
          Args:
              directory: Path to directory with Pydantic models
              output_file: Optional output file path (relative to output_dir)
@@ -248,12 +310,31 @@
          for model_name, model in models.items():
              await self.generate_schema_for_model(model)

-         # Combine into single SQL file
+         return self._generate_sql_output(
+             source=f"directory: {directory}",
+             output_file=output_file,
+         )
+
+     def _generate_sql_output(
+         self, source: str, output_file: str | None = None
+     ) -> str:
+         """
+         Generate SQL output from accumulated schemas.
+
+         Args:
+             source: Description of schema source (for header comment)
+             output_file: Optional output file path (relative to output_dir)
+
+         Returns:
+             Complete SQL schema as string
+         """
+         import datetime
+
          sql_parts = [
              "-- REM Model Schema (install_models.sql)",
              "-- Generated from Pydantic models",
-             f"-- Source directory: {directory}",
-             "-- Generated at: " + __import__("datetime").datetime.now().isoformat(),
+             f"-- Source: {source}",
+             f"-- Generated at: {datetime.datetime.now().isoformat()}",
              "--",
              "-- DO NOT EDIT MANUALLY - Regenerate with: rem db schema generate",
              "--",
@@ -0,0 +1,113 @@
+ """
+ Rate Limit Service - Postgres-backed rate limiting.
+
+ Implements tenant-aware, tiered rate limiting using PostgreSQL UNLOGGED tables
+ for high performance. Supports monthly quotas and short-term burst limits.
+ """
+
+ import random
+ from datetime import datetime, timezone
+ from enum import Enum
+ from typing import Optional
+
+ from loguru import logger
+
+ from ..models.entities.user import UserTier
+ from .postgres.service import PostgresService
+
+
+ class RateLimitService:
+     """
+     Service for tracking and enforcing API rate limits.
+
+     Uses an UNLOGGED table `rate_limits` for performance.
+     Note: Counts in UNLOGGED tables may be lost on database crash/restart.
+     """
+
+     def __init__(self, db: PostgresService):
+         self.db = db
+
+         # Rate limits configuration
+         # Format: (limit, period_seconds)
+         # This is a simple implementation. In production, move to settings.
+         self.TIER_CONFIG = {
+             UserTier.ANONYMOUS: {"limit": 1000, "period": 3600},  # 1000/hour (for testing)
+             UserTier.FREE: {"limit": 50, "period": 2592000},  # 50/month (~30 days)
+             UserTier.BASIC: {"limit": 10000, "period": 2592000},  # 10k/month
+             UserTier.PRO: {"limit": 100000, "period": 2592000},  # 100k/month
+         }
+
+     async def check_rate_limit(
+         self,
+         tenant_id: str,
+         identifier: str,
+         tier: UserTier
+     ) -> tuple[bool, int, int]:
+         """
+         Check if request is allowed under the rate limit.
+
+         Args:
+             tenant_id: Tenant identifier
+             identifier: User ID or Anonymous ID
+             tier: User subscription tier
+
+         Returns:
+             Tuple (is_allowed, current_count, limit)
+         """
+         config = self.TIER_CONFIG.get(tier, self.TIER_CONFIG[UserTier.FREE])
+         limit = config["limit"]
+         period = config["period"]
+
+         # Construct time-window key
+         now = datetime.now(timezone.utc)
+
+         if period >= 2592000:  # Monthly
+             time_key = now.strftime("%Y-%m")
+         elif period >= 86400:  # Daily
+             time_key = now.strftime("%Y-%m-%d")
+         elif period >= 3600:  # Hourly
+             time_key = now.strftime("%Y-%m-%d-%H")
+         else:  # Minute/Second (fallback)
+             time_key = int(now.timestamp() / period)
+
+         key = f"{tenant_id}:{identifier}:{tier.value}:{time_key}"
+
+         # Calculate expiry (for cleanup)
+         expires_at = now.timestamp() + period
+
+         # Atomic UPSERT to increment counter
+         # Returns the new count
+         query = """
+             INSERT INTO rate_limits (key, count, expires_at)
+             VALUES ($1, 1, to_timestamp($2))
+             ON CONFLICT (key) DO UPDATE
+             SET count = rate_limits.count + 1
+             RETURNING count;
+         """
+
+         try:
+             count = await self.db.fetchval(query, key, expires_at)
+         except Exception as e:
+             logger.error(f"Rate limit check failed: {e}")
+             # Fail open to avoid blocking users on DB error
+             return True, 0, limit
+
+         is_allowed = count <= limit
+
+         # Probabilistic cleanup (1% chance)
+         if random.random() < 0.01:
+             await self.cleanup_expired()
+
+         return is_allowed, count, limit
+
+     async def cleanup_expired(self):
+         """Remove expired rate limit keys."""
+         try:
+             # Use a small limit to avoid locking/long queries
+             query = """
+                 DELETE FROM rate_limits
+                 WHERE expires_at < NOW()
+             """
+             await self.db.execute(query)
+         except Exception as e:
+             logger.warning(f"Rate limit cleanup failed: {e}")
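
The new `RateLimitService` buckets counters under a `tenant:identifier:tier:time-window` key and fails open on database errors. A hedged usage sketch from a request handler follows; the FastAPI wiring, the `enforce_limit` helper, and the error payload are assumptions, and the backing `rate_limits` table (`key`, `count`, `expires_at`) is expected to exist already:

```python
# Usage sketch, not from the package: enforcing the quota before handling a
# request. How PostgresService is constructed and how the request reaches this
# helper are assumed; only check_rate_limit's signature comes from the diff.
from fastapi import HTTPException

from rem.models.entities.user import UserTier
from rem.services.postgres.service import PostgresService
from rem.services.rate_limit import RateLimitService


async def enforce_limit(db: PostgresService, tenant_id: str, user_id: str, tier: UserTier) -> None:
    limiter = RateLimitService(db)
    allowed, count, limit = await limiter.check_rate_limit(tenant_id, user_id, tier)
    if not allowed:
        # Surface the quota state to the caller; status code and detail text are illustrative
        raise HTTPException(
            status_code=429,
            detail=f"Rate limit exceeded: {count}/{limit} requests this period",
        )
```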
@@ -302,3 +302,17 @@ See `tests/integration/test_rem_query_evolution.py` for stage-based validation a
  * **Unified View**: The underlying SQL function `rem_traverse` uses a view `all_graph_edges` that unions `graph_edges` from all entity tables (`resources`, `moments`, `users`, etc.). This enables polymorphic traversal without complex joins in the application layer.
  * **KV Store**: Edge destinations (`dst`) are resolved to entity IDs using the `kv_store`. This requires that all traversable entities have an entry in the `kv_store` (handled automatically by database triggers).
  * **Iterated Retrieval**: REM is architected for multi-turn retrieval where LLMs conduct conversational database exploration. Each query informs the next, enabling emergent information discovery without requiring upfront schema knowledge.
+
+ ## Scaling & Architectural Decisions
+
+ ### 1. Hybrid Adjacency List
+ REM implements a **Hybrid Adjacency List** pattern to balance strict relational guarantees with graph flexibility:
+ * **Primary Storage (Source of Truth):** Standard PostgreSQL tables (`resources`, `moments`, etc.) enforce schema validation, constraints, and type safety.
+ * **Graph Overlay:** Relationships are stored as "inline edges" within a JSONB column (`graph_edges`) on each entity.
+ * **Performance Layer:** A denormalized `UNLOGGED` table (`kv_store`) acts as a high-speed cache, mapping human-readable keys to internal UUIDs and edges. This avoids the traditional "join bomb" of traversing normalized SQL tables while avoiding the operational complexity of a separate graph database (e.g., Neo4j).
+
+ ### 2. The Pareto Principle in Graph Algorithms
+ We explicitly choose **Simplicity over Full-Scale Graph Analytics**.
+ * **Hypothesis:** For LLM Agent workloads, 80% of the value is derived from **local context retrieval** (1-3 hops via `LOOKUP` and `TRAVERSE`).
+ * **Diminishing Returns:** Global graph algorithms (PageRank, Community Detection) offer diminishing returns for real-time agentic retrieval tasks. Agents typically need to answer specific questions ("Who worked on file X?"), which is a local neighborhood problem, not a global cluster analysis problem.
+ * **Future Scaling:** If deeper analysis is needed, we prefer **Graph + Vector (RAG)** approaches (using semantic similarity to find implicit links) over complex explicit graph algorithms.
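
To make the hybrid pattern concrete, here is an illustrative sketch of what an inline edge and its `kv_store` resolution might look like. Apart from `graph_edges`, `kv_store`, and `dst`, every field and column name below is an assumption for illustration, not taken from the package:

```python
# Illustrative only: a possible shape for one inline edge stored in an
# entity's graph_edges JSONB column, and the kv_store row that resolves
# its human-readable `dst` key to an internal UUID without any joins.
inline_edge = {
    "dst": "sarah-chen",      # human-readable key of the target entity (README names `dst`)
    "rel": "worked_on",       # hypothetical relationship label
    "weight": 0.8,            # hypothetical affinity score
}

kv_store_row = {
    "key": "sarah-chen",                      # human-readable key
    "entity_id": "7f3a2c9e-...",              # resolved internal UUID (truncated, illustrative)
    "entity_table": "users",                  # which primary table owns the entity
}
```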
@@ -0,0 +1,98 @@
+ """
+ User Service - User account management.
+
+ Handles user creation, profile updates, and session linking.
+ """
+
+ from datetime import datetime
+ from typing import Optional
+
+ from loguru import logger
+
+ from ..models.entities.user import User, UserTier
+ from .postgres.repository import Repository
+ from .postgres.service import PostgresService
+
+
+ class UserService:
+     """
+     Service for managing user accounts and sessions.
+     """
+
+     def __init__(self, db: PostgresService):
+         self.db = db
+         self.repo = Repository(User, "users", db=db)
+
+     async def get_or_create_user(
+         self,
+         email: str,
+         tenant_id: str = "default",
+         name: str = "New User",
+         avatar_url: Optional[str] = None,
+     ) -> User:
+         """
+         Get existing user by email or create a new one.
+         """
+         users = await self.repo.find(filters={"email": email}, limit=1)
+
+         if users:
+             user = users[0]
+             # Update profile if needed (e.g., name/avatar from OAuth)
+             updated = False
+             if name and user.name == "New User":  # Only update if placeholder
+                 user.name = name
+                 updated = True
+
+             # Store avatar in metadata if provided
+             if avatar_url:
+                 user.metadata = user.metadata or {}
+                 if user.metadata.get("avatar_url") != avatar_url:
+                     user.metadata["avatar_url"] = avatar_url
+                     updated = True
+
+             if updated:
+                 user.updated_at = datetime.utcnow()
+                 await self.repo.upsert(user)
+
+             return user
+
+         # Create new user
+         user = User(
+             tenant_id=tenant_id,
+             user_id=email,  # Use email as user_id for now? Or UUID?
+             # The User model has 'user_id' field but also 'id' UUID.
+             # Usually user_id is the external ID or email.
+             name=name,
+             email=email,
+             tier=UserTier.FREE,
+             created_at=datetime.utcnow(),
+             updated_at=datetime.utcnow(),
+             metadata={"avatar_url": avatar_url} if avatar_url else {},
+         )
+         await self.repo.upsert(user)
+         logger.info(f"Created new user: {email}")
+         return user
+
+     async def link_anonymous_session(self, user: User, anon_id: str) -> None:
+         """
+         Link an anonymous session ID to a user account.
+
+         This allows merging history from the anonymous session into the user's profile.
+         """
+         if not anon_id:
+             return
+
+         # Check if already linked
+         if anon_id in user.anonymous_ids:
+             return
+
+         # Add to list
+         user.anonymous_ids.append(anon_id)
+         user.updated_at = datetime.utcnow()
+
+         # Save
+         await self.repo.upsert(user)
+         logger.info(f"Linked anonymous session {anon_id} to user {user.email}")
+
+         # TODO: Migrate/Merge actual data (rate limit counts, history) if needed.
+         # For now, we just link the IDs so future queries can include data from this anon_id.
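
A brief usage sketch of the new `UserService` in a login flow; how the `PostgresService` connection and the anonymous session ID are obtained is assumed here, and the example name is illustrative:

```python
# Usage sketch, not from the package: typical post-login flow that upserts the
# account and attaches any pre-login anonymous session to it.
from rem.services.postgres.service import PostgresService
from rem.services.user_service import UserService


async def handle_login(db: PostgresService, email: str, anon_id: str | None) -> None:
    users = UserService(db)
    # Creates a FREE-tier user on first login; otherwise updates placeholder profile fields
    user = await users.get_or_create_user(email=email, name="Sarah Chen")
    if anon_id:
        # Merge the pre-login session so its history stays reachable from the account
        await users.link_anonymous_session(user, anon_id)
```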