PyPI - remdb - Versions diffs - 0.3.7__py3-none-any.whl → 0.3.14__py3-none-any.whl - Mend

remdb 0.3.7__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

rem/__init__.py +129 -2
rem/agentic/context.py +7 -5
rem/agentic/providers/phoenix.py +32 -43
rem/api/README.md +23 -0
rem/api/main.py +27 -2
rem/api/middleware/tracking.py +172 -0
rem/api/routers/auth.py +54 -0
rem/api/routers/chat/completions.py +1 -1
rem/cli/commands/ask.py +13 -10
rem/cli/commands/configure.py +4 -3
rem/cli/commands/db.py +17 -3
rem/cli/commands/experiments.py +76 -72
rem/cli/commands/process.py +8 -7
rem/cli/commands/scaffold.py +47 -0
rem/cli/main.py +2 -0
rem/models/entities/user.py +10 -3
rem/registry.py +367 -0
rem/services/content/providers.py +92 -133
rem/services/dreaming/affinity_service.py +2 -16
rem/services/dreaming/moment_service.py +2 -15
rem/services/embeddings/api.py +20 -13
rem/services/phoenix/EXPERIMENT_DESIGN.md +3 -3
rem/services/phoenix/client.py +148 -14
rem/services/postgres/schema_generator.py +86 -5
rem/services/rate_limit.py +113 -0
rem/services/rem/README.md +14 -0
rem/services/user_service.py +98 -0
rem/settings.py +79 -10
rem/sql/install_models.sql +13 -0
rem/sql/migrations/003_seed_default_user.sql +48 -0
rem/utils/constants.py +97 -0
rem/utils/date_utils.py +228 -0
rem/utils/embeddings.py +17 -4
rem/utils/files.py +167 -0
rem/utils/mime_types.py +158 -0
rem/utils/schema_loader.py +63 -14
rem/utils/vision.py +9 -14
rem/workers/README.md +14 -14
rem/workers/db_maintainer.py +74 -0
{remdb-0.3.7.dist-info → remdb-0.3.14.dist-info}/METADATA +169 -121
{remdb-0.3.7.dist-info → remdb-0.3.14.dist-info}/RECORD +43 -32
{remdb-0.3.7.dist-info → remdb-0.3.14.dist-info}/WHEEL +0 -0
{remdb-0.3.7.dist-info → remdb-0.3.14.dist-info}/entry_points.txt +0 -0

rem/settings.py CHANGED Viewed

@@ -57,8 +57,10 @@ Example .env file:
 """
 import os
-from pydantic import Field, field_validator
+import hashlib
+from pydantic import Field, field_validator, FieldValidationInfo
 from pydantic_settings import BaseSettings, SettingsConfigDict
+from loguru import logger
 class LLMSettings(BaseSettings):
@@ -386,6 +388,22 @@ class AuthSettings(BaseSettings):
     google: GoogleOAuthSettings = Field(default_factory=GoogleOAuthSettings)
     microsoft: MicrosoftOAuthSettings = Field(default_factory=MicrosoftOAuthSettings)
+    @field_validator("session_secret", mode="before")
+    @classmethod
+    def generate_dev_secret(cls, v: str | None, info: FieldValidationInfo) -> str:
+        # Only generate if not already set and not in production
+        if not v and info.data.get("environment") != "production":
+            # Deterministic secret for development
+            seed_string = f"{info.data.get('team', 'rem')}-{info.data.get('environment', 'development')}-auth-secret-salt"
+            logger.warning(
+                "AUTH__SESSION_SECRET not set. Generating deterministic secret for non-production environment. "
+                "DO NOT use in production."
+            )
+            return hashlib.sha256(seed_string.encode()).hexdigest()
+        elif not v and info.data.get("environment") == "production":
+            raise ValueError("AUTH__SESSION_SECRET must be set in production environment.")
+        return v
 class PostgresSettings(BaseSettings):
     """
@@ -962,6 +980,54 @@ class APISettings(BaseSettings):
     )
+class SchemaSettings(BaseSettings):
+    """
+    Schema search path settings for agent and evaluator schemas.
+    Allows extending REM's schema search with custom directories.
+    Custom paths are searched BEFORE built-in package schemas.
+    Environment variables:
+        SCHEMA__PATHS - Semicolon-separated list of directories to search
+                        Example: "/app/schemas;/shared/agents;./local-schemas"
+    Search Order:
+    1. Exact path (if file exists)
+    2. Custom paths from SCHEMA__PATHS (in order)
+    3. Built-in package schemas (schemas/agents/, schemas/evaluators/, etc.)
+    4. Database LOOKUP (if enabled)
+    Example:
+        # In .env or environment
+        SCHEMA__PATHS=/app/custom-agents;/shared/evaluators
+        # Then in code
+        from rem.utils.schema_loader import load_agent_schema
+        schema = load_agent_schema("my-custom-agent")  # Found in /app/custom-agents/
+    """
+    model_config = SettingsConfigDict(
+        env_prefix="SCHEMA__",
+        extra="ignore",
+    )
+    paths: str = Field(
+        default="",
+        description=(
+            "Semicolon-separated list of directories to search for schemas. "
+            "These paths are searched BEFORE built-in package schemas. "
+            "Example: '/app/schemas;/shared/agents'"
+        ),
+    )
+    @property
+    def path_list(self) -> list[str]:
+        """Get paths as a list, filtering empty strings."""
+        if not self.paths:
+            return []
+        return [p.strip() for p in self.paths.split(";") if p.strip()]
 class GitSettings(BaseSettings):
     """
     Git repository provider settings for versioned schema/experiment syncing.
@@ -1207,20 +1273,23 @@ class Settings(BaseSettings):
     sqs: SQSSettings = Field(default_factory=SQSSettings)
     chunking: ChunkingSettings = Field(default_factory=ChunkingSettings)
     content: ContentSettings = Field(default_factory=ContentSettings)
+    schema_search: SchemaSettings = Field(default_factory=SchemaSettings)
     test: TestSettings = Field(default_factory=TestSettings)
 # Load configuration from ~/.rem/config.yaml before initializing settings
 # This allows user configuration to be merged with environment variables
-try:
-    from rem.config import load_config, merge_config_to_env
-    _config = load_config()
-    if _config:
-        merge_config_to_env(_config)
-except ImportError:
-    # config module not available (e.g., during initial setup)
-    pass
+# Set REM_SKIP_CONFIG_FILE=true to disable (useful for development with .env)
+if not os.getenv("REM_SKIP_CONFIG_FILE", "").lower() in ("true", "1", "yes"):
+    try:
+        from rem.config import load_config, merge_config_to_env
+        _config = load_config()
+        if _config:
+            merge_config_to_env(_config)
+    except ImportError:
+        # config module not available (e.g., during initial setup)
+        pass
 # Global settings singleton
 settings = Settings()

rem/sql/install_models.sql CHANGED Viewed

@@ -29,6 +29,18 @@ BEGIN
     RAISE NOTICE 'Prerequisites check passed';
 END $$;
+-- ======================================================================
+-- RATE LIMITS (Unlogged for performance)
+-- ======================================================================
+CREATE UNLOGGED TABLE IF NOT EXISTS rate_limits (
+    key VARCHAR(255) PRIMARY KEY, -- e.g., "tenant_1:anon_abc:per_minute:TIMESTAMP"
+    count INTEGER NOT NULL DEFAULT 1,
+    expires_at TIMESTAMP WITH TIME ZONE NOT NULL
+);
+CREATE INDEX IF NOT EXISTS idx_rate_limits_expires_at ON rate_limits(expires_at);
 -- ======================================================================
 -- USERS (Model: User)
 -- ======================================================================
@@ -41,6 +53,7 @@ CREATE TABLE IF NOT EXISTS users (
     email VARCHAR(256),
     role VARCHAR(256),
     tier TEXT,
+    anonymous_ids TEXT[] DEFAULT ARRAY[]::TEXT[],
     sec_policy JSONB DEFAULT '{}'::jsonb,
     summary TEXT,
     interests TEXT[] DEFAULT ARRAY[]::TEXT[],

rem/sql/migrations/003_seed_default_user.sql ADDED Viewed

@@ -0,0 +1,48 @@
+-- ============================================================================
+-- Migration: 003_seed_default_user.sql
+-- Description: Seed the default system user for CLI and API operations
+--
+-- The default user is derived from settings.test.user_email (test@rem.ai)
+-- using deterministic UUID v5 generation. This ensures consistent user ID
+-- across all environments and test runs.
+--
+-- Default user:
+--   email: test@rem.ai
+--   user_id: 9e7dc22b-13bb-5cea-8aee-f6b8e6dc962f (UUID v5 from DNS namespace)
+--
+-- This user is used when:
+--   - CLI commands run without --user-id flag
+--   - API requests come without X-User-Id header
+--   - Tests run without explicit user context
+-- ============================================================================
+-- Insert default user (idempotent - skip if exists)
+INSERT INTO users (
+    id,
+    user_id,
+    tenant_id,
+    name,
+    email,
+    role,
+    tags,
+    metadata,
+    created_at,
+    updated_at
+) VALUES (
+    '9e7dc22b-13bb-5cea-8aee-f6b8e6dc962f'::uuid,
+    '9e7dc22b-13bb-5cea-8aee-f6b8e6dc962f',
+    '9e7dc22b-13bb-5cea-8aee-f6b8e6dc962f',
+    'Default User',
+    'test@rem.ai',
+    'system',
+    ARRAY['system', 'default'],
+    '{"description": "Default system user for CLI and API operations without explicit user context"}'::jsonb,
+    NOW(),
+    NOW()
+) ON CONFLICT (id) DO NOTHING;
+-- Log migration
+DO $$
+BEGIN
+    RAISE NOTICE 'Seeded default user: test@rem.ai (id: 9e7dc22b-13bb-5cea-8aee-f6b8e6dc962f)';
+END $$;

rem/utils/constants.py ADDED Viewed

@@ -0,0 +1,97 @@
+"""
+Centralized constants for the REM system.
+All magic numbers and commonly-used values should be defined here
+to ensure consistency and make tuning easier.
+"""
+# =============================================================================
+# Embedding Model Constants
+# =============================================================================
+# OpenAI embedding dimensions by model
+OPENAI_EMBEDDING_DIMS_SMALL = 1536  # text-embedding-3-small
+OPENAI_EMBEDDING_DIMS_LARGE = 3072  # text-embedding-3-large
+OPENAI_EMBEDDING_DIMS_ADA = 1536  # text-embedding-ada-002
+# Default embedding dimension (text-embedding-3-small)
+DEFAULT_EMBEDDING_DIMS = 1536
+# Voyage AI embedding dimensions
+VOYAGE_EMBEDDING_DIMS = 1024  # voyage-2
+# =============================================================================
+# HTTP/API Timeouts (seconds)
+# =============================================================================
+HTTP_TIMEOUT_DEFAULT = 30.0  # Standard API calls
+HTTP_TIMEOUT_LONG = 60.0  # Vision/embedding APIs
+HTTP_TIMEOUT_VERY_LONG = 300.0  # Subprocess/batch operations
+# Request timeout for httpx AsyncClient
+ASYNC_CLIENT_TIMEOUT = 300.0
+# =============================================================================
+# Audio Processing Constants
+# =============================================================================
+# Minimum valid WAV file size (header only)
+WAV_HEADER_MIN_BYTES = 44
+# OpenAI Whisper API cost per minute (USD)
+WHISPER_COST_PER_MINUTE = 0.006
+# Audio chunking parameters
+AUDIO_CHUNK_TARGET_SECONDS = 60.0  # Target chunk duration
+AUDIO_CHUNK_WINDOW_SECONDS = 2.0  # Window for silence detection
+SILENCE_THRESHOLD_DB = -40.0  # Silence detection threshold
+MIN_SILENCE_MS = 500  # Minimum silence duration to split on
+# =============================================================================
+# File Processing Constants
+# =============================================================================
+# Subprocess timeout for document parsing
+SUBPROCESS_TIMEOUT_SECONDS = 300  # 5 minutes
+# Maximum file sizes
+MAX_AUDIO_FILE_SIZE_MB = 25  # Whisper API limit
+# =============================================================================
+# Database/Query Constants
+# =============================================================================
+# Default batch sizes
+DEFAULT_BATCH_SIZE = 100
+EMBEDDING_BATCH_SIZE = 50
+# Default pagination limits
+DEFAULT_PAGE_SIZE = 20
+MAX_PAGE_SIZE = 100
+# =============================================================================
+# Rate Limiting
+# =============================================================================
+# Default retry settings
+DEFAULT_MAX_RETRIES = 3
+RETRY_BACKOFF_MULTIPLIER = 1
+RETRY_BACKOFF_MIN = 1
+RETRY_BACKOFF_MAX = 60
+# =============================================================================
+# S3/Storage Constants
+# =============================================================================
+S3_URI_PREFIX = "s3://"
+FILE_URI_PREFIX = "file://"
+# =============================================================================
+# LLM Constants
+# =============================================================================
+# Default max tokens for vision analysis
+VISION_MAX_TOKENS = 2048
+# Default temperature
+DEFAULT_TEMPERATURE = 0.0

rem/utils/date_utils.py ADDED Viewed

@@ -0,0 +1,228 @@
+"""
+Centralized datetime utilities for consistent UTC-naive datetime handling.
+IMPORTANT: REM uses UTC-naive datetimes throughout the codebase.
+PostgreSQL stores TIMESTAMP WITHOUT TIME ZONE, so all Python datetime
+operations should use UTC-naive datetimes to avoid comparison errors.
+Convention:
+- All timestamps are implicitly UTC
+- Use utc_now() instead of datetime.utcnow() or datetime.now(timezone.utc)
+- Use parse_iso() to parse ISO format strings (handles "Z" suffix)
+- Use to_iso() to format datetimes as ISO strings
+See CLAUDE.md Section 1 (Datetime Convention) for details.
+"""
+from datetime import datetime, timedelta
+from typing import Optional
+def utc_now() -> datetime:
+    """
+    Get current UTC time as a naive datetime.
+    Returns:
+        UTC-naive datetime representing current time.
+    Example:
+        >>> now = utc_now()
+        >>> now.tzinfo is None
+        True
+    """
+    return datetime.utcnow()
+def to_iso(dt: datetime) -> str:
+    """
+    Convert datetime to ISO 8601 format string.
+    Args:
+        dt: Datetime to format (should be UTC-naive)
+    Returns:
+        ISO format string (e.g., "2024-01-15T10:30:00")
+    Example:
+        >>> dt = datetime(2024, 1, 15, 10, 30, 0)
+        >>> to_iso(dt)
+        '2024-01-15T10:30:00'
+    """
+    return dt.isoformat()
+def to_iso_with_z(dt: datetime) -> str:
+    """
+    Convert datetime to ISO 8601 format with Z suffix.
+    Use this when interfacing with external APIs that expect
+    the Z suffix to indicate UTC.
+    Args:
+        dt: Datetime to format (should be UTC-naive)
+    Returns:
+        ISO format string with Z suffix (e.g., "2024-01-15T10:30:00Z")
+    """
+    return dt.isoformat() + "Z"
+def parse_iso(iso_string: str) -> datetime:
+    """
+    Parse ISO 8601 format string to UTC-naive datetime.
+    Handles:
+    - Standard ISO format: "2024-01-15T10:30:00"
+    - Z suffix: "2024-01-15T10:30:00Z"
+    - Timezone offset: "2024-01-15T10:30:00+00:00" (converts to naive)
+    - Microseconds: "2024-01-15T10:30:00.123456"
+    Args:
+        iso_string: ISO format datetime string
+    Returns:
+        UTC-naive datetime
+    Raises:
+        ValueError: If string cannot be parsed
+    Example:
+        >>> parse_iso("2024-01-15T10:30:00Z")
+        datetime.datetime(2024, 1, 15, 10, 30)
+        >>> parse_iso("2024-01-15T10:30:00+00:00")
+        datetime.datetime(2024, 1, 15, 10, 30)
+    """
+    # Handle Z suffix (replace with +00:00 for fromisoformat)
+    if iso_string.endswith("Z"):
+        iso_string = iso_string[:-1] + "+00:00"
+    # Parse the ISO string
+    dt = datetime.fromisoformat(iso_string)
+    # Convert to naive UTC if timezone-aware
+    if dt.tzinfo is not None:
+        # Convert to UTC and strip timezone
+        from datetime import timezone
+        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
+    return dt
+def parse_iso_safe(iso_string: Optional[str], default: Optional[datetime] = None) -> Optional[datetime]:
+    """
+    Safely parse ISO string, returning default on failure.
+    Args:
+        iso_string: ISO format string or None
+        default: Default value if parsing fails
+    Returns:
+        Parsed datetime or default value
+    """
+    if not iso_string:
+        return default
+    try:
+        return parse_iso(iso_string)
+    except (ValueError, TypeError):
+        return default
+def format_timestamp(dt: Optional[datetime] = None) -> str:
+    """
+    Format datetime for display/logging.
+    Args:
+        dt: Datetime to format (defaults to current UTC time)
+    Returns:
+        Formatted string like "2024-01-15 10:30:00 UTC"
+    """
+    if dt is None:
+        dt = utc_now()
+    return dt.strftime("%Y-%m-%d %H:%M:%S") + " UTC"
+def format_timestamp_compact(dt: Optional[datetime] = None) -> str:
+    """
+    Format datetime as compact string for filenames/IDs.
+    Args:
+        dt: Datetime to format (defaults to current UTC time)
+    Returns:
+        Formatted string like "20240115_103000"
+    """
+    if dt is None:
+        dt = utc_now()
+    return dt.strftime("%Y%m%d_%H%M%S")
+def format_timestamp_for_experiment(dt: Optional[datetime] = None) -> str:
+    """
+    Format datetime for experiment names.
+    Args:
+        dt: Datetime to format (defaults to current UTC time)
+    Returns:
+        Formatted string like "20240115-103000"
+    """
+    if dt is None:
+        dt = utc_now()
+    return dt.strftime("%Y%m%d-%H%M%S")
+def days_ago(days: int) -> datetime:
+    """
+    Get datetime N days ago from now.
+    Args:
+        days: Number of days ago
+    Returns:
+        UTC-naive datetime
+    """
+    return utc_now() - timedelta(days=days)
+def hours_ago(hours: int) -> datetime:
+    """
+    Get datetime N hours ago from now.
+    Args:
+        hours: Number of hours ago
+    Returns:
+        UTC-naive datetime
+    """
+    return utc_now() - timedelta(hours=hours)
+def is_within_hours(dt: datetime, hours: int) -> bool:
+    """
+    Check if datetime is within N hours of now.
+    Args:
+        dt: Datetime to check (should be UTC-naive)
+        hours: Number of hours
+    Returns:
+        True if dt is within the time window
+    """
+    cutoff = hours_ago(hours)
+    return dt >= cutoff
+def is_within_days(dt: datetime, days: int) -> bool:
+    """
+    Check if datetime is within N days of now.
+    Args:
+        dt: Datetime to check (should be UTC-naive)
+        days: Number of days
+    Returns:
+        True if dt is within the time window
+    """
+    cutoff = days_ago(days)
+    return dt >= cutoff

rem/utils/embeddings.py CHANGED Viewed

@@ -20,7 +20,6 @@ Usage:
     embeddings = generate_embeddings("openai:text-embedding-3-small", texts)
 """
-import os
 from typing import Any, cast
 import requests
@@ -31,6 +30,16 @@ from tenacity import (
     wait_exponential,
 )
+from rem.utils.constants import (
+    HTTP_TIMEOUT_LONG,
+    OPENAI_EMBEDDING_DIMS_SMALL,
+    OPENAI_EMBEDDING_DIMS_LARGE,
+    VOYAGE_EMBEDDING_DIMS,
+    RETRY_BACKOFF_MULTIPLIER,
+    RETRY_BACKOFF_MIN,
+    RETRY_BACKOFF_MAX,
+)
 class EmbeddingError(Exception):
     """Base exception for embedding generation errors."""
@@ -166,7 +175,11 @@ def _create_retry_decorator(max_retries: int):
     return retry(
         retry=retry_if_exception_type(RateLimitError),
         stop=stop_after_attempt(max_retries),
-        wait=wait_exponential(multiplier=1, min=1, max=60),
+        wait=wait_exponential(
+            multiplier=RETRY_BACKOFF_MULTIPLIER,
+            min=RETRY_BACKOFF_MIN,
+            max=RETRY_BACKOFF_MAX,
+        ),
         reraise=True,
     )
@@ -234,7 +247,7 @@ def _generate_openai_embeddings(
     }
     try:
-        response = requests.post(url, json=payload, headers=headers, timeout=60)
+        response = requests.post(url, json=payload, headers=headers, timeout=HTTP_TIMEOUT_LONG)
         # Handle rate limits
         if response.status_code == 429:
@@ -334,7 +347,7 @@ def _generate_voyage_embeddings(
     }
     try:
-        response = requests.post(url, json=payload, headers=headers, timeout=60)
+        response = requests.post(url, json=payload, headers=headers, timeout=HTTP_TIMEOUT_LONG)
         # Handle rate limits
         if response.status_code == 429:

remdb 0.3.7__py3-none-any.whl → 0.3.14__py3-none-any.whl

remdb 0.3.7__py3-none-any.whl → 0.3.14__py3-none-any.whl