PyPI - remdb - Versions diffs - 0.3.242__py3-none-any.whl - Mend

remdb 0.3.242__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of remdb might be problematic. Click here for more details.

Files changed (235) hide show

rem/__init__.py +129 -0
rem/agentic/README.md +760 -0
rem/agentic/__init__.py +54 -0
rem/agentic/agents/README.md +155 -0
rem/agentic/agents/__init__.py +38 -0
rem/agentic/agents/agent_manager.py +311 -0
rem/agentic/agents/sse_simulator.py +502 -0
rem/agentic/context.py +425 -0
rem/agentic/context_builder.py +360 -0
rem/agentic/llm_provider_models.py +301 -0
rem/agentic/mcp/__init__.py +0 -0
rem/agentic/mcp/tool_wrapper.py +273 -0
rem/agentic/otel/__init__.py +5 -0
rem/agentic/otel/setup.py +240 -0
rem/agentic/providers/phoenix.py +926 -0
rem/agentic/providers/pydantic_ai.py +854 -0
rem/agentic/query.py +117 -0
rem/agentic/query_helper.py +89 -0
rem/agentic/schema.py +737 -0
rem/agentic/serialization.py +245 -0
rem/agentic/tools/__init__.py +5 -0
rem/agentic/tools/rem_tools.py +242 -0
rem/api/README.md +657 -0
rem/api/deps.py +253 -0
rem/api/main.py +460 -0
rem/api/mcp_router/prompts.py +182 -0
rem/api/mcp_router/resources.py +820 -0
rem/api/mcp_router/server.py +243 -0
rem/api/mcp_router/tools.py +1605 -0
rem/api/middleware/tracking.py +172 -0
rem/api/routers/admin.py +520 -0
rem/api/routers/auth.py +898 -0
rem/api/routers/chat/__init__.py +5 -0
rem/api/routers/chat/child_streaming.py +394 -0
rem/api/routers/chat/completions.py +702 -0
rem/api/routers/chat/json_utils.py +76 -0
rem/api/routers/chat/models.py +202 -0
rem/api/routers/chat/otel_utils.py +33 -0
rem/api/routers/chat/sse_events.py +546 -0
rem/api/routers/chat/streaming.py +950 -0
rem/api/routers/chat/streaming_utils.py +327 -0
rem/api/routers/common.py +18 -0
rem/api/routers/dev.py +87 -0
rem/api/routers/feedback.py +276 -0
rem/api/routers/messages.py +620 -0
rem/api/routers/models.py +86 -0
rem/api/routers/query.py +362 -0
rem/api/routers/shared_sessions.py +422 -0
rem/auth/README.md +258 -0
rem/auth/__init__.py +36 -0
rem/auth/jwt.py +367 -0
rem/auth/middleware.py +318 -0
rem/auth/providers/__init__.py +16 -0
rem/auth/providers/base.py +376 -0
rem/auth/providers/email.py +215 -0
rem/auth/providers/google.py +163 -0
rem/auth/providers/microsoft.py +237 -0
rem/cli/README.md +517 -0
rem/cli/__init__.py +8 -0
rem/cli/commands/README.md +299 -0
rem/cli/commands/__init__.py +3 -0
rem/cli/commands/ask.py +549 -0
rem/cli/commands/cluster.py +1808 -0
rem/cli/commands/configure.py +495 -0
rem/cli/commands/db.py +828 -0
rem/cli/commands/dreaming.py +324 -0
rem/cli/commands/experiments.py +1698 -0
rem/cli/commands/mcp.py +66 -0
rem/cli/commands/process.py +388 -0
rem/cli/commands/query.py +109 -0
rem/cli/commands/scaffold.py +47 -0
rem/cli/commands/schema.py +230 -0
rem/cli/commands/serve.py +106 -0
rem/cli/commands/session.py +453 -0
rem/cli/dreaming.py +363 -0
rem/cli/main.py +123 -0
rem/config.py +244 -0
rem/mcp_server.py +41 -0
rem/models/core/__init__.py +49 -0
rem/models/core/core_model.py +70 -0
rem/models/core/engram.py +333 -0
rem/models/core/experiment.py +672 -0
rem/models/core/inline_edge.py +132 -0
rem/models/core/rem_query.py +246 -0
rem/models/entities/__init__.py +68 -0
rem/models/entities/domain_resource.py +38 -0
rem/models/entities/feedback.py +123 -0
rem/models/entities/file.py +57 -0
rem/models/entities/image_resource.py +88 -0
rem/models/entities/message.py +64 -0
rem/models/entities/moment.py +123 -0
rem/models/entities/ontology.py +181 -0
rem/models/entities/ontology_config.py +131 -0
rem/models/entities/resource.py +95 -0
rem/models/entities/schema.py +87 -0
rem/models/entities/session.py +84 -0
rem/models/entities/shared_session.py +180 -0
rem/models/entities/subscriber.py +175 -0
rem/models/entities/user.py +93 -0
rem/py.typed +0 -0
rem/registry.py +373 -0
rem/schemas/README.md +507 -0
rem/schemas/__init__.py +6 -0
rem/schemas/agents/README.md +92 -0
rem/schemas/agents/core/agent-builder.yaml +235 -0
rem/schemas/agents/core/moment-builder.yaml +178 -0
rem/schemas/agents/core/rem-query-agent.yaml +226 -0
rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
rem/schemas/agents/core/simple-assistant.yaml +19 -0
rem/schemas/agents/core/user-profile-builder.yaml +163 -0
rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
rem/schemas/agents/examples/contract-extractor.yaml +134 -0
rem/schemas/agents/examples/cv-parser.yaml +263 -0
rem/schemas/agents/examples/hello-world.yaml +37 -0
rem/schemas/agents/examples/query.yaml +54 -0
rem/schemas/agents/examples/simple.yaml +21 -0
rem/schemas/agents/examples/test.yaml +29 -0
rem/schemas/agents/rem.yaml +132 -0
rem/schemas/evaluators/hello-world/default.yaml +77 -0
rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
rem/services/__init__.py +18 -0
rem/services/audio/INTEGRATION.md +308 -0
rem/services/audio/README.md +376 -0
rem/services/audio/__init__.py +15 -0
rem/services/audio/chunker.py +354 -0
rem/services/audio/transcriber.py +259 -0
rem/services/content/README.md +1269 -0
rem/services/content/__init__.py +5 -0
rem/services/content/providers.py +760 -0
rem/services/content/service.py +762 -0
rem/services/dreaming/README.md +230 -0
rem/services/dreaming/__init__.py +53 -0
rem/services/dreaming/affinity_service.py +322 -0
rem/services/dreaming/moment_service.py +251 -0
rem/services/dreaming/ontology_service.py +54 -0
rem/services/dreaming/user_model_service.py +297 -0
rem/services/dreaming/utils.py +39 -0
rem/services/email/__init__.py +10 -0
rem/services/email/service.py +522 -0
rem/services/email/templates.py +360 -0
rem/services/embeddings/__init__.py +11 -0
rem/services/embeddings/api.py +127 -0
rem/services/embeddings/worker.py +435 -0
rem/services/fs/README.md +662 -0
rem/services/fs/__init__.py +62 -0
rem/services/fs/examples.py +206 -0
rem/services/fs/examples_paths.py +204 -0
rem/services/fs/git_provider.py +935 -0
rem/services/fs/local_provider.py +760 -0
rem/services/fs/parsing-hooks-examples.md +172 -0
rem/services/fs/paths.py +276 -0
rem/services/fs/provider.py +460 -0
rem/services/fs/s3_provider.py +1042 -0
rem/services/fs/service.py +186 -0
rem/services/git/README.md +1075 -0
rem/services/git/__init__.py +17 -0
rem/services/git/service.py +469 -0
rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
rem/services/phoenix/README.md +453 -0
rem/services/phoenix/__init__.py +46 -0
rem/services/phoenix/client.py +960 -0
rem/services/phoenix/config.py +88 -0
rem/services/phoenix/prompt_labels.py +477 -0
rem/services/postgres/README.md +757 -0
rem/services/postgres/__init__.py +49 -0
rem/services/postgres/diff_service.py +599 -0
rem/services/postgres/migration_service.py +427 -0
rem/services/postgres/programmable_diff_service.py +635 -0
rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
rem/services/postgres/register_type.py +353 -0
rem/services/postgres/repository.py +481 -0
rem/services/postgres/schema_generator.py +661 -0
rem/services/postgres/service.py +802 -0
rem/services/postgres/sql_builder.py +355 -0
rem/services/rate_limit.py +113 -0
rem/services/rem/README.md +318 -0
rem/services/rem/__init__.py +23 -0
rem/services/rem/exceptions.py +71 -0
rem/services/rem/executor.py +293 -0
rem/services/rem/parser.py +180 -0
rem/services/rem/queries.py +196 -0
rem/services/rem/query.py +371 -0
rem/services/rem/service.py +608 -0
rem/services/session/README.md +374 -0
rem/services/session/__init__.py +13 -0
rem/services/session/compression.py +488 -0
rem/services/session/pydantic_messages.py +310 -0
rem/services/session/reload.py +85 -0
rem/services/user_service.py +130 -0
rem/settings.py +1877 -0
rem/sql/background_indexes.sql +52 -0
rem/sql/migrations/001_install.sql +983 -0
rem/sql/migrations/002_install_models.sql +3157 -0
rem/sql/migrations/003_optional_extensions.sql +326 -0
rem/sql/migrations/004_cache_system.sql +282 -0
rem/sql/migrations/005_schema_update.sql +145 -0
rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
rem/utils/AGENTIC_CHUNKING.md +597 -0
rem/utils/README.md +628 -0
rem/utils/__init__.py +61 -0
rem/utils/agentic_chunking.py +622 -0
rem/utils/batch_ops.py +343 -0
rem/utils/chunking.py +108 -0
rem/utils/clip_embeddings.py +276 -0
rem/utils/constants.py +97 -0
rem/utils/date_utils.py +228 -0
rem/utils/dict_utils.py +98 -0
rem/utils/embeddings.py +436 -0
rem/utils/examples/embeddings_example.py +305 -0
rem/utils/examples/sql_types_example.py +202 -0
rem/utils/files.py +323 -0
rem/utils/markdown.py +16 -0
rem/utils/mime_types.py +158 -0
rem/utils/model_helpers.py +492 -0
rem/utils/schema_loader.py +649 -0
rem/utils/sql_paths.py +146 -0
rem/utils/sql_types.py +350 -0
rem/utils/user_id.py +81 -0
rem/utils/vision.py +325 -0
rem/workers/README.md +506 -0
rem/workers/__init__.py +7 -0
rem/workers/db_listener.py +579 -0
rem/workers/db_maintainer.py +74 -0
rem/workers/dreaming.py +502 -0
rem/workers/engram_processor.py +312 -0
rem/workers/sqs_file_processor.py +193 -0
rem/workers/unlogged_maintainer.py +463 -0
remdb-0.3.242.dist-info/METADATA +1632 -0
remdb-0.3.242.dist-info/RECORD +235 -0
remdb-0.3.242.dist-info/WHEEL +4 -0
remdb-0.3.242.dist-info/entry_points.txt +2 -0

rem/utils/clip_embeddings.py ADDED Viewed

@@ -0,0 +1,276 @@
+"""
+CLIP embeddings utility using Jina AI API.
+Provides image and text embeddings using Jina CLIP models via API.
+Falls back gracefully when API key is not available.
+Future: Can be extended to support self-hosted CLIP models or other providers.
+"""
+import base64
+import os
+from pathlib import Path
+from typing import Optional
+import requests
+from loguru import logger
class CLIPEmbeddingResult:
    """Value object describing one embedding produced by a CLIP model.

    Attributes:
        embedding: The embedding vector.
        model: Name of the model that produced the vector.
        input_type: Kind of input that was embedded (image or text).
        tokens_used: Tokens consumed by the request (for cost tracking).
    """

    def __init__(
        self,
        embedding: list[float],
        model: str,
        input_type: str,
        tokens_used: int = 0,
    ):
        """
        Record the vector together with its provenance and token cost.

        Args:
            embedding: Vector embedding
            model: Model name used
            input_type: Type of input (image or text)
            tokens_used: Number of tokens consumed (defaults to 0)
        """
        self.tokens_used = tokens_used
        self.input_type = input_type
        self.model = model
        self.embedding = embedding

    @property
    def dimensions(self) -> int:
        """Number of components in the stored embedding vector."""
        vector = self.embedding
        return len(vector)

    def __repr__(self) -> str:
        # input_type is intentionally not part of the summary (matches the
        # established repr format).
        summary = f"model={self.model}, dims={self.dimensions}, tokens={self.tokens_used}"
        return f"CLIPEmbeddingResult({summary})"
+class JinaCLIPEmbedder:
+    """
+    CLIP embeddings using Jina AI API.
+    Supports:
+    - jina-clip-v1: 768-dimensional embeddings
+    - jina-clip-v2: 512-dimensional embeddings (default)
+    Pricing:
+    - ~$0.02 per million tokens
+    - Images: 4000 tokens per 512x512 tile (v2)
+    - Images: 1000 tokens per 224x224 tile (v1)
+    - Free tier: 10M tokens for new users
+    Future extensions:
+    - Self-hosted CLIP models
+    - OpenCLIP support
+    - Batch embedding support
+    """
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        model: str = "jina-clip-v2",
+    ):
+        """
+        Initialize Jina CLIP embedder.
+        Args:
+            api_key: Jina AI API key (from env if None)
+            model: CLIP model name (jina-clip-v1 or jina-clip-v2)
+        """
+        # Get API key from environment if not provided
+        # Check both CONTENT__JINA_API_KEY (preferred) and legacy JINA_API_KEY
+        if api_key is None:
+            api_key = os.getenv("CONTENT__JINA_API_KEY") or os.getenv("JINA_API_KEY")
+        self.api_key = api_key
+        self.model = model
+        self.api_url = "https://api.jina.ai/v1/embeddings"
+        # Warn if no API key
+        if not self.api_key:
+            logger.warning(
+                "No Jina API key found - CLIP embeddings will be disabled. "
+                "Set CONTENT__JINA_API_KEY or get a free key at https://jina.ai/embeddings/"
+            )
+    def is_available(self) -> bool:
+        """Check if Jina CLIP embeddings are available."""
+        return self.api_key is not None
+    def embed_image(
+        self,
+        image_path: str | Path,
+    ) -> Optional[CLIPEmbeddingResult]:
+        """
+        Generate CLIP embedding for an image.
+        Args:
+            image_path: Path to image file
+        Returns:
+            CLIPEmbeddingResult with embedding vector, or None if unavailable
+        Raises:
+            RuntimeError: If API request fails (when API key is available)
+        """
+        if not self.is_available():
+            logger.debug("Jina API key not available - skipping CLIP embedding")
+            return None
+        image_path = Path(image_path)
+        if not image_path.exists():
+            raise FileNotFoundError(f"Image file not found: {image_path}")
+        # Read and encode image to base64
+        with open(image_path, "rb") as f:
+            image_bytes = f.read()
+        image_b64 = base64.b64encode(image_bytes).decode("utf-8")
+        # Detect media type
+        suffix = image_path.suffix.lower()
+        media_type_map = {
+            ".png": "image/png",
+            ".jpg": "image/jpeg",
+            ".jpeg": "image/jpeg",
+            ".gif": "image/gif",
+            ".webp": "image/webp",
+        }
+        media_type = media_type_map.get(suffix, "image/png")
+        logger.debug(f"Generating CLIP embedding for {image_path.name} with {self.model}")
+        try:
+            # Build request
+            headers = {
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            }
+            # Jina API expects data URL format
+            data_url = f"data:{media_type};base64,{image_b64}"
+            body = {
+                "model": self.model,
+                "input": [data_url],
+                "input_type": "image",
+            }
+            response = requests.post(
+                self.api_url,
+                headers=headers,
+                json=body,
+                timeout=30.0,
+            )
+            if response.status_code != 200:
+                error_detail = response.text
+                logger.error(f"Jina API error: {response.status_code} - {error_detail}")
+                raise RuntimeError(f"CLIP embedding failed: {response.status_code} - {error_detail}")
+            result = response.json()
+            # Extract embedding and usage
+            embedding = result["data"][0]["embedding"]
+            tokens_used = result.get("usage", {}).get("total_tokens", 0)
+            logger.info(
+                f"✓ CLIP embedding generated: {len(embedding)} dims, {tokens_used} tokens"
+            )
+            return CLIPEmbeddingResult(
+                embedding=embedding,
+                model=self.model,
+                input_type="image",
+                tokens_used=tokens_used,
+            )
+        except requests.exceptions.Timeout:
+            logger.error("Jina API request timed out")
+            raise RuntimeError("CLIP embedding timed out after 30 seconds")
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Request error: {e}")
+            raise RuntimeError(f"CLIP embedding request failed: {e}")
+        except Exception as e:
+            logger.error(f"Unexpected error during CLIP embedding: {e}")
+            raise
+    def embed_text(
+        self,
+        text: str,
+    ) -> Optional[CLIPEmbeddingResult]:
+        """
+        Generate CLIP embedding for text.
+        Useful for text-to-image search in shared embedding space.
+        Args:
+            text: Text to embed
+        Returns:
+            CLIPEmbeddingResult with embedding vector, or None if unavailable
+        Raises:
+            RuntimeError: If API request fails (when API key is available)
+        """
+        if not self.is_available():
+            logger.debug("Jina API key not available - skipping CLIP embedding")
+            return None
+        logger.debug(f"Generating CLIP text embedding with {self.model}")
+        try:
+            # Build request
+            headers = {
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            }
+            body = {
+                "model": self.model,
+                "input": [text],
+                "input_type": "text",
+            }
+            response = requests.post(
+                self.api_url,
+                headers=headers,
+                json=body,
+                timeout=30.0,
+            )
+            if response.status_code != 200:
+                error_detail = response.text
+                logger.error(f"Jina API error: {response.status_code} - {error_detail}")
+                raise RuntimeError(f"CLIP embedding failed: {response.status_code} - {error_detail}")
+            result = response.json()
+            # Extract embedding and usage
+            embedding = result["data"][0]["embedding"]
+            tokens_used = result.get("usage", {}).get("total_tokens", 0)
+            logger.info(
+                f"✓ CLIP text embedding generated: {len(embedding)} dims, {tokens_used} tokens"
+            )
+            return CLIPEmbeddingResult(
+                embedding=embedding,
+                model=self.model,
+                input_type="text",
+                tokens_used=tokens_used,
+            )
+        except requests.exceptions.Timeout:
+            logger.error("Jina API request timed out")
+            raise RuntimeError("CLIP embedding timed out after 30 seconds")
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Request error: {e}")
+            raise RuntimeError(f"CLIP embedding request failed: {e}")
+        except Exception as e:
+            logger.error(f"Unexpected error during CLIP embedding: {e}")
+            raise

rem/utils/constants.py ADDED Viewed

@@ -0,0 +1,97 @@
+"""
+Centralized constants for the REM system.
+All magic numbers and commonly-used values should be defined here
+to ensure consistency and make tuning easier.
+"""
# -----------------------------------------------------------------------------
# Embedding models
# -----------------------------------------------------------------------------

# Vector sizes of the OpenAI embedding models
OPENAI_EMBEDDING_DIMS_SMALL = 1536  # text-embedding-3-small
OPENAI_EMBEDDING_DIMS_LARGE = 3072  # text-embedding-3-large
OPENAI_EMBEDDING_DIMS_ADA = 1536  # text-embedding-ada-002

# The default dimension is that of text-embedding-3-small
DEFAULT_EMBEDDING_DIMS = OPENAI_EMBEDDING_DIMS_SMALL

# Vector size of the Voyage AI model
VOYAGE_EMBEDDING_DIMS = 1024  # voyage-2

# -----------------------------------------------------------------------------
# HTTP / API timeouts (all values in seconds)
# -----------------------------------------------------------------------------

HTTP_TIMEOUT_DEFAULT = 30.0  # standard API calls
HTTP_TIMEOUT_LONG = 60.0  # vision / embedding APIs
HTTP_TIMEOUT_VERY_LONG = 300.0  # subprocess / batch operations

# Request timeout applied to the httpx AsyncClient
ASYNC_CLIENT_TIMEOUT = 300.0

# -----------------------------------------------------------------------------
# Audio processing
# -----------------------------------------------------------------------------

# Smallest valid WAV file: the header alone
WAV_HEADER_MIN_BYTES = 44

# Price of the OpenAI Whisper API, in USD per minute of audio
WHISPER_COST_PER_MINUTE = 0.006

# Chunking parameters
AUDIO_CHUNK_TARGET_SECONDS = 60.0  # desired chunk duration
AUDIO_CHUNK_WINDOW_SECONDS = 2.0  # window used for silence detection
SILENCE_THRESHOLD_DB = -40.0  # level treated as silence
MIN_SILENCE_MS = 500  # shortest silence worth splitting on

# -----------------------------------------------------------------------------
# File processing
# -----------------------------------------------------------------------------

# How long a document-parsing subprocess may run (5 minutes)
SUBPROCESS_TIMEOUT_SECONDS = 5 * 60

# Upper bound on audio file size
MAX_AUDIO_FILE_SIZE_MB = 25  # Whisper API limit

# -----------------------------------------------------------------------------
# Database / queries
# -----------------------------------------------------------------------------

# Batch sizes
DEFAULT_BATCH_SIZE = 100
EMBEDDING_BATCH_SIZE = 50

# Pagination limits
DEFAULT_PAGE_SIZE = 20
MAX_PAGE_SIZE = 100

# -----------------------------------------------------------------------------
# Rate limiting
# -----------------------------------------------------------------------------

# Retry defaults
DEFAULT_MAX_RETRIES = 3
RETRY_BACKOFF_MULTIPLIER = 1
RETRY_BACKOFF_MIN = 1
RETRY_BACKOFF_MAX = 60

# -----------------------------------------------------------------------------
# S3 / storage
# -----------------------------------------------------------------------------

S3_URI_PREFIX = "s3://"
FILE_URI_PREFIX = "file://"

# -----------------------------------------------------------------------------
# LLM
# -----------------------------------------------------------------------------

# Max tokens requested for vision analysis
VISION_MAX_TOKENS = 2048

# Sampling temperature used unless overridden
DEFAULT_TEMPERATURE = 0.0

rem/utils/date_utils.py ADDED Viewed

@@ -0,0 +1,228 @@
+"""
+Centralized datetime utilities for consistent UTC-naive datetime handling.
+IMPORTANT: REM uses UTC-naive datetimes throughout the codebase.
+PostgreSQL stores TIMESTAMP WITHOUT TIME ZONE, so all Python datetime
+operations should use UTC-naive datetimes to avoid comparison errors.
+Convention:
+- All timestamps are implicitly UTC
+- Use utc_now() instead of datetime.utcnow() or datetime.now(timezone.utc)
+- Use parse_iso() to parse ISO format strings (handles "Z" suffix)
+- Use to_iso() to format datetimes as ISO strings
+See CLAUDE.md Section 1 (Datetime Convention) for details.
+"""
+from datetime import UTC, datetime, timedelta
+from typing import Optional
+def utc_now() -> datetime:
+    """
+    Get current UTC time as a naive datetime.
+    Returns:
+        UTC-naive datetime representing current time.
+    Example:
+        >>> now = utc_now()
+        >>> now.tzinfo is None
+        True
+    """
+    return datetime.now(UTC).replace(tzinfo=None)
+def to_iso(dt: datetime) -> str:
+    """
+    Convert datetime to ISO 8601 format string.
+    Args:
+        dt: Datetime to format (should be UTC-naive)
+    Returns:
+        ISO format string (e.g., "2024-01-15T10:30:00")
+    Example:
+        >>> dt = datetime(2024, 1, 15, 10, 30, 0)
+        >>> to_iso(dt)
+        '2024-01-15T10:30:00'
+    """
+    return dt.isoformat()
+def to_iso_with_z(dt: datetime) -> str:
+    """
+    Convert datetime to ISO 8601 format with Z suffix.
+    Use this when interfacing with external APIs that expect
+    the Z suffix to indicate UTC.
+    Args:
+        dt: Datetime to format (should be UTC-naive)
+    Returns:
+        ISO format string with Z suffix (e.g., "2024-01-15T10:30:00Z")
+    """
+    return dt.isoformat() + "Z"
+def parse_iso(iso_string: str) -> datetime:
+    """
+    Parse ISO 8601 format string to UTC-naive datetime.
+    Handles:
+    - Standard ISO format: "2024-01-15T10:30:00"
+    - Z suffix: "2024-01-15T10:30:00Z"
+    - Timezone offset: "2024-01-15T10:30:00+00:00" (converts to naive)
+    - Microseconds: "2024-01-15T10:30:00.123456"
+    Args:
+        iso_string: ISO format datetime string
+    Returns:
+        UTC-naive datetime
+    Raises:
+        ValueError: If string cannot be parsed
+    Example:
+        >>> parse_iso("2024-01-15T10:30:00Z")
+        datetime.datetime(2024, 1, 15, 10, 30)
+        >>> parse_iso("2024-01-15T10:30:00+00:00")
+        datetime.datetime(2024, 1, 15, 10, 30)
+    """
+    # Handle Z suffix (replace with +00:00 for fromisoformat)
+    if iso_string.endswith("Z"):
+        iso_string = iso_string[:-1] + "+00:00"
+    # Parse the ISO string
+    dt = datetime.fromisoformat(iso_string)
+    # Convert to naive UTC if timezone-aware
+    if dt.tzinfo is not None:
+        # Convert to UTC and strip timezone
+        from datetime import timezone
+        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
+    return dt
+def parse_iso_safe(iso_string: Optional[str], default: Optional[datetime] = None) -> Optional[datetime]:
+    """
+    Safely parse ISO string, returning default on failure.
+    Args:
+        iso_string: ISO format string or None
+        default: Default value if parsing fails
+    Returns:
+        Parsed datetime or default value
+    """
+    if not iso_string:
+        return default
+    try:
+        return parse_iso(iso_string)
+    except (ValueError, TypeError):
+        return default
+def format_timestamp(dt: Optional[datetime] = None) -> str:
+    """
+    Format datetime for display/logging.
+    Args:
+        dt: Datetime to format (defaults to current UTC time)
+    Returns:
+        Formatted string like "2024-01-15 10:30:00 UTC"
+    """
+    if dt is None:
+        dt = utc_now()
+    return dt.strftime("%Y-%m-%d %H:%M:%S") + " UTC"
+def format_timestamp_compact(dt: Optional[datetime] = None) -> str:
+    """
+    Format datetime as compact string for filenames/IDs.
+    Args:
+        dt: Datetime to format (defaults to current UTC time)
+    Returns:
+        Formatted string like "20240115_103000"
+    """
+    if dt is None:
+        dt = utc_now()
+    return dt.strftime("%Y%m%d_%H%M%S")
+def format_timestamp_for_experiment(dt: Optional[datetime] = None) -> str:
+    """
+    Format datetime for experiment names.
+    Args:
+        dt: Datetime to format (defaults to current UTC time)
+    Returns:
+        Formatted string like "20240115-103000"
+    """
+    if dt is None:
+        dt = utc_now()
+    return dt.strftime("%Y%m%d-%H%M%S")
def days_ago(days: int) -> datetime:
    """
    Return the moment that lies ``days`` days before the current UTC time.

    Args:
        days: Number of days ago

    Returns:
        UTC-naive datetime
    """
    current = utc_now()
    return current - timedelta(days=days)
def hours_ago(hours: int) -> datetime:
    """
    Return the moment that lies ``hours`` hours before the current UTC time.

    Args:
        hours: Number of hours ago

    Returns:
        UTC-naive datetime
    """
    current = utc_now()
    return current - timedelta(hours=hours)
def is_within_hours(dt: datetime, hours: int) -> bool:
    """
    Tell whether ``dt`` is no older than ``hours`` hours.

    The test is ``dt >= (now - hours)``, so the cutoff itself counts as
    inside the window.

    Args:
        dt: Datetime to check (should be UTC-naive)
        hours: Number of hours

    Returns:
        True if dt is within the time window
    """
    return dt >= hours_ago(hours)
def is_within_days(dt: datetime, days: int) -> bool:
    """
    Tell whether ``dt`` is no older than ``days`` days.

    The test is ``dt >= (now - days)``, so the cutoff itself counts as
    inside the window.

    Args:
        dt: Datetime to check (should be UTC-naive)
        days: Number of days

    Returns:
        True if dt is within the time window
    """
    return dt >= days_ago(days)

rem/utils/dict_utils.py ADDED Viewed

@@ -0,0 +1,98 @@
+"""Dictionary utilities for nested access and field extraction.
+Utilities for working with nested dictionaries and extracting values
+for embeddings, serialization, etc.
+"""
+import json
+from typing import Any
def get_nested_value(data: dict[str, Any], path: str) -> Any:
    """Walk a nested dict/list structure along a dot-separated path.

    Dict levels are looked up by key; list levels are looked up by a
    non-negative integer index taken from the path segment.

    Args:
        data: Dictionary to traverse
        path: Dot-separated path (e.g., "candidate.name", "skills.0.proficiency")

    Returns:
        Value at the path, or None if not found

    Examples:
        >>> data = {"candidate": {"name": "John", "skills": [{"name": "Python"}]}}
        >>> get_nested_value(data, "candidate.name")
        'John'
        >>> get_nested_value(data, "candidate.skills.0.name")
        'Python'
        >>> get_nested_value(data, "candidate.missing")
        None
    """
    current: Any = data

    for segment in path.split("."):
        if isinstance(current, list):
            # List level: the segment must be an in-range, non-negative index.
            try:
                position = int(segment)
                in_range = 0 <= position < len(current)
                current = current[position] if in_range else None
            except (ValueError, TypeError):
                return None
        elif isinstance(current, dict):
            current = current.get(segment)
        else:
            # Scalars cannot be descended into.
            return None

        # A missing key, an out-of-range index, or a stored None ends the walk.
        if current is None:
            return None

    return current
def extract_fields_for_embedding(
    data: dict[str, Any],
    fields: list[str],
) -> str:
    """Build the text to embed from selected fields of a dict.

    Each field path is resolved with dot notation. Lists and dicts are
    JSON-serialized, other values are converted with ``str``, and fields
    that resolve to None are skipped. The pieces are joined with newlines.

    Args:
        data: Dictionary containing data to extract
        fields: List of field paths (supports dot notation)

    Returns:
        Concatenated text suitable for embedding; the whole dict as
        indented JSON when ``fields`` is empty

    Examples:
        >>> data = {
        ...     "name": "John Doe",
        ...     "skills": ["Python", "PostgreSQL"],
        ...     "experience": {"years": 5, "level": "senior"}
        ... }
        >>> extract_fields_for_embedding(data, ["name", "skills"])
        'John Doe\\n["Python", "PostgreSQL"]'
        >>> extract_fields_for_embedding(data, ["name", "experience.level"])
        'John Doe\\nsenior'
        >>> extract_fields_for_embedding(data, [])
        '{"name": "John Doe", ...}'  # Full JSON if no fields specified
    """
    # No field selection: embed the entire document.
    if not fields:
        return json.dumps(data, indent=2)

    def _as_text(item: Any) -> str:
        # Containers become JSON; everything else uses its str() form.
        return json.dumps(item) if isinstance(item, (list, dict)) else str(item)

    resolved = (get_nested_value(data, field) for field in fields)
    return "\n".join(_as_text(item) for item in resolved if item is not None)