remdb 0.3.7__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/__init__.py +129 -2
- rem/agentic/context.py +7 -5
- rem/agentic/providers/phoenix.py +32 -43
- rem/api/README.md +23 -0
- rem/api/main.py +27 -2
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/auth.py +54 -0
- rem/api/routers/chat/completions.py +1 -1
- rem/cli/commands/ask.py +13 -10
- rem/cli/commands/configure.py +4 -3
- rem/cli/commands/db.py +17 -3
- rem/cli/commands/experiments.py +76 -72
- rem/cli/commands/process.py +8 -7
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/main.py +2 -0
- rem/models/entities/user.py +10 -3
- rem/registry.py +367 -0
- rem/services/content/providers.py +92 -133
- rem/services/dreaming/affinity_service.py +2 -16
- rem/services/dreaming/moment_service.py +2 -15
- rem/services/embeddings/api.py +20 -13
- rem/services/phoenix/EXPERIMENT_DESIGN.md +3 -3
- rem/services/phoenix/client.py +148 -14
- rem/services/postgres/schema_generator.py +86 -5
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +14 -0
- rem/services/user_service.py +98 -0
- rem/settings.py +79 -10
- rem/sql/install_models.sql +13 -0
- rem/sql/migrations/003_seed_default_user.sql +48 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/embeddings.py +17 -4
- rem/utils/files.py +167 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/schema_loader.py +63 -14
- rem/utils/vision.py +9 -14
- rem/workers/README.md +14 -14
- rem/workers/db_maintainer.py +74 -0
- {remdb-0.3.7.dist-info → remdb-0.3.14.dist-info}/METADATA +169 -121
- {remdb-0.3.7.dist-info → remdb-0.3.14.dist-info}/RECORD +43 -32
- {remdb-0.3.7.dist-info → remdb-0.3.14.dist-info}/WHEEL +0 -0
- {remdb-0.3.7.dist-info → remdb-0.3.14.dist-info}/entry_points.txt +0 -0
rem/settings.py
CHANGED
|
@@ -57,8 +57,10 @@ Example .env file:
|
|
|
57
57
|
"""
|
|
58
58
|
|
|
59
59
|
import os
|
|
60
|
-
|
|
60
|
+
import hashlib
|
|
61
|
+
from pydantic import Field, field_validator, FieldValidationInfo
|
|
61
62
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
63
|
+
from loguru import logger
|
|
62
64
|
|
|
63
65
|
|
|
64
66
|
class LLMSettings(BaseSettings):
|
|
@@ -386,6 +388,22 @@ class AuthSettings(BaseSettings):
|
|
|
386
388
|
google: GoogleOAuthSettings = Field(default_factory=GoogleOAuthSettings)
|
|
387
389
|
microsoft: MicrosoftOAuthSettings = Field(default_factory=MicrosoftOAuthSettings)
|
|
388
390
|
|
|
391
|
+
@field_validator("session_secret", mode="before")
@classmethod
def generate_dev_secret(cls, v: str | None, info: FieldValidationInfo) -> str:
    """
    Supply a session secret when none was configured.

    A non-empty value is returned unchanged. Otherwise the result depends on
    the ``environment`` entry of ``info.data``: ``"production"`` raises, any
    other value yields a deterministic SHA-256 hex digest built from the
    ``team`` and ``environment`` entries (falling back to ``rem`` and
    ``development`` when they are absent).

    NOTE(review): whether ``environment`` and ``team`` are actually present
    in ``info.data`` depends on the fields declared on the enclosing class,
    which is not visible here - confirm.

    Raises:
        ValueError: If no secret is set and the environment is production.
    """
    # An explicitly configured secret always wins.
    if v:
        return v

    if info.data.get("environment") == "production":
        raise ValueError("AUTH__SESSION_SECRET must be set in production environment.")

    # Deterministic secret for development: same inputs, same digest.
    seed_string = f"{info.data.get('team', 'rem')}-{info.data.get('environment', 'development')}-auth-secret-salt"
    logger.warning(
        "AUTH__SESSION_SECRET not set. Generating deterministic secret for non-production environment. "
        "DO NOT use in production."
    )
    digest = hashlib.sha256(seed_string.encode())
    return digest.hexdigest()
|
|
406
|
+
|
|
389
407
|
|
|
390
408
|
class PostgresSettings(BaseSettings):
|
|
391
409
|
"""
|
|
@@ -962,6 +980,54 @@ class APISettings(BaseSettings):
|
|
|
962
980
|
)
|
|
963
981
|
|
|
964
982
|
|
|
983
|
+
class SchemaSettings(BaseSettings):
    """
    Search-path configuration for agent and evaluator schemas.

    Lets a deployment add its own schema directories to REM's lookup.
    Custom directories are consulted BEFORE the built-in package schemas.

    Environment variables:
        SCHEMA__PATHS - Semicolon-separated list of directories to search
                        Example: "/app/schemas;/shared/agents;./local-schemas"

    Search Order:
        1. Exact path (if file exists)
        2. Custom paths from SCHEMA__PATHS (in order)
        3. Built-in package schemas (schemas/agents/, schemas/evaluators/, etc.)
        4. Database LOOKUP (if enabled)

    Example:
        # In .env or environment
        SCHEMA__PATHS=/app/custom-agents;/shared/evaluators

        # Then in code
        from rem.utils.schema_loader import load_agent_schema
        schema = load_agent_schema("my-custom-agent")  # Found in /app/custom-agents/
    """

    model_config = SettingsConfigDict(env_prefix="SCHEMA__", extra="ignore")

    # Raw, unsplit value; use ``path_list`` for the parsed form.
    paths: str = Field(
        default="",
        description=(
            "Semicolon-separated list of directories to search for schemas. "
            "These paths are searched BEFORE built-in package schemas. "
            "Example: '/app/schemas;/shared/agents'"
        ),
    )

    @property
    def path_list(self) -> list[str]:
        """Return the configured directories as a list, dropping blank entries."""
        raw_value = self.paths or ""
        trimmed = (segment.strip() for segment in raw_value.split(";"))
        return [segment for segment in trimmed if segment]
|
|
1029
|
+
|
|
1030
|
+
|
|
965
1031
|
class GitSettings(BaseSettings):
|
|
966
1032
|
"""
|
|
967
1033
|
Git repository provider settings for versioned schema/experiment syncing.
|
|
@@ -1207,20 +1273,23 @@ class Settings(BaseSettings):
|
|
|
1207
1273
|
sqs: SQSSettings = Field(default_factory=SQSSettings)
|
|
1208
1274
|
chunking: ChunkingSettings = Field(default_factory=ChunkingSettings)
|
|
1209
1275
|
content: ContentSettings = Field(default_factory=ContentSettings)
|
|
1276
|
+
schema_search: SchemaSettings = Field(default_factory=SchemaSettings)
|
|
1210
1277
|
test: TestSettings = Field(default_factory=TestSettings)
|
|
1211
1278
|
|
|
1212
1279
|
|
|
1213
1280
|
# Load configuration from ~/.rem/config.yaml before initializing settings
|
|
1214
1281
|
# This allows user configuration to be merged with environment variables
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1282
|
+
# Set REM_SKIP_CONFIG_FILE=true to disable (useful for development with .env)
# The flag is compared case-insensitively; "true", "1" and "yes" all skip the
# config file. Any other value (or an unset variable) loads it.
if os.getenv("REM_SKIP_CONFIG_FILE", "").lower() not in ("true", "1", "yes"):
    try:
        from rem.config import load_config, merge_config_to_env

        _config = load_config()
        # Only merge when load_config() returned something truthy.
        if _config:
            merge_config_to_env(_config)
    except ImportError:
        # config module not available (e.g., during initial setup)
        pass
|
|
1224
1293
|
|
|
1225
1294
|
# Global settings singleton
|
|
1226
1295
|
settings = Settings()
|
rem/sql/install_models.sql
CHANGED
|
@@ -29,6 +29,18 @@ BEGIN
|
|
|
29
29
|
RAISE NOTICE 'Prerequisites check passed';
|
|
30
30
|
END $$;
|
|
31
31
|
|
|
32
|
+
-- ======================================================================
-- RATE LIMITS (Unlogged for performance)
-- ======================================================================
-- One row per rate-limit counter, keyed by a composite string.
-- expires_at gets its own index (created below).

CREATE UNLOGGED TABLE IF NOT EXISTS rate_limits (
    key        VARCHAR(255)             PRIMARY KEY,         -- e.g., "tenant_1:anon_abc:per_minute:TIMESTAMP"
    count      INTEGER                  NOT NULL DEFAULT 1,
    expires_at TIMESTAMP WITH TIME ZONE NOT NULL
);

CREATE INDEX IF NOT EXISTS idx_rate_limits_expires_at
    ON rate_limits (expires_at);
|
|
43
|
+
|
|
32
44
|
-- ======================================================================
|
|
33
45
|
-- USERS (Model: User)
|
|
34
46
|
-- ======================================================================
|
|
@@ -41,6 +53,7 @@ CREATE TABLE IF NOT EXISTS users (
|
|
|
41
53
|
email VARCHAR(256),
|
|
42
54
|
role VARCHAR(256),
|
|
43
55
|
tier TEXT,
|
|
56
|
+
anonymous_ids TEXT[] DEFAULT ARRAY[]::TEXT[],
|
|
44
57
|
sec_policy JSONB DEFAULT '{}'::jsonb,
|
|
45
58
|
summary TEXT,
|
|
46
59
|
interests TEXT[] DEFAULT ARRAY[]::TEXT[],
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
-- ============================================================================
-- Migration: 003_seed_default_user.sql
-- Description: Seed the default system user for CLI and API operations
--
-- The default user comes from settings.test.user_email (test@rem.ai) via
-- deterministic UUID v5 generation, so the user ID is the same in every
-- environment and test run.
--
-- Default user:
--   email:   test@rem.ai
--   user_id: 9e7dc22b-13bb-5cea-8aee-f6b8e6dc962f (UUID v5 from DNS namespace)
--
-- This user is used when:
--   - CLI commands run without --user-id flag
--   - API requests come without X-User-Id header
--   - Tests run without explicit user context
-- ============================================================================

-- Insert default user (idempotent - an existing row with this id is left as is)
INSERT INTO users (
    id, user_id, tenant_id,
    name, email, role,
    tags, metadata,
    created_at, updated_at
) VALUES (
    '9e7dc22b-13bb-5cea-8aee-f6b8e6dc962f'::uuid,  -- id
    '9e7dc22b-13bb-5cea-8aee-f6b8e6dc962f',        -- user_id
    '9e7dc22b-13bb-5cea-8aee-f6b8e6dc962f',        -- tenant_id
    'Default User',
    'test@rem.ai',
    'system',
    ARRAY['system', 'default'],
    '{"description": "Default system user for CLI and API operations without explicit user context"}'::jsonb,
    NOW(),
    NOW()
)
ON CONFLICT (id) DO NOTHING;

-- Log migration
DO $$
BEGIN
    RAISE NOTICE 'Seeded default user: test@rem.ai (id: 9e7dc22b-13bb-5cea-8aee-f6b8e6dc962f)';
END $$;
|
rem/utils/constants.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Centralized constants for the REM system.
|
|
3
|
+
|
|
4
|
+
All magic numbers and commonly-used values should be defined here
|
|
5
|
+
to ensure consistency and make tuning easier.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# =============================================================================
# Embedding Model Constants
# =============================================================================

# OpenAI embedding dimensions by model
OPENAI_EMBEDDING_DIMS_SMALL = 1536  # text-embedding-3-small
OPENAI_EMBEDDING_DIMS_LARGE = 3072  # text-embedding-3-large
OPENAI_EMBEDDING_DIMS_ADA = 1536  # text-embedding-ada-002

# Default embedding dimension (text-embedding-3-small).
# Derived from the named constant so the two cannot drift apart.
DEFAULT_EMBEDDING_DIMS = OPENAI_EMBEDDING_DIMS_SMALL

# Voyage AI embedding dimensions
VOYAGE_EMBEDDING_DIMS = 1024  # voyage-2

# =============================================================================
# HTTP/API Timeouts (seconds)
# =============================================================================

HTTP_TIMEOUT_DEFAULT = 30.0  # Standard API calls
HTTP_TIMEOUT_LONG = 60.0  # Vision/embedding APIs
HTTP_TIMEOUT_VERY_LONG = 300.0  # Subprocess/batch operations

# Request timeout for httpx AsyncClient
ASYNC_CLIENT_TIMEOUT = 300.0

# =============================================================================
# Audio Processing Constants
# =============================================================================

# Minimum valid WAV file size (header only)
WAV_HEADER_MIN_BYTES = 44

# OpenAI Whisper API cost per minute (USD)
WHISPER_COST_PER_MINUTE = 0.006

# Audio chunking parameters
AUDIO_CHUNK_TARGET_SECONDS = 60.0  # Target chunk duration
AUDIO_CHUNK_WINDOW_SECONDS = 2.0  # Window for silence detection
SILENCE_THRESHOLD_DB = -40.0  # Silence detection threshold
MIN_SILENCE_MS = 500  # Minimum silence duration to split on

# =============================================================================
# File Processing Constants
# =============================================================================

# Subprocess timeout for document parsing
SUBPROCESS_TIMEOUT_SECONDS = 300  # 5 minutes

# Maximum file sizes
MAX_AUDIO_FILE_SIZE_MB = 25  # Whisper API limit

# =============================================================================
# Database/Query Constants
# =============================================================================

# Default batch sizes
DEFAULT_BATCH_SIZE = 100
EMBEDDING_BATCH_SIZE = 50

# Default pagination limits
DEFAULT_PAGE_SIZE = 20
MAX_PAGE_SIZE = 100

# =============================================================================
# Rate Limiting (retry / backoff)
# =============================================================================

# Default retry settings
DEFAULT_MAX_RETRIES = 3
RETRY_BACKOFF_MULTIPLIER = 1
RETRY_BACKOFF_MIN = 1
RETRY_BACKOFF_MAX = 60

# =============================================================================
# S3/Storage Constants
# =============================================================================

S3_URI_PREFIX = "s3://"
FILE_URI_PREFIX = "file://"

# =============================================================================
# LLM Constants
# =============================================================================

# Default max tokens for vision analysis
VISION_MAX_TOKENS = 2048

# Default temperature
DEFAULT_TEMPERATURE = 0.0
|
rem/utils/date_utils.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Centralized datetime utilities for consistent UTC-naive datetime handling.
|
|
3
|
+
|
|
4
|
+
IMPORTANT: REM uses UTC-naive datetimes throughout the codebase.
|
|
5
|
+
PostgreSQL stores TIMESTAMP WITHOUT TIME ZONE, so all Python datetime
|
|
6
|
+
operations should use UTC-naive datetimes to avoid comparison errors.
|
|
7
|
+
|
|
8
|
+
Convention:
|
|
9
|
+
- All timestamps are implicitly UTC
|
|
10
|
+
- Use utc_now() instead of datetime.utcnow() or datetime.now(timezone.utc)
|
|
11
|
+
- Use parse_iso() to parse ISO format strings (handles "Z" suffix)
|
|
12
|
+
- Use to_iso() to format datetimes as ISO strings
|
|
13
|
+
|
|
14
|
+
See CLAUDE.md Section 1 (Datetime Convention) for details.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from datetime import datetime, timedelta
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def utc_now() -> datetime:
    """
    Get current UTC time as a naive datetime.

    Uses an aware "now" and strips the tzinfo, rather than calling
    ``datetime.utcnow()``, which is deprecated since Python 3.12.

    Returns:
        UTC-naive datetime representing current time.

    Example:
        >>> now = utc_now()
        >>> now.tzinfo is None
        True
    """
    # Local import mirrors parse_iso(), which imports timezone the same way.
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def to_iso(dt: datetime) -> str:
    """
    Render a datetime as an ISO 8601 string.

    The value is passed straight to ``isoformat()``; no timezone
    normalization happens here.

    Args:
        dt: Datetime to format (should be UTC-naive)

    Returns:
        ISO format string (e.g., "2024-01-15T10:30:00")

    Example:
        >>> to_iso(datetime(2024, 1, 15, 10, 30, 0))
        '2024-01-15T10:30:00'
    """
    iso_text = dt.isoformat()
    return iso_text
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def to_iso_with_z(dt: datetime) -> str:
    """
    Convert datetime to ISO 8601 format with Z suffix.

    Use this when interfacing with external APIs that expect
    the Z suffix to indicate UTC.

    A timezone-aware input is first converted to UTC and made naive.
    Without that step ``isoformat()`` would emit its own offset and the
    result would carry two designators (e.g. "...+02:00Z").

    Args:
        dt: Datetime to format (should be UTC-naive; aware values are
            converted to UTC)

    Returns:
        ISO format string with Z suffix (e.g., "2024-01-15T10:30:00Z")

    Example:
        >>> to_iso_with_z(datetime(2024, 1, 15, 10, 30, 0))
        '2024-01-15T10:30:00Z'
    """
    if dt.tzinfo is not None:
        # Local import mirrors parse_iso(), which imports timezone the same way.
        from datetime import timezone

        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)

    return dt.isoformat() + "Z"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def parse_iso(iso_string: str) -> datetime:
    """
    Parse ISO 8601 format string to UTC-naive datetime.

    Handles:
    - Standard ISO format: "2024-01-15T10:30:00"
    - Z suffix, upper or lower case: "2024-01-15T10:30:00Z"
    - Timezone offset: "2024-01-15T10:30:00+00:00" (converts to naive)
    - Microseconds: "2024-01-15T10:30:00.123456"

    Args:
        iso_string: ISO format datetime string

    Returns:
        UTC-naive datetime

    Raises:
        ValueError: If string cannot be parsed

    Example:
        >>> parse_iso("2024-01-15T10:30:00Z")
        datetime.datetime(2024, 1, 15, 10, 30)
        >>> parse_iso("2024-01-15T10:30:00+00:00")
        datetime.datetime(2024, 1, 15, 10, 30)
    """
    # Rewrite the UTC designator as an explicit offset so fromisoformat()
    # accepts it on every supported Python version. RFC 3339 also permits a
    # lowercase "z", so both spellings are normalized.
    if iso_string.endswith(("Z", "z")):
        iso_string = iso_string[:-1] + "+00:00"

    # Parse the ISO string
    dt = datetime.fromisoformat(iso_string)

    # Convert to naive UTC if timezone-aware
    if dt.tzinfo is not None:
        from datetime import timezone

        # Shift to UTC first so the wall-clock value is correct, then drop tzinfo.
        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)

    return dt
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def parse_iso_safe(iso_string: Optional[str], default: Optional[datetime] = None) -> Optional[datetime]:
    """
    Safely parse ISO string, returning default on failure.

    Args:
        iso_string: ISO format string or None
        default: Default value if parsing fails

    Returns:
        Parsed datetime or default value
    """
    # None and "" are treated as "nothing to parse".
    if not iso_string:
        return default
    try:
        return parse_iso(iso_string)
    except (ValueError, TypeError, AttributeError):
        # AttributeError covers truthy non-string inputs (e.g. an int or a
        # datetime), which fail on the string methods parse_iso() calls.
        return default
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def format_timestamp(dt: Optional[datetime] = None) -> str:
    """
    Render a datetime as a human-readable string for display or logs.

    The " UTC" label is appended unconditionally; the value itself is not
    converted.

    Args:
        dt: Datetime to format (defaults to current UTC time)

    Returns:
        Formatted string like "2024-01-15 10:30:00 UTC"
    """
    moment = utc_now() if dt is None else dt
    rendered = moment.strftime("%Y-%m-%d %H:%M:%S")
    return rendered + " UTC"
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def format_timestamp_compact(dt: Optional[datetime] = None) -> str:
    """
    Render a datetime as a compact, separator-light string for filenames/IDs.

    Args:
        dt: Datetime to format (defaults to current UTC time)

    Returns:
        Formatted string like "20240115_103000"
    """
    moment = utc_now() if dt is None else dt
    return moment.strftime("%Y%m%d_%H%M%S")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def format_timestamp_for_experiment(dt: Optional[datetime] = None) -> str:
    """
    Render a datetime in the hyphenated form used for experiment names.

    Args:
        dt: Datetime to format (defaults to current UTC time)

    Returns:
        Formatted string like "20240115-103000"
    """
    moment = utc_now() if dt is None else dt
    return moment.strftime("%Y%m%d-%H%M%S")
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def days_ago(days: int) -> datetime:
    """
    Return the moment exactly ``days`` days before the current UTC time.

    Args:
        days: Number of days ago

    Returns:
        UTC-naive datetime
    """
    current = utc_now()
    offset = timedelta(days=days)
    return current - offset
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def hours_ago(hours: int) -> datetime:
    """
    Return the moment exactly ``hours`` hours before the current UTC time.

    Args:
        hours: Number of hours ago

    Returns:
        UTC-naive datetime
    """
    current = utc_now()
    offset = timedelta(hours=hours)
    return current - offset
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def is_within_hours(dt: datetime, hours: int) -> bool:
    """
    Report whether ``dt`` is no older than ``hours`` hours.

    Only the lower bound is checked, so a timestamp later than the current
    time also returns True.

    Args:
        dt: Datetime to check (should be UTC-naive)
        hours: Number of hours

    Returns:
        True if dt is at or after the cutoff (now minus ``hours`` hours)
    """
    oldest_allowed = hours_ago(hours)
    return oldest_allowed <= dt
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def is_within_days(dt: datetime, days: int) -> bool:
    """
    Report whether ``dt`` is no older than ``days`` days.

    Only the lower bound is checked, so a timestamp later than the current
    time also returns True.

    Args:
        dt: Datetime to check (should be UTC-naive)
        days: Number of days

    Returns:
        True if dt is at or after the cutoff (now minus ``days`` days)
    """
    oldest_allowed = days_ago(days)
    return oldest_allowed <= dt
|
rem/utils/embeddings.py
CHANGED
|
@@ -20,7 +20,6 @@ Usage:
|
|
|
20
20
|
embeddings = generate_embeddings("openai:text-embedding-3-small", texts)
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
|
-
import os
|
|
24
23
|
from typing import Any, cast
|
|
25
24
|
|
|
26
25
|
import requests
|
|
@@ -31,6 +30,16 @@ from tenacity import (
|
|
|
31
30
|
wait_exponential,
|
|
32
31
|
)
|
|
33
32
|
|
|
33
|
+
from rem.utils.constants import (
|
|
34
|
+
HTTP_TIMEOUT_LONG,
|
|
35
|
+
OPENAI_EMBEDDING_DIMS_SMALL,
|
|
36
|
+
OPENAI_EMBEDDING_DIMS_LARGE,
|
|
37
|
+
VOYAGE_EMBEDDING_DIMS,
|
|
38
|
+
RETRY_BACKOFF_MULTIPLIER,
|
|
39
|
+
RETRY_BACKOFF_MIN,
|
|
40
|
+
RETRY_BACKOFF_MAX,
|
|
41
|
+
)
|
|
42
|
+
|
|
34
43
|
|
|
35
44
|
class EmbeddingError(Exception):
|
|
36
45
|
"""Base exception for embedding generation errors."""
|
|
@@ -166,7 +175,11 @@ def _create_retry_decorator(max_retries: int):
|
|
|
166
175
|
return retry(
|
|
167
176
|
retry=retry_if_exception_type(RateLimitError),
|
|
168
177
|
stop=stop_after_attempt(max_retries),
|
|
169
|
-
wait=wait_exponential(
|
|
178
|
+
wait=wait_exponential(
|
|
179
|
+
multiplier=RETRY_BACKOFF_MULTIPLIER,
|
|
180
|
+
min=RETRY_BACKOFF_MIN,
|
|
181
|
+
max=RETRY_BACKOFF_MAX,
|
|
182
|
+
),
|
|
170
183
|
reraise=True,
|
|
171
184
|
)
|
|
172
185
|
|
|
@@ -234,7 +247,7 @@ def _generate_openai_embeddings(
|
|
|
234
247
|
}
|
|
235
248
|
|
|
236
249
|
try:
|
|
237
|
-
response = requests.post(url, json=payload, headers=headers, timeout=
|
|
250
|
+
response = requests.post(url, json=payload, headers=headers, timeout=HTTP_TIMEOUT_LONG)
|
|
238
251
|
|
|
239
252
|
# Handle rate limits
|
|
240
253
|
if response.status_code == 429:
|
|
@@ -334,7 +347,7 @@ def _generate_voyage_embeddings(
|
|
|
334
347
|
}
|
|
335
348
|
|
|
336
349
|
try:
|
|
337
|
-
response = requests.post(url, json=payload, headers=headers, timeout=
|
|
350
|
+
response = requests.post(url, json=payload, headers=headers, timeout=HTTP_TIMEOUT_LONG)
|
|
338
351
|
|
|
339
352
|
# Handle rate limits
|
|
340
353
|
if response.status_code == 429:
|