remdb 0.3.242__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (235) hide show
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,276 @@
1
+ """
2
+ CLIP embeddings utility using Jina AI API.
3
+
4
+ Provides image and text embeddings using Jina CLIP models via API.
5
+ Falls back gracefully when API key is not available.
6
+
7
+ Future: Can be extended to support self-hosted CLIP models or other providers.
8
+ """
9
+
10
+ import base64
11
+ import os
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+ import requests
16
+ from loguru import logger
17
+
18
+
19
class CLIPEmbeddingResult:
    """Value holder for one CLIP embedding returned by the embedder."""

    def __init__(
        self,
        embedding: list[float],
        model: str,
        input_type: str,
        tokens_used: int = 0,
    ):
        """
        Store the embedding vector together with its provenance.

        Args:
            embedding: Vector embedding (512 or 768 dimensions)
            model: Name of the model that produced the vector
            input_type: Kind of input that was embedded (image or text)
            tokens_used: Tokens billed for the request (for cost tracking)
        """
        self.tokens_used = tokens_used
        self.input_type = input_type
        self.model = model
        self.embedding = embedding

    @property
    def dimensions(self) -> int:
        """Length of the embedding vector."""
        vector = self.embedding
        return len(vector)

    def __repr__(self) -> str:
        # Only summary fields are shown; the vector itself is too long to print.
        summary = f"model={self.model}, dims={self.dimensions}, tokens={self.tokens_used}"
        return f"CLIPEmbeddingResult({summary})"
50
+
51
+
52
+ class JinaCLIPEmbedder:
53
+ """
54
+ CLIP embeddings using Jina AI API.
55
+
56
+ Supports:
57
+ - jina-clip-v1: 768-dimensional embeddings
58
+ - jina-clip-v2: 512-dimensional embeddings (default)
59
+
60
+ Pricing:
61
+ - ~$0.02 per million tokens
62
+ - Images: 4000 tokens per 512x512 tile (v2)
63
+ - Images: 1000 tokens per 224x224 tile (v1)
64
+ - Free tier: 10M tokens for new users
65
+
66
+ Future extensions:
67
+ - Self-hosted CLIP models
68
+ - OpenCLIP support
69
+ - Batch embedding support
70
+ """
71
+
72
+ def __init__(
73
+ self,
74
+ api_key: Optional[str] = None,
75
+ model: str = "jina-clip-v2",
76
+ ):
77
+ """
78
+ Initialize Jina CLIP embedder.
79
+
80
+ Args:
81
+ api_key: Jina AI API key (from env if None)
82
+ model: CLIP model name (jina-clip-v1 or jina-clip-v2)
83
+ """
84
+ # Get API key from environment if not provided
85
+ # Check both CONTENT__JINA_API_KEY (preferred) and legacy JINA_API_KEY
86
+ if api_key is None:
87
+ api_key = os.getenv("CONTENT__JINA_API_KEY") or os.getenv("JINA_API_KEY")
88
+
89
+ self.api_key = api_key
90
+ self.model = model
91
+ self.api_url = "https://api.jina.ai/v1/embeddings"
92
+
93
+ # Warn if no API key
94
+ if not self.api_key:
95
+ logger.warning(
96
+ "No Jina API key found - CLIP embeddings will be disabled. "
97
+ "Set CONTENT__JINA_API_KEY or get a free key at https://jina.ai/embeddings/"
98
+ )
99
+
100
+ def is_available(self) -> bool:
101
+ """Check if Jina CLIP embeddings are available."""
102
+ return self.api_key is not None
103
+
104
+ def embed_image(
105
+ self,
106
+ image_path: str | Path,
107
+ ) -> Optional[CLIPEmbeddingResult]:
108
+ """
109
+ Generate CLIP embedding for an image.
110
+
111
+ Args:
112
+ image_path: Path to image file
113
+
114
+ Returns:
115
+ CLIPEmbeddingResult with embedding vector, or None if unavailable
116
+
117
+ Raises:
118
+ RuntimeError: If API request fails (when API key is available)
119
+ """
120
+ if not self.is_available():
121
+ logger.debug("Jina API key not available - skipping CLIP embedding")
122
+ return None
123
+
124
+ image_path = Path(image_path)
125
+ if not image_path.exists():
126
+ raise FileNotFoundError(f"Image file not found: {image_path}")
127
+
128
+ # Read and encode image to base64
129
+ with open(image_path, "rb") as f:
130
+ image_bytes = f.read()
131
+
132
+ image_b64 = base64.b64encode(image_bytes).decode("utf-8")
133
+
134
+ # Detect media type
135
+ suffix = image_path.suffix.lower()
136
+ media_type_map = {
137
+ ".png": "image/png",
138
+ ".jpg": "image/jpeg",
139
+ ".jpeg": "image/jpeg",
140
+ ".gif": "image/gif",
141
+ ".webp": "image/webp",
142
+ }
143
+ media_type = media_type_map.get(suffix, "image/png")
144
+
145
+ logger.debug(f"Generating CLIP embedding for {image_path.name} with {self.model}")
146
+
147
+ try:
148
+ # Build request
149
+ headers = {
150
+ "Authorization": f"Bearer {self.api_key}",
151
+ "Content-Type": "application/json",
152
+ }
153
+
154
+ # Jina API expects data URL format
155
+ data_url = f"data:{media_type};base64,{image_b64}"
156
+
157
+ body = {
158
+ "model": self.model,
159
+ "input": [data_url],
160
+ "input_type": "image",
161
+ }
162
+
163
+ response = requests.post(
164
+ self.api_url,
165
+ headers=headers,
166
+ json=body,
167
+ timeout=30.0,
168
+ )
169
+
170
+ if response.status_code != 200:
171
+ error_detail = response.text
172
+ logger.error(f"Jina API error: {response.status_code} - {error_detail}")
173
+ raise RuntimeError(f"CLIP embedding failed: {response.status_code} - {error_detail}")
174
+
175
+ result = response.json()
176
+
177
+ # Extract embedding and usage
178
+ embedding = result["data"][0]["embedding"]
179
+ tokens_used = result.get("usage", {}).get("total_tokens", 0)
180
+
181
+ logger.info(
182
+ f"✓ CLIP embedding generated: {len(embedding)} dims, {tokens_used} tokens"
183
+ )
184
+
185
+ return CLIPEmbeddingResult(
186
+ embedding=embedding,
187
+ model=self.model,
188
+ input_type="image",
189
+ tokens_used=tokens_used,
190
+ )
191
+
192
+ except requests.exceptions.Timeout:
193
+ logger.error("Jina API request timed out")
194
+ raise RuntimeError("CLIP embedding timed out after 30 seconds")
195
+ except requests.exceptions.RequestException as e:
196
+ logger.error(f"Request error: {e}")
197
+ raise RuntimeError(f"CLIP embedding request failed: {e}")
198
+ except Exception as e:
199
+ logger.error(f"Unexpected error during CLIP embedding: {e}")
200
+ raise
201
+
202
+ def embed_text(
203
+ self,
204
+ text: str,
205
+ ) -> Optional[CLIPEmbeddingResult]:
206
+ """
207
+ Generate CLIP embedding for text.
208
+
209
+ Useful for text-to-image search in shared embedding space.
210
+
211
+ Args:
212
+ text: Text to embed
213
+
214
+ Returns:
215
+ CLIPEmbeddingResult with embedding vector, or None if unavailable
216
+
217
+ Raises:
218
+ RuntimeError: If API request fails (when API key is available)
219
+ """
220
+ if not self.is_available():
221
+ logger.debug("Jina API key not available - skipping CLIP embedding")
222
+ return None
223
+
224
+ logger.debug(f"Generating CLIP text embedding with {self.model}")
225
+
226
+ try:
227
+ # Build request
228
+ headers = {
229
+ "Authorization": f"Bearer {self.api_key}",
230
+ "Content-Type": "application/json",
231
+ }
232
+
233
+ body = {
234
+ "model": self.model,
235
+ "input": [text],
236
+ "input_type": "text",
237
+ }
238
+
239
+ response = requests.post(
240
+ self.api_url,
241
+ headers=headers,
242
+ json=body,
243
+ timeout=30.0,
244
+ )
245
+
246
+ if response.status_code != 200:
247
+ error_detail = response.text
248
+ logger.error(f"Jina API error: {response.status_code} - {error_detail}")
249
+ raise RuntimeError(f"CLIP embedding failed: {response.status_code} - {error_detail}")
250
+
251
+ result = response.json()
252
+
253
+ # Extract embedding and usage
254
+ embedding = result["data"][0]["embedding"]
255
+ tokens_used = result.get("usage", {}).get("total_tokens", 0)
256
+
257
+ logger.info(
258
+ f"✓ CLIP text embedding generated: {len(embedding)} dims, {tokens_used} tokens"
259
+ )
260
+
261
+ return CLIPEmbeddingResult(
262
+ embedding=embedding,
263
+ model=self.model,
264
+ input_type="text",
265
+ tokens_used=tokens_used,
266
+ )
267
+
268
+ except requests.exceptions.Timeout:
269
+ logger.error("Jina API request timed out")
270
+ raise RuntimeError("CLIP embedding timed out after 30 seconds")
271
+ except requests.exceptions.RequestException as e:
272
+ logger.error(f"Request error: {e}")
273
+ raise RuntimeError(f"CLIP embedding request failed: {e}")
274
+ except Exception as e:
275
+ logger.error(f"Unexpected error during CLIP embedding: {e}")
276
+ raise
rem/utils/constants.py ADDED
@@ -0,0 +1,97 @@
1
+ """
2
+ Centralized constants for the REM system.
3
+
4
+ All magic numbers and commonly-used values should be defined here
5
+ to ensure consistency and make tuning easier.
6
+ """
7
+
8
# =============================================================================
# Embedding Model Constants
# =============================================================================

# Vector sizes of the OpenAI embedding models.
OPENAI_EMBEDDING_DIMS_SMALL = 1536  # text-embedding-3-small
OPENAI_EMBEDDING_DIMS_LARGE = 3072  # text-embedding-3-large
OPENAI_EMBEDDING_DIMS_ADA = 1536  # text-embedding-ada-002

# Default embedding dimension: follows text-embedding-3-small.
DEFAULT_EMBEDDING_DIMS = OPENAI_EMBEDDING_DIMS_SMALL

# Vector size of the Voyage AI embedding model.
VOYAGE_EMBEDDING_DIMS = 1024  # voyage-2

# =============================================================================
# HTTP/API Timeouts (seconds)
# =============================================================================

HTTP_TIMEOUT_DEFAULT = 30.0  # Standard API calls
HTTP_TIMEOUT_LONG = 60.0  # Vision/embedding APIs
HTTP_TIMEOUT_VERY_LONG = 300.0  # Subprocess/batch operations

# Request timeout for httpx AsyncClient
ASYNC_CLIENT_TIMEOUT = 300.0

# =============================================================================
# Audio Processing Constants
# =============================================================================

# Minimum valid WAV file size (header only)
WAV_HEADER_MIN_BYTES = 44

# OpenAI Whisper API cost per minute (USD)
WHISPER_COST_PER_MINUTE = 0.006

# Audio chunking parameters
AUDIO_CHUNK_TARGET_SECONDS = 60.0  # Target chunk duration
AUDIO_CHUNK_WINDOW_SECONDS = 2.0  # Window for silence detection
SILENCE_THRESHOLD_DB = -40.0  # Silence detection threshold
MIN_SILENCE_MS = 500  # Minimum silence duration to split on

# =============================================================================
# File Processing Constants
# =============================================================================

# Subprocess timeout for document parsing (5 minutes)
SUBPROCESS_TIMEOUT_SECONDS = 5 * 60

# Maximum file sizes
MAX_AUDIO_FILE_SIZE_MB = 25  # Whisper API limit

# =============================================================================
# Database/Query Constants
# =============================================================================

# Default batch sizes
DEFAULT_BATCH_SIZE = 100
EMBEDDING_BATCH_SIZE = 50

# Default pagination limits
DEFAULT_PAGE_SIZE = 20
MAX_PAGE_SIZE = 100

# =============================================================================
# Rate Limiting
# =============================================================================

# Default retry settings
DEFAULT_MAX_RETRIES = 3
RETRY_BACKOFF_MULTIPLIER = 1
RETRY_BACKOFF_MIN = 1
RETRY_BACKOFF_MAX = 60

# =============================================================================
# S3/Storage Constants
# =============================================================================

S3_URI_PREFIX = "s3://"
FILE_URI_PREFIX = "file://"

# =============================================================================
# LLM Constants
# =============================================================================

# Default max tokens for vision analysis
VISION_MAX_TOKENS = 2048

# Default temperature
DEFAULT_TEMPERATURE = 0.0
@@ -0,0 +1,228 @@
1
+ """
2
+ Centralized datetime utilities for consistent UTC-naive datetime handling.
3
+
4
+ IMPORTANT: REM uses UTC-naive datetimes throughout the codebase.
5
+ PostgreSQL stores TIMESTAMP WITHOUT TIME ZONE, so all Python datetime
6
+ operations should use UTC-naive datetimes to avoid comparison errors.
7
+
8
+ Convention:
9
+ - All timestamps are implicitly UTC
10
+ - Use utc_now() instead of datetime.utcnow() or datetime.now(timezone.utc)
11
+ - Use parse_iso() to parse ISO format strings (handles "Z" suffix)
12
+ - Use to_iso() to format datetimes as ISO strings
13
+
14
+ See CLAUDE.md Section 1 (Datetime Convention) for details.
15
+ """
16
+
17
+ from datetime import UTC, datetime, timedelta
18
+ from typing import Optional
19
+
20
+
21
+ def utc_now() -> datetime:
22
+ """
23
+ Get current UTC time as a naive datetime.
24
+
25
+ Returns:
26
+ UTC-naive datetime representing current time.
27
+
28
+ Example:
29
+ >>> now = utc_now()
30
+ >>> now.tzinfo is None
31
+ True
32
+ """
33
+ return datetime.now(UTC).replace(tzinfo=None)
34
+
35
+
36
+ def to_iso(dt: datetime) -> str:
37
+ """
38
+ Convert datetime to ISO 8601 format string.
39
+
40
+ Args:
41
+ dt: Datetime to format (should be UTC-naive)
42
+
43
+ Returns:
44
+ ISO format string (e.g., "2024-01-15T10:30:00")
45
+
46
+ Example:
47
+ >>> dt = datetime(2024, 1, 15, 10, 30, 0)
48
+ >>> to_iso(dt)
49
+ '2024-01-15T10:30:00'
50
+ """
51
+ return dt.isoformat()
52
+
53
+
54
+ def to_iso_with_z(dt: datetime) -> str:
55
+ """
56
+ Convert datetime to ISO 8601 format with Z suffix.
57
+
58
+ Use this when interfacing with external APIs that expect
59
+ the Z suffix to indicate UTC.
60
+
61
+ Args:
62
+ dt: Datetime to format (should be UTC-naive)
63
+
64
+ Returns:
65
+ ISO format string with Z suffix (e.g., "2024-01-15T10:30:00Z")
66
+ """
67
+ return dt.isoformat() + "Z"
68
+
69
+
70
+ def parse_iso(iso_string: str) -> datetime:
71
+ """
72
+ Parse ISO 8601 format string to UTC-naive datetime.
73
+
74
+ Handles:
75
+ - Standard ISO format: "2024-01-15T10:30:00"
76
+ - Z suffix: "2024-01-15T10:30:00Z"
77
+ - Timezone offset: "2024-01-15T10:30:00+00:00" (converts to naive)
78
+ - Microseconds: "2024-01-15T10:30:00.123456"
79
+
80
+ Args:
81
+ iso_string: ISO format datetime string
82
+
83
+ Returns:
84
+ UTC-naive datetime
85
+
86
+ Raises:
87
+ ValueError: If string cannot be parsed
88
+
89
+ Example:
90
+ >>> parse_iso("2024-01-15T10:30:00Z")
91
+ datetime.datetime(2024, 1, 15, 10, 30)
92
+ >>> parse_iso("2024-01-15T10:30:00+00:00")
93
+ datetime.datetime(2024, 1, 15, 10, 30)
94
+ """
95
+ # Handle Z suffix (replace with +00:00 for fromisoformat)
96
+ if iso_string.endswith("Z"):
97
+ iso_string = iso_string[:-1] + "+00:00"
98
+
99
+ # Parse the ISO string
100
+ dt = datetime.fromisoformat(iso_string)
101
+
102
+ # Convert to naive UTC if timezone-aware
103
+ if dt.tzinfo is not None:
104
+ # Convert to UTC and strip timezone
105
+ from datetime import timezone
106
+ dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
107
+
108
+ return dt
109
+
110
+
111
+ def parse_iso_safe(iso_string: Optional[str], default: Optional[datetime] = None) -> Optional[datetime]:
112
+ """
113
+ Safely parse ISO string, returning default on failure.
114
+
115
+ Args:
116
+ iso_string: ISO format string or None
117
+ default: Default value if parsing fails
118
+
119
+ Returns:
120
+ Parsed datetime or default value
121
+ """
122
+ if not iso_string:
123
+ return default
124
+ try:
125
+ return parse_iso(iso_string)
126
+ except (ValueError, TypeError):
127
+ return default
128
+
129
+
130
+ def format_timestamp(dt: Optional[datetime] = None) -> str:
131
+ """
132
+ Format datetime for display/logging.
133
+
134
+ Args:
135
+ dt: Datetime to format (defaults to current UTC time)
136
+
137
+ Returns:
138
+ Formatted string like "2024-01-15 10:30:00 UTC"
139
+ """
140
+ if dt is None:
141
+ dt = utc_now()
142
+ return dt.strftime("%Y-%m-%d %H:%M:%S") + " UTC"
143
+
144
+
145
+ def format_timestamp_compact(dt: Optional[datetime] = None) -> str:
146
+ """
147
+ Format datetime as compact string for filenames/IDs.
148
+
149
+ Args:
150
+ dt: Datetime to format (defaults to current UTC time)
151
+
152
+ Returns:
153
+ Formatted string like "20240115_103000"
154
+ """
155
+ if dt is None:
156
+ dt = utc_now()
157
+ return dt.strftime("%Y%m%d_%H%M%S")
158
+
159
+
160
+ def format_timestamp_for_experiment(dt: Optional[datetime] = None) -> str:
161
+ """
162
+ Format datetime for experiment names.
163
+
164
+ Args:
165
+ dt: Datetime to format (defaults to current UTC time)
166
+
167
+ Returns:
168
+ Formatted string like "20240115-103000"
169
+ """
170
+ if dt is None:
171
+ dt = utc_now()
172
+ return dt.strftime("%Y%m%d-%H%M%S")
173
+
174
+
175
def days_ago(days: int) -> datetime:
    """
    Return the UTC-naive instant that lies ``days`` days before now.

    Args:
        days: Number of days ago

    Returns:
        UTC-naive datetime
    """
    offset = timedelta(days=days)
    return utc_now() - offset
186
+
187
+
188
def hours_ago(hours: int) -> datetime:
    """
    Return the UTC-naive instant that lies ``hours`` hours before now.

    Args:
        hours: Number of hours ago

    Returns:
        UTC-naive datetime
    """
    offset = timedelta(hours=hours)
    return utc_now() - offset
199
+
200
+
201
def is_within_hours(dt: datetime, hours: int) -> bool:
    """
    Report whether ``dt`` is no older than ``hours`` hours before now.

    Args:
        dt: Datetime to check (should be UTC-naive)
        hours: Number of hours

    Returns:
        True if dt is at or after the cutoff (now minus ``hours`` hours)
    """
    return dt >= hours_ago(hours)
214
+
215
+
216
def is_within_days(dt: datetime, days: int) -> bool:
    """
    Report whether ``dt`` is no older than ``days`` days before now.

    Args:
        dt: Datetime to check (should be UTC-naive)
        days: Number of days

    Returns:
        True if dt is at or after the cutoff (now minus ``days`` days)
    """
    return dt >= days_ago(days)
@@ -0,0 +1,98 @@
1
+ """Dictionary utilities for nested access and field extraction.
2
+
3
+ Utilities for working with nested dictionaries and extracting values
4
+ for embeddings, serialization, etc.
5
+ """
6
+
7
+ import json
8
+ from typing import Any
9
+
10
+
11
def get_nested_value(data: dict[str, Any], path: str) -> Any:
    """Walk a nested dict/list structure along a dot-separated path.

    Args:
        data: Dictionary to traverse
        path: Dot-separated path (e.g., "candidate.name", "skills.0.proficiency")

    Returns:
        Value at the path, or None if not found

    Examples:
        >>> data = {"candidate": {"name": "John", "skills": [{"name": "Python"}]}}
        >>> get_nested_value(data, "candidate.name")
        'John'
        >>> get_nested_value(data, "candidate.skills.0.name")
        'Python'
        >>> get_nested_value(data, "candidate.missing")
        None
    """
    current: Any = data

    for segment in path.split("."):
        if isinstance(current, dict):
            current = current.get(segment)
        elif isinstance(current, list):
            # List segments must be non-negative, in-range integer indexes.
            try:
                position = int(segment)
            except (ValueError, TypeError):
                return None
            if not 0 <= position < len(current):
                return None
            current = current[position]
        else:
            # Scalars cannot be descended into.
            return None

        if current is None:
            return None

    return current
50
+
51
+
52
def extract_fields_for_embedding(
    data: dict[str, Any],
    fields: list[str],
) -> str:
    """Build the text to embed from selected fields of a dict.

    Each field path is resolved with dot notation. Fields that resolve
    to None are skipped. Lists and dicts are JSON-serialized, every other
    value goes through ``str()``, and the pieces are joined with newlines.

    Args:
        data: Dictionary containing data to extract
        fields: List of field paths (supports dot notation)

    Returns:
        Concatenated text suitable for embedding

    Examples:
        >>> data = {
        ...     "name": "John Doe",
        ...     "skills": ["Python", "PostgreSQL"],
        ...     "experience": {"years": 5, "level": "senior"}
        ... }
        >>> extract_fields_for_embedding(data, ["name", "skills"])
        'John Doe\\n["Python", "PostgreSQL"]'

        >>> extract_fields_for_embedding(data, ["name", "experience.level"])
        'John Doe\\nsenior'

        With no fields, the whole dict is returned as JSON indented by
        two spaces.
    """
    if not fields:
        # If no fields specified, embed entire JSON
        return json.dumps(data, indent=2)

    def _as_text(value: Any) -> str:
        # Containers keep their structure via JSON; scalars use str().
        if isinstance(value, (list, dict)):
            return json.dumps(value)
        return str(value)

    resolved = (get_nested_value(data, field) for field in fields)
    return "\n".join(_as_text(value) for value in resolved if value is not None)