remdb-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (187) hide show
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +566 -0
  44. rem/cli/commands/configure.py +497 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1302 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +96 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +676 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +336 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.3.0.dist-info/METADATA +1455 -0
  185. remdb-0.3.0.dist-info/RECORD +187 -0
  186. remdb-0.3.0.dist-info/WHEEL +4 -0
  187. remdb-0.3.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,276 @@
1
+ """
2
+ CLIP embeddings utility using Jina AI API.
3
+
4
+ Provides image and text embeddings using Jina CLIP models via API.
5
+ Falls back gracefully when API key is not available.
6
+
7
+ Future: Can be extended to support self-hosted CLIP models or other providers.
8
+ """
9
+
10
+ import base64
11
+ import os
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+ import requests
16
+ from loguru import logger
17
+
18
+
19
class CLIPEmbeddingResult:
    """Value holder for one generated CLIP embedding and its metadata."""

    def __init__(
        self,
        embedding: list[float],
        model: str,
        input_type: str,
        tokens_used: int = 0,
    ):
        """
        Store the embedding vector together with request metadata.

        Args:
            embedding: The embedding vector as a list of floats
            model: Name of the model that produced the vector
            input_type: Kind of input that was embedded (image or text)
            tokens_used: Tokens consumed by the request (for cost tracking)
        """
        self.embedding, self.model = embedding, model
        self.input_type, self.tokens_used = input_type, tokens_used

    @property
    def dimensions(self) -> int:
        """Number of components in the stored embedding vector."""
        return len(self.embedding)

    def __repr__(self) -> str:
        # The vector itself is deliberately left out; only its size is shown.
        summary = ", ".join(
            (
                f"model={self.model}",
                f"dims={self.dimensions}",
                f"tokens={self.tokens_used}",
            )
        )
        return f"CLIPEmbeddingResult({summary})"
50
+
51
+
52
+ class JinaCLIPEmbedder:
53
+ """
54
+ CLIP embeddings using Jina AI API.
55
+
56
+ Supports:
57
+ - jina-clip-v1: 768-dimensional embeddings
58
+ - jina-clip-v2: 512-dimensional embeddings (default)
59
+
60
+ Pricing:
61
+ - ~$0.02 per million tokens
62
+ - Images: 4000 tokens per 512x512 tile (v2)
63
+ - Images: 1000 tokens per 224x224 tile (v1)
64
+ - Free tier: 10M tokens for new users
65
+
66
+ Future extensions:
67
+ - Self-hosted CLIP models
68
+ - OpenCLIP support
69
+ - Batch embedding support
70
+ """
71
+
72
+ def __init__(
73
+ self,
74
+ api_key: Optional[str] = None,
75
+ model: str = "jina-clip-v2",
76
+ ):
77
+ """
78
+ Initialize Jina CLIP embedder.
79
+
80
+ Args:
81
+ api_key: Jina AI API key (from env if None)
82
+ model: CLIP model name (jina-clip-v1 or jina-clip-v2)
83
+ """
84
+ # Get API key from environment if not provided
85
+ # Check both CONTENT__JINA_API_KEY (preferred) and legacy JINA_API_KEY
86
+ if api_key is None:
87
+ api_key = os.getenv("CONTENT__JINA_API_KEY") or os.getenv("JINA_API_KEY")
88
+
89
+ self.api_key = api_key
90
+ self.model = model
91
+ self.api_url = "https://api.jina.ai/v1/embeddings"
92
+
93
+ # Warn if no API key
94
+ if not self.api_key:
95
+ logger.warning(
96
+ "No Jina API key found - CLIP embeddings will be disabled. "
97
+ "Set CONTENT__JINA_API_KEY or get a free key at https://jina.ai/embeddings/"
98
+ )
99
+
100
+ def is_available(self) -> bool:
101
+ """Check if Jina CLIP embeddings are available."""
102
+ return self.api_key is not None
103
+
104
+ def embed_image(
105
+ self,
106
+ image_path: str | Path,
107
+ ) -> Optional[CLIPEmbeddingResult]:
108
+ """
109
+ Generate CLIP embedding for an image.
110
+
111
+ Args:
112
+ image_path: Path to image file
113
+
114
+ Returns:
115
+ CLIPEmbeddingResult with embedding vector, or None if unavailable
116
+
117
+ Raises:
118
+ RuntimeError: If API request fails (when API key is available)
119
+ """
120
+ if not self.is_available():
121
+ logger.debug("Jina API key not available - skipping CLIP embedding")
122
+ return None
123
+
124
+ image_path = Path(image_path)
125
+ if not image_path.exists():
126
+ raise FileNotFoundError(f"Image file not found: {image_path}")
127
+
128
+ # Read and encode image to base64
129
+ with open(image_path, "rb") as f:
130
+ image_bytes = f.read()
131
+
132
+ image_b64 = base64.b64encode(image_bytes).decode("utf-8")
133
+
134
+ # Detect media type
135
+ suffix = image_path.suffix.lower()
136
+ media_type_map = {
137
+ ".png": "image/png",
138
+ ".jpg": "image/jpeg",
139
+ ".jpeg": "image/jpeg",
140
+ ".gif": "image/gif",
141
+ ".webp": "image/webp",
142
+ }
143
+ media_type = media_type_map.get(suffix, "image/png")
144
+
145
+ logger.debug(f"Generating CLIP embedding for {image_path.name} with {self.model}")
146
+
147
+ try:
148
+ # Build request
149
+ headers = {
150
+ "Authorization": f"Bearer {self.api_key}",
151
+ "Content-Type": "application/json",
152
+ }
153
+
154
+ # Jina API expects data URL format
155
+ data_url = f"data:{media_type};base64,{image_b64}"
156
+
157
+ body = {
158
+ "model": self.model,
159
+ "input": [data_url],
160
+ "input_type": "image",
161
+ }
162
+
163
+ response = requests.post(
164
+ self.api_url,
165
+ headers=headers,
166
+ json=body,
167
+ timeout=30.0,
168
+ )
169
+
170
+ if response.status_code != 200:
171
+ error_detail = response.text
172
+ logger.error(f"Jina API error: {response.status_code} - {error_detail}")
173
+ raise RuntimeError(f"CLIP embedding failed: {response.status_code} - {error_detail}")
174
+
175
+ result = response.json()
176
+
177
+ # Extract embedding and usage
178
+ embedding = result["data"][0]["embedding"]
179
+ tokens_used = result.get("usage", {}).get("total_tokens", 0)
180
+
181
+ logger.info(
182
+ f"✓ CLIP embedding generated: {len(embedding)} dims, {tokens_used} tokens"
183
+ )
184
+
185
+ return CLIPEmbeddingResult(
186
+ embedding=embedding,
187
+ model=self.model,
188
+ input_type="image",
189
+ tokens_used=tokens_used,
190
+ )
191
+
192
+ except requests.exceptions.Timeout:
193
+ logger.error("Jina API request timed out")
194
+ raise RuntimeError("CLIP embedding timed out after 30 seconds")
195
+ except requests.exceptions.RequestException as e:
196
+ logger.error(f"Request error: {e}")
197
+ raise RuntimeError(f"CLIP embedding request failed: {e}")
198
+ except Exception as e:
199
+ logger.error(f"Unexpected error during CLIP embedding: {e}")
200
+ raise
201
+
202
+ def embed_text(
203
+ self,
204
+ text: str,
205
+ ) -> Optional[CLIPEmbeddingResult]:
206
+ """
207
+ Generate CLIP embedding for text.
208
+
209
+ Useful for text-to-image search in shared embedding space.
210
+
211
+ Args:
212
+ text: Text to embed
213
+
214
+ Returns:
215
+ CLIPEmbeddingResult with embedding vector, or None if unavailable
216
+
217
+ Raises:
218
+ RuntimeError: If API request fails (when API key is available)
219
+ """
220
+ if not self.is_available():
221
+ logger.debug("Jina API key not available - skipping CLIP embedding")
222
+ return None
223
+
224
+ logger.debug(f"Generating CLIP text embedding with {self.model}")
225
+
226
+ try:
227
+ # Build request
228
+ headers = {
229
+ "Authorization": f"Bearer {self.api_key}",
230
+ "Content-Type": "application/json",
231
+ }
232
+
233
+ body = {
234
+ "model": self.model,
235
+ "input": [text],
236
+ "input_type": "text",
237
+ }
238
+
239
+ response = requests.post(
240
+ self.api_url,
241
+ headers=headers,
242
+ json=body,
243
+ timeout=30.0,
244
+ )
245
+
246
+ if response.status_code != 200:
247
+ error_detail = response.text
248
+ logger.error(f"Jina API error: {response.status_code} - {error_detail}")
249
+ raise RuntimeError(f"CLIP embedding failed: {response.status_code} - {error_detail}")
250
+
251
+ result = response.json()
252
+
253
+ # Extract embedding and usage
254
+ embedding = result["data"][0]["embedding"]
255
+ tokens_used = result.get("usage", {}).get("total_tokens", 0)
256
+
257
+ logger.info(
258
+ f"✓ CLIP text embedding generated: {len(embedding)} dims, {tokens_used} tokens"
259
+ )
260
+
261
+ return CLIPEmbeddingResult(
262
+ embedding=embedding,
263
+ model=self.model,
264
+ input_type="text",
265
+ tokens_used=tokens_used,
266
+ )
267
+
268
+ except requests.exceptions.Timeout:
269
+ logger.error("Jina API request timed out")
270
+ raise RuntimeError("CLIP embedding timed out after 30 seconds")
271
+ except requests.exceptions.RequestException as e:
272
+ logger.error(f"Request error: {e}")
273
+ raise RuntimeError(f"CLIP embedding request failed: {e}")
274
+ except Exception as e:
275
+ logger.error(f"Unexpected error during CLIP embedding: {e}")
276
+ raise
@@ -0,0 +1,98 @@
1
+ """Dictionary utilities for nested access and field extraction.
2
+
3
+ Utilities for working with nested dictionaries and extracting values
4
+ for embeddings, serialization, etc.
5
+ """
6
+
7
+ import json
8
+ from typing import Any
9
+
10
+
11
def get_nested_value(data: dict[str, Any], path: str) -> Any:
    """Look up a value inside nested dicts/lists by a dot-separated path.

    Each path segment is used as a dict key when the current value is a
    dict, or parsed as a non-negative list index when it is a list.

    Args:
        data: Dictionary to traverse
        path: Dot-separated path (e.g., "candidate.name", "skills.0.proficiency")

    Returns:
        Value at the path, or None if any step is missing, out of range,
        not an integer index for a list, or lands on a non-container

    Examples:
        >>> data = {"candidate": {"name": "John", "skills": [{"name": "Python"}]}}
        >>> get_nested_value(data, "candidate.name")
        'John'
        >>> get_nested_value(data, "candidate.skills.0.name")
        'Python'
        >>> get_nested_value(data, "candidate.missing") is None
        True
    """
    current: Any = data

    for segment in path.split("."):
        if isinstance(current, dict):
            current = current.get(segment)
        elif isinstance(current, list):
            # List step: the segment must be an in-range, non-negative index.
            try:
                position = int(segment)
            except (ValueError, TypeError):
                return None
            if position < 0 or position >= len(current):
                return None
            current = current[position]
        else:
            # Scalars cannot be descended into.
            return None

        # A missing key and a stored None are both reported as "not found".
        if current is None:
            return None

    return current
50
+
51
+
52
def extract_fields_for_embedding(
    data: dict[str, Any],
    fields: list[str],
) -> str:
    """Build the text to embed from selected fields of a dict.

    Each field path is resolved with ``get_nested_value`` (dot notation).
    Fields that resolve to None are skipped. Lists and dicts are rendered
    as JSON; everything else is rendered with ``str``. The rendered values
    are joined with newlines.

    Args:
        data: Dictionary containing data to extract
        fields: List of field paths (supports dot notation)

    Returns:
        Concatenated text suitable for embedding; when ``fields`` is empty,
        the whole dict as indented JSON

    Examples:
        >>> data = {
        ...     "name": "John Doe",
        ...     "skills": ["Python", "PostgreSQL"],
        ...     "experience": {"years": 5, "level": "senior"}
        ... }
        >>> extract_fields_for_embedding(data, ["name", "skills"])
        'John Doe\\n["Python", "PostgreSQL"]'

        >>> extract_fields_for_embedding(data, ["name", "experience.level"])
        'John Doe\\nsenior'
    """
    if not fields:
        # No explicit selection: embed the entire dict as JSON.
        return json.dumps(data, indent=2)

    def _render(item: Any) -> str:
        # Containers become JSON text; scalars use their plain string form.
        return json.dumps(item) if isinstance(item, (list, dict)) else str(item)

    resolved = (get_nested_value(data, name) for name in fields)
    return "\n".join(_render(item) for item in resolved if item is not None)