remdb 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +806 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.0.dist-info/METADATA +1455 -0
- remdb-0.3.0.dist-info/RECORD +187 -0
- remdb-0.3.0.dist-info/WHEEL +4 -0
- remdb-0.3.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLIP embeddings utility using Jina AI API.
|
|
3
|
+
|
|
4
|
+
Provides image and text embeddings using Jina CLIP models via API.
|
|
5
|
+
Falls back gracefully when API key is not available.
|
|
6
|
+
|
|
7
|
+
Future: Can be extended to support self-hosted CLIP models or other providers.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import base64
|
|
11
|
+
import os
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
import requests
|
|
16
|
+
from loguru import logger
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CLIPEmbeddingResult:
    """Value object holding one embedding returned by a CLIP model.

    Attributes:
        embedding: The embedding vector as a list of floats.
        model: Name of the model that produced the vector.
        input_type: Kind of input that was embedded ("image" or "text").
        tokens_used: Tokens billed for the request (0 when not reported).
    """

    def __init__(
        self,
        embedding: list[float],
        model: str,
        input_type: str,
        tokens_used: int = 0,
    ):
        """Store the vector together with its provenance.

        Args:
            embedding: Vector embedding.
            model: Model name used.
            input_type: Type of input (image or text).
            tokens_used: Number of tokens consumed (for cost tracking).
        """
        self.embedding, self.model = embedding, model
        self.input_type, self.tokens_used = input_type, tokens_used

    @property
    def dimensions(self) -> int:
        """Length of the embedding vector."""
        vector = self.embedding
        return len(vector)

    def __repr__(self) -> str:
        # The vector itself is deliberately left out: it is long and unreadable.
        model_name = self.model
        dims = self.dimensions
        tokens = self.tokens_used
        return f"CLIPEmbeddingResult(model={model_name}, dims={dims}, tokens={tokens})"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class JinaCLIPEmbedder:
    """
    CLIP embeddings using Jina AI API.

    Supports:
    - jina-clip-v1: 768-dimensional embeddings
    - jina-clip-v2: default model
      NOTE(review): this docstring previously listed v2 as 512-dimensional;
      that may be a mix-up with the 512x512 tile size below - confirm the
      actual output size against Jina's model documentation.

    Pricing (as recorded by the original author; verify before relying on it):
    - ~$0.02 per million tokens
    - Images: 4000 tokens per 512x512 tile (v2)
    - Images: 1000 tokens per 224x224 tile (v1)
    - Free tier: 10M tokens for new users

    Future extensions:
    - Self-hosted CLIP models
    - OpenCLIP support
    - Batch embedding support
    """

    # Single source of truth for the HTTP timeout so the value passed to
    # requests and the value quoted in the error message cannot drift apart.
    _TIMEOUT_SECONDS = 30.0

    # File suffix -> MIME type used when building the image data URL.
    _MEDIA_TYPES = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "jina-clip-v2",
    ):
        """
        Initialize Jina CLIP embedder.

        Args:
            api_key: Jina AI API key (from env if None)
            model: CLIP model name (jina-clip-v1 or jina-clip-v2)
        """
        # Get API key from environment if not provided.
        # Check both CONTENT__JINA_API_KEY (preferred) and legacy JINA_API_KEY.
        if api_key is None:
            api_key = os.getenv("CONTENT__JINA_API_KEY") or os.getenv("JINA_API_KEY")

        self.api_key = api_key
        self.model = model
        self.api_url = "https://api.jina.ai/v1/embeddings"

        # Warn if no API key (None or empty string both count as "missing").
        if not self.api_key:
            logger.warning(
                "No Jina API key found - CLIP embeddings will be disabled. "
                "Set CONTENT__JINA_API_KEY or get a free key at https://jina.ai/embeddings/"
            )

    def is_available(self) -> bool:
        """Check if Jina CLIP embeddings are available.

        Uses the same truthiness rule as the warning in ``__init__``: an
        empty-string key is treated as missing, so the embedder never sends
        a request with an empty bearer token.
        """
        return bool(self.api_key)

    def embed_image(
        self,
        image_path: str | Path,
    ) -> Optional[CLIPEmbeddingResult]:
        """
        Generate CLIP embedding for an image.

        Args:
            image_path: Path to image file

        Returns:
            CLIPEmbeddingResult with embedding vector, or None if unavailable

        Raises:
            FileNotFoundError: If ``image_path`` does not exist
            RuntimeError: If API request fails (when API key is available)
        """
        if not self.is_available():
            logger.debug("Jina API key not available - skipping CLIP embedding")
            return None

        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # Read and encode image to base64
        image_b64 = base64.b64encode(image_path.read_bytes()).decode("utf-8")

        # Detect media type from the suffix; unknown suffixes fall back to PNG.
        media_type = self._MEDIA_TYPES.get(image_path.suffix.lower(), "image/png")

        logger.debug(f"Generating CLIP embedding for {image_path.name} with {self.model}")

        # Jina API expects data URL format
        data_url = f"data:{media_type};base64,{image_b64}"
        return self._request_embedding(data_url, "image", "CLIP embedding")

    def embed_text(
        self,
        text: str,
    ) -> Optional[CLIPEmbeddingResult]:
        """
        Generate CLIP embedding for text.

        Useful for text-to-image search in shared embedding space.

        Args:
            text: Text to embed

        Returns:
            CLIPEmbeddingResult with embedding vector, or None if unavailable

        Raises:
            RuntimeError: If API request fails (when API key is available)
        """
        if not self.is_available():
            logger.debug("Jina API key not available - skipping CLIP embedding")
            return None

        logger.debug(f"Generating CLIP text embedding with {self.model}")
        return self._request_embedding(text, "text", "CLIP text embedding")

    def _request_embedding(
        self,
        payload: str,
        input_type: str,
        label: str,
    ) -> CLIPEmbeddingResult:
        """POST a single input to the embeddings endpoint and parse the reply.

        Args:
            payload: The value sent as the only element of ``input`` (plain
                text, or a base64 data URL for images).
            input_type: Value for the request's ``input_type`` field; also
                recorded on the returned result.
            label: Prefix used in the success log line.

        Returns:
            CLIPEmbeddingResult built from the first item in the response.

        Raises:
            RuntimeError: On timeout, any other requests-level failure, or a
                non-200 status code.
            Exception: Any other error (e.g. an unexpected response shape) is
                logged and re-raised unchanged.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        body = {
            "model": self.model,
            "input": [payload],
            "input_type": input_type,
        }

        try:
            response = requests.post(
                self.api_url,
                headers=headers,
                json=body,
                timeout=self._TIMEOUT_SECONDS,
            )

            if response.status_code != 200:
                error_detail = response.text
                logger.error(f"Jina API error: {response.status_code} - {error_detail}")
                raise RuntimeError(f"CLIP embedding failed: {response.status_code} - {error_detail}")

            result = response.json()

            # Extract embedding and usage; tolerate a missing or null "usage".
            embedding = result["data"][0]["embedding"]
            tokens_used = (result.get("usage") or {}).get("total_tokens", 0)

        except requests.exceptions.Timeout as e:
            logger.error("Jina API request timed out")
            raise RuntimeError(
                f"CLIP embedding timed out after {self._TIMEOUT_SECONDS:g} seconds"
            ) from e
        except requests.exceptions.RequestException as e:
            logger.error(f"Request error: {e}")
            raise RuntimeError(f"CLIP embedding request failed: {e}") from e
        except RuntimeError:
            # The non-200 branch above has already logged this failure; let it
            # propagate without a second, misleading "unexpected error" entry.
            raise
        except Exception as e:
            logger.error(f"Unexpected error during CLIP embedding: {e}")
            raise

        logger.info(
            f"✓ {label} generated: {len(embedding)} dims, {tokens_used} tokens"
        )

        return CLIPEmbeddingResult(
            embedding=embedding,
            model=self.model,
            input_type=input_type,
            tokens_used=tokens_used,
        )
|
rem/utils/dict_utils.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Dictionary utilities for nested access and field extraction.
|
|
2
|
+
|
|
3
|
+
Utilities for working with nested dictionaries and extracting values
|
|
4
|
+
for embeddings, serialization, etc.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_nested_value(data: dict[str, Any], path: str) -> Any:
    """Resolve a dot-separated path against nested dicts and lists.

    Each path segment is looked up with ``dict.get`` when the current value
    is a dict, or interpreted as a non-negative integer index when the
    current value is a list. Anything else ends the walk.

    Args:
        data: Dictionary to traverse
        path: Dot-separated path (e.g., "candidate.name", "skills.0.proficiency")

    Returns:
        Value at the path, or None if any step is missing, out of range,
        not a valid index, or lands on a value that cannot be traversed

    Examples:
        >>> data = {"candidate": {"name": "John", "skills": [{"name": "Python"}]}}
        >>> get_nested_value(data, "candidate.name")
        'John'
        >>> get_nested_value(data, "candidate.skills.0.name")
        'Python'
        >>> get_nested_value(data, "candidate.missing") is None
        True
    """
    current: Any = data

    for segment in path.split("."):
        if isinstance(current, dict):
            current = current.get(segment)
        elif isinstance(current, list):
            # List step: the segment must parse as an in-range, non-negative index.
            try:
                position = int(segment)
            except (ValueError, TypeError):
                return None
            if not 0 <= position < len(current):
                return None
            current = current[position]
        else:
            # Scalars (str, int, ...) cannot be descended into.
            return None

        if current is None:
            return None

    return current
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def extract_fields_for_embedding(
    data: dict[str, Any],
    fields: list[str],
) -> str:
    """Build the text to embed from selected fields of a dict.

    Each field path is resolved with ``get_nested_value`` (dot notation).
    Paths that resolve to None are skipped. Lists and dicts are rendered
    with ``json.dumps``; every other value with ``str``. The rendered values
    are joined with newlines, in the order the fields were given.

    Args:
        data: Dictionary containing data to extract
        fields: List of field paths (supports dot notation)

    Returns:
        Concatenated text suitable for embedding. When ``fields`` is empty,
        the whole of ``data`` is returned as JSON pretty-printed with a
        two-space indent.

    Examples:
        >>> data = {
        ...     "name": "John Doe",
        ...     "skills": ["Python", "PostgreSQL"],
        ...     "experience": {"years": 5, "level": "senior"}
        ... }
        >>> extract_fields_for_embedding(data, ["name", "skills"])
        'John Doe\\n["Python", "PostgreSQL"]'

        >>> extract_fields_for_embedding(data, ["name", "experience.level"])
        'John Doe\\nsenior'
    """
    if fields:
        resolved = (get_nested_value(data, field_path) for field_path in fields)
        return "\n".join(
            json.dumps(item) if isinstance(item, (list, dict)) else str(item)
            for item in resolved
            if item is not None
        )

    # No fields requested: embed the entire document.
    return json.dumps(data, indent=2)
|