foodforthought-cli 0.2.8__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ate/__init__.py +6 -0
- ate/__main__.py +16 -0
- ate/auth/__init__.py +1 -0
- ate/auth/device_flow.py +141 -0
- ate/auth/token_store.py +96 -0
- ate/behaviors/__init__.py +12 -0
- ate/behaviors/approach.py +399 -0
- ate/cli.py +855 -4551
- ate/client.py +90 -0
- ate/commands/__init__.py +168 -0
- ate/commands/auth.py +389 -0
- ate/commands/bridge.py +448 -0
- ate/commands/data.py +185 -0
- ate/commands/deps.py +111 -0
- ate/commands/generate.py +384 -0
- ate/commands/memory.py +907 -0
- ate/commands/parts.py +166 -0
- ate/commands/primitive.py +399 -0
- ate/commands/protocol.py +288 -0
- ate/commands/recording.py +524 -0
- ate/commands/repo.py +154 -0
- ate/commands/simulation.py +291 -0
- ate/commands/skill.py +303 -0
- ate/commands/skills.py +487 -0
- ate/commands/team.py +147 -0
- ate/commands/workflow.py +271 -0
- ate/detection/__init__.py +38 -0
- ate/detection/base.py +142 -0
- ate/detection/color_detector.py +402 -0
- ate/detection/trash_detector.py +322 -0
- ate/drivers/__init__.py +18 -6
- ate/drivers/ble_transport.py +405 -0
- ate/drivers/mechdog.py +360 -24
- ate/drivers/wifi_camera.py +477 -0
- ate/interfaces/__init__.py +16 -0
- ate/interfaces/base.py +2 -0
- ate/interfaces/sensors.py +247 -0
- ate/llm_proxy.py +239 -0
- ate/memory/__init__.py +35 -0
- ate/memory/cloud.py +244 -0
- ate/memory/context.py +269 -0
- ate/memory/embeddings.py +184 -0
- ate/memory/export.py +26 -0
- ate/memory/merge.py +146 -0
- ate/memory/migrate/__init__.py +34 -0
- ate/memory/migrate/base.py +89 -0
- ate/memory/migrate/pipeline.py +189 -0
- ate/memory/migrate/sources/__init__.py +13 -0
- ate/memory/migrate/sources/chroma.py +170 -0
- ate/memory/migrate/sources/pinecone.py +120 -0
- ate/memory/migrate/sources/qdrant.py +110 -0
- ate/memory/migrate/sources/weaviate.py +160 -0
- ate/memory/reranker.py +353 -0
- ate/memory/search.py +26 -0
- ate/memory/store.py +548 -0
- ate/recording/__init__.py +42 -3
- ate/recording/session.py +12 -2
- ate/recording/visual.py +416 -0
- ate/robot/__init__.py +142 -0
- ate/robot/agentic_servo.py +856 -0
- ate/robot/behaviors.py +493 -0
- ate/robot/ble_capture.py +1000 -0
- ate/robot/ble_enumerate.py +506 -0
- ate/robot/calibration.py +88 -3
- ate/robot/calibration_state.py +388 -0
- ate/robot/commands.py +143 -11
- ate/robot/direction_calibration.py +554 -0
- ate/robot/discovery.py +104 -2
- ate/robot/llm_system_id.py +654 -0
- ate/robot/locomotion_calibration.py +508 -0
- ate/robot/marker_generator.py +611 -0
- ate/robot/perception.py +502 -0
- ate/robot/primitives.py +614 -0
- ate/robot/profiles.py +6 -0
- ate/robot/registry.py +5 -2
- ate/robot/servo_mapper.py +1153 -0
- ate/robot/skill_upload.py +285 -3
- ate/robot/target_calibration.py +500 -0
- ate/robot/teach.py +515 -0
- ate/robot/types.py +242 -0
- ate/robot/visual_labeler.py +9 -0
- ate/robot/visual_servo_loop.py +494 -0
- ate/robot/visual_servoing.py +570 -0
- ate/robot/visual_system_id.py +906 -0
- ate/transports/__init__.py +121 -0
- ate/transports/base.py +394 -0
- ate/transports/ble.py +405 -0
- ate/transports/hybrid.py +444 -0
- ate/transports/serial.py +345 -0
- ate/urdf/__init__.py +30 -0
- ate/urdf/capture.py +582 -0
- ate/urdf/cloud.py +491 -0
- ate/urdf/collision.py +271 -0
- ate/urdf/commands.py +708 -0
- ate/urdf/depth.py +360 -0
- ate/urdf/inertial.py +312 -0
- ate/urdf/kinematics.py +330 -0
- ate/urdf/lifting.py +415 -0
- ate/urdf/meshing.py +300 -0
- ate/urdf/models/__init__.py +110 -0
- ate/urdf/models/depth_anything.py +253 -0
- ate/urdf/models/sam2.py +324 -0
- ate/urdf/motion_analysis.py +396 -0
- ate/urdf/pipeline.py +468 -0
- ate/urdf/scale.py +256 -0
- ate/urdf/scan_session.py +411 -0
- ate/urdf/segmentation.py +299 -0
- ate/urdf/synthesis.py +319 -0
- ate/urdf/topology.py +336 -0
- ate/urdf/validation.py +371 -0
- {foodforthought_cli-0.2.8.dist-info → foodforthought_cli-0.3.1.dist-info}/METADATA +1 -1
- foodforthought_cli-0.3.1.dist-info/RECORD +166 -0
- {foodforthought_cli-0.2.8.dist-info → foodforthought_cli-0.3.1.dist-info}/WHEEL +1 -1
- foodforthought_cli-0.2.8.dist-info/RECORD +0 -73
- {foodforthought_cli-0.2.8.dist-info → foodforthought_cli-0.3.1.dist-info}/entry_points.txt +0 -0
- {foodforthought_cli-0.2.8.dist-info → foodforthought_cli-0.3.1.dist-info}/top_level.txt +0 -0
ate/memory/export.py
ADDED
@@ -0,0 +1,26 @@
+"""Export operations and info structures."""
+
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class MemoryInfo:
+    """Information about a memory store.
+
+    Attributes:
+        path: Path to the .mv2 file
+        frame_count: Number of memory frames stored
+        size_bytes: Total size in bytes
+        has_lex_index: Whether lexical indexing is enabled
+        has_vec_index: Whether vector indexing is enabled
+        has_time_index: Whether time indexing is enabled
+        created_at: ISO timestamp when created (optional)
+    """
+    path: str
+    frame_count: int
+    size_bytes: int
+    has_lex_index: bool
+    has_vec_index: bool
+    has_time_index: bool
+    created_at: Optional[str] = None
ate/memory/merge.py
ADDED
@@ -0,0 +1,146 @@
+"""Memory merging operations."""
+
+from typing import List
+import json
+
+from .store import MemoryStore
+from .export import MemoryInfo
+
+
+def merge_memories(source_paths: List[str], output_path: str, dedup: bool = True) -> MemoryInfo:
+    """Merge multiple .mv2 files into a single output file.
+
+    Args:
+        source_paths: List of paths to source .mv2 files
+        output_path: Path where merged .mv2 file will be created
+        dedup: Whether to deduplicate identical content (default True)
+
+    Returns:
+        MemoryInfo about the merged output file
+    """
+    # Create the output memory store
+    output_store = MemoryStore.create(output_path)
+
+    seen_texts = set() if dedup else None
+    all_items = []
+
+    try:
+        # Process each source file
+        for source_path in source_paths:
+            source_store = MemoryStore.open(source_path)
+
+            try:
+                # Try timeline-based iteration first (works with real memvid)
+                items_from_source = []
+                try:
+                    timeline = source_store._mem.timeline()
+                    if timeline:
+                        for entry in timeline:
+                            if isinstance(entry, dict):
+                                uri = entry.get('uri', f"mv2://frames/{entry.get('frame_id', 0)}")
+                                frame_data = source_store._mem.frame(uri)
+
+                                title = None
+                                tags = []
+                                metadata = {}
+
+                                # Get text from timeline preview (labels are just keywords)
+                                text = entry.get('preview', '').split('\ntitle:')[0].split('\ntags:')[0].strip()
+
+                                if isinstance(frame_data, dict):
+                                    title = frame_data.get('title')
+                                    tags = frame_data.get('tags', [])
+                                    for key, value in frame_data.get('extra_metadata', {}).items():
+                                        if key == 'extractous_metadata':
+                                            continue
+                                        try:
+                                            if isinstance(value, str) and (value.startswith('{') or value.startswith('"')):
+                                                metadata[key] = json.loads(value)
+                                            else:
+                                                metadata[key] = value
+                                        except json.JSONDecodeError:
+                                            metadata[key] = value
+
+                                if text:
+                                    items_from_source.append({
+                                        'text': text,
+                                        'title': title,
+                                        'tags': tags,
+                                        'metadata': metadata
+                                    })
+                except (AttributeError, TypeError):
+                    pass # Fall through to search-based approach
+
+                # Fallback: use search (for mocked tests)
+                if not items_from_source:
+                    search_result = source_store.search("*", top_k=10000)
+
+                    if hasattr(search_result, 'hits'):
+                        search_results = search_result.hits
+                    elif isinstance(search_result, list):
+                        search_results = search_result
+                    else:
+                        search_results = []
+
+                    for search_item in search_results:
+                        text = getattr(search_item, 'snippet', getattr(search_item, 'text', ''))
+                        title = getattr(search_item, 'title', None)
+                        tags = getattr(search_item, 'tags', [])
+                        frame_id = getattr(search_item, 'frame_id', 0)
+
+                        metadata = {}
+                        try:
+                            frame_data = source_store._mem.frame(frame_id)
+                            if hasattr(frame_data, 'metadata'):
+                                metadata = frame_data.metadata or {}
+                        except Exception:
+                            try:
+                                if hasattr(search_item, 'metadata'):
+                                    metadata = search_item.metadata or {}
+                            except Exception:
+                                metadata = {}
+
+                        items_from_source.append({
+                            'text': text,
+                            'title': title,
+                            'tags': tags,
+                            'metadata': metadata
+                        })
+
+                for item in items_from_source:
+                    text = item['text']
+
+                    if dedup and text in seen_texts:
+                        continue
+
+                    if dedup:
+                        seen_texts.add(text)
+
+                    all_items.append(item)
+
+            finally:
+                source_store.close()
+
+        # Add all collected items to the output store
+        output_store.add_batch(all_items)
+
+        # Get info about the merged result
+        info = output_store.info()
+
+    finally:
+        output_store.close()
+
+    # Handle mocked info object vs real MemoryInfo
+    if hasattr(info, 'frame_count') and not isinstance(info, MemoryInfo):
+        # It's a mock, create actual MemoryInfo
+        return MemoryInfo(
+            path=info.path,
+            frame_count=info.frame_count,
+            size_bytes=info.size_bytes,
+            has_lex_index=info.has_lex_index,
+            has_vec_index=info.has_vec_index,
+            has_time_index=info.has_time_index,
+            created_at=getattr(info, 'created_at', None)
+        )
+
+    return info
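For orientation, a minimal usage sketch of the merge helper added above; the file paths are hypothetical, and the import assumes the wheel's `ate` top-level package as listed in this diff:

from ate.memory.merge import merge_memories

# Hypothetical source and output paths; dedup drops frames with identical text.
info = merge_memories(
    source_paths=["robot_a.mv2", "robot_b.mv2"],
    output_path="combined.mv2",
    dedup=True,
)
print(f"{info.frame_count} frames, {info.size_bytes} bytes at {info.path}")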
ate/memory/migrate/__init__.py
ADDED
@@ -0,0 +1,34 @@
+"""Migration module for vector database migrations."""
+
+from .base import (
+    VectorRecord,
+    MigrationEstimate,
+    MigrationResult,
+    MigrationCheckpoint,
+    MigrationSource
+)
+from .pipeline import MigrationPipeline
+from .sources import (
+    PineconeMigrationSource,
+    QdrantMigrationSource,
+    WeaviateMigrationSource,
+    ChromaMigrationSource
+)
+
+__all__ = [
+    # Base classes and data structures
+    'VectorRecord',
+    'MigrationEstimate',
+    'MigrationResult',
+    'MigrationCheckpoint',
+    'MigrationSource',
+
+    # Pipeline
+    'MigrationPipeline',
+
+    # Sources
+    'PineconeMigrationSource',
+    'QdrantMigrationSource',
+    'WeaviateMigrationSource',
+    'ChromaMigrationSource'
+]
ate/memory/migrate/base.py
ADDED
@@ -0,0 +1,89 @@
+"""Base classes and data structures for migration."""
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Optional, List, Dict, Any, Tuple
+
+
+@dataclass
+class VectorRecord:
+    """A vector record with metadata."""
+    id: str
+    vector: List[float]
+    text: Optional[str] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class MigrationEstimate:
+    """Estimation of migration resources and time."""
+    total_vectors: int
+    dimensions: int
+    estimated_mv2_bytes: int
+    estimated_seconds: float
+    source_size_bytes: Optional[int] = None
+    compression_ratio: Optional[float] = None
+
+
+@dataclass
+class MigrationResult:
+    """Result of a migration operation."""
+    source_type: str
+    source_name: str
+    output_path: str
+    total_migrated: int
+    total_skipped: int
+    duration_seconds: float
+    output_size_bytes: int
+    compression_ratio: Optional[float] = None
+    errors: List[str] = field(default_factory=list)
+
+
+@dataclass
+class MigrationCheckpoint:
+    """Migration checkpoint for resuming."""
+    source_type: str
+    source_name: str
+    output_path: str
+    last_cursor: Optional[str]
+    records_completed: int
+    started_at: str
+
+
+class MigrationSource(ABC):
+    """Abstract base class for migration sources."""
+
+    @property
+    @abstractmethod
+    def source_type(self) -> str:
+        """Return the source type (e.g., 'pinecone', 'qdrant')."""
+        pass
+
+    @property
+    @abstractmethod
+    def source_name(self) -> str:
+        """Return the source name (e.g., index name, collection name)."""
+        pass
+
+    @abstractmethod
+    def connect(self) -> None:
+        """Connect to the source."""
+        pass
+
+    @abstractmethod
+    def estimate(self) -> MigrationEstimate:
+        """Estimate migration size and time."""
+        pass
+
+    @abstractmethod
+    def fetch_batch(self, batch_size: int = 10000, cursor: Optional[str] = None) -> Tuple[List[VectorRecord], Optional[str]]:
+        """Fetch a batch of records.
+
+        Returns:
+            Tuple of (records, next_cursor). next_cursor is None if this is the last batch.
+        """
+        pass
+
+    @abstractmethod
+    def close(self) -> None:
+        """Close the connection and clean up resources."""
+        pass
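As a sketch only, the abstract interface above can be satisfied by a small in-memory source; the class below is illustrative and not part of the wheel, and it assumes the re-exports from ate.memory.migrate shown earlier in this diff:

from typing import List, Optional, Tuple

from ate.memory.migrate import MigrationSource, VectorRecord, MigrationEstimate


class ListMigrationSource(MigrationSource):
    """Illustrative source that serves records from an in-memory list."""

    def __init__(self, records: List[VectorRecord]):
        self._records = records

    @property
    def source_type(self) -> str:
        return "list"

    @property
    def source_name(self) -> str:
        return "in-memory"

    def connect(self) -> None:
        pass  # nothing to connect to

    def estimate(self) -> MigrationEstimate:
        dims = len(self._records[0].vector) if self._records else 0
        return MigrationEstimate(
            total_vectors=len(self._records),
            dimensions=dims,
            estimated_mv2_bytes=len(self._records) * (dims * 4 + 1024),
            estimated_seconds=len(self._records) / 1200.0,
        )

    def fetch_batch(self, batch_size: int = 10000,
                    cursor: Optional[str] = None) -> Tuple[List[VectorRecord], Optional[str]]:
        offset = int(cursor) if cursor else 0
        batch = self._records[offset:offset + batch_size]
        next_cursor = str(offset + batch_size) if offset + batch_size < len(self._records) else None
        return batch, next_cursor

    def close(self) -> None:
        pass  # nothing to release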
ate/memory/migrate/pipeline.py
ADDED
@@ -0,0 +1,189 @@
+"""Migration pipeline for orchestrating the migration process."""
+import json
+import os
+import time
+from typing import Optional, Callable, Any
+
+from .base import MigrationSource, MigrationResult, MigrationCheckpoint
+
+
+class MigrationPipeline:
+    """Pipeline for orchestrating vector database migrations."""
+
+    def __init__(self, source: MigrationSource, output_path: str,
+                 batch_size: int = 10000, checkpoint_path: Optional[str] = None):
+        """Initialize the migration pipeline.
+
+        Args:
+            source: The migration source to read from
+            output_path: Path to write the .mv2 output file
+            batch_size: Number of records to fetch per batch
+            checkpoint_path: Optional path for checkpoint file
+        """
+        self.source = source
+        self.output_path = output_path
+        self.batch_size = batch_size
+        self.checkpoint_path = checkpoint_path
+        self.progress_callbacks = []
+
+    def on_progress(self, callback: Callable[[int, int], None]) -> None:
+        """Register a progress callback function.
+
+        Args:
+            callback: Function called with (records_done, total_estimate)
+        """
+        self.progress_callbacks.append(callback)
+
+    def run(self, dry_run: bool = False) -> MigrationResult:
+        """Run the migration.
+
+        Args:
+            dry_run: If True, only estimate without migrating
+
+        Returns:
+            MigrationResult with operation details
+        """
+        start_time = time.time()
+
+        try:
+            self.source.connect()
+
+            # Get estimate
+            estimate = self.source.estimate()
+
+            if dry_run:
+                # For dry run, return early with estimate-based result
+                return MigrationResult(
+                    source_type=self.source.source_type,
+                    source_name=self.source.source_name,
+                    output_path=self.output_path,
+                    total_migrated=0,
+                    total_skipped=0,
+                    duration_seconds=time.time() - start_time,
+                    output_size_bytes=0
+                )
+
+            # Create output .mv2 using MemoryStore
+            parent_dir = os.path.dirname(self.output_path)
+            if parent_dir:
+                os.makedirs(parent_dir, exist_ok=True)
+
+            from ..store import MemoryStore
+            output_store = MemoryStore.create(self.output_path)
+
+            total_migrated = 0
+            total_skipped = 0
+            errors = []
+            cursor = None
+
+            try:
+                while True:
+                    try:
+                        records, next_cursor = self.source.fetch_batch(self.batch_size, cursor)
+
+                        if not records:
+                            break
+
+                        # Write records to .mv2 via MemoryStore
+                        for record in records:
+                            text = record.text or f"[vector-only record id={record.id}]"
+                            metadata = dict(record.metadata) if record.metadata else {}
+                            metadata['_source_id'] = record.id
+                            metadata['_source_type'] = self.source.source_type
+
+                            try:
+                                output_store.add(
+                                    text=text,
+                                    metadata=metadata
+                                )
+                                total_migrated += 1
+                            except Exception as e:
+                                total_skipped += 1
+                                errors.append(f"Record {record.id}: {str(e)}")
+
+                        # Update checkpoint
+                        if self.checkpoint_path:
+                            checkpoint = MigrationCheckpoint(
+                                source_type=self.source.source_type,
+                                source_name=self.source.source_name,
+                                output_path=self.output_path,
+                                last_cursor=cursor,
+                                records_completed=total_migrated,
+                                started_at=time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(start_time))
+                            )
+                            self._write_checkpoint(checkpoint)
+
+                        # Call progress callbacks
+                        for callback in self.progress_callbacks:
+                            callback(total_migrated, estimate.total_vectors)
+
+                        cursor = next_cursor
+                        if cursor is None:
+                            break
+
+                    except Exception as e:
+                        errors.append(str(e))
+                        break
+            finally:
+                output_store.close()
+
+            # Calculate output file size
+            output_size = os.path.getsize(self.output_path) if os.path.exists(self.output_path) else 0
+
+            return MigrationResult(
+                source_type=self.source.source_type,
+                source_name=self.source.source_name,
+                output_path=self.output_path,
+                total_migrated=total_migrated,
+                total_skipped=total_skipped,
+                duration_seconds=time.time() - start_time,
+                output_size_bytes=output_size,
+                errors=errors
+            )
+
+        except Exception as e:
+            return MigrationResult(
+                source_type=self.source.source_type,
+                source_name=self.source.source_name,
+                output_path=self.output_path,
+                total_migrated=0,
+                total_skipped=0,
+                duration_seconds=time.time() - start_time,
+                output_size_bytes=0,
+                errors=[str(e)]
+            )
+        finally:
+            try:
+                self.source.close()
+            except:
+                pass
+
+    def resume(self) -> MigrationResult:
+        """Resume migration from checkpoint.
+
+        Raises:
+            NotImplementedError: Resume is not yet implemented.
+        """
+        raise NotImplementedError(
+            "Resume not yet implemented — use run() to restart migration"
+        )
+
+    def _write_checkpoint(self, checkpoint: MigrationCheckpoint) -> None:
+        """Write checkpoint to file."""
+        if not self.checkpoint_path:
+            return
+
+        checkpoint_data = {
+            'source_type': checkpoint.source_type,
+            'source_name': checkpoint.source_name,
+            'output_path': checkpoint.output_path,
+            'last_cursor': checkpoint.last_cursor,
+            'records_completed': checkpoint.records_completed,
+            'started_at': checkpoint.started_at
+        }
+
+        parent = os.path.dirname(self.checkpoint_path)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+        with open(self.checkpoint_path, 'w') as f:
+            json.dump(checkpoint_data, f)
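A minimal sketch of driving the pipeline above end to end; the Chroma arguments, output path, and checkpoint path are illustrative, and ChromaMigrationSource is the source class added later in this diff:

from ate.memory.migrate import MigrationPipeline, ChromaMigrationSource

# Hypothetical local Chroma directory and collection name.
source = ChromaMigrationSource(path="./chroma_db", collection_name="docs")
pipeline = MigrationPipeline(
    source,
    output_path="docs.mv2",
    batch_size=1000,
    checkpoint_path="docs.checkpoint.json",
)
pipeline.on_progress(lambda done, total: print(f"{done}/{total} records migrated"))

estimate_only = pipeline.run(dry_run=True)  # connects, estimates, writes nothing
result = pipeline.run()                     # full migration into docs.mv2
print(result.total_migrated, result.total_skipped, result.errors)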
ate/memory/migrate/sources/__init__.py
ADDED
@@ -0,0 +1,13 @@
+"""Migration sources for various vector databases."""
+
+from .pinecone import PineconeMigrationSource
+from .qdrant import QdrantMigrationSource
+from .weaviate import WeaviateMigrationSource
+from .chroma import ChromaMigrationSource
+
+__all__ = [
+    'PineconeMigrationSource',
+    'QdrantMigrationSource',
+    'WeaviateMigrationSource',
+    'ChromaMigrationSource'
+]
ate/memory/migrate/sources/chroma.py
ADDED
@@ -0,0 +1,170 @@
+"""Chroma migration source implementation."""
+from typing import Optional, List, Tuple
+
+try:
+    import chromadb
+except ImportError:
+    # Create a simple mock structure for testing
+    class ChromaDBMock:
+        PersistentClient = None
+        HttpClient = None
+        Client = None
+
+    chromadb = ChromaDBMock()
+
+from ..base import MigrationSource, VectorRecord, MigrationEstimate
+
+
+class ChromaMigrationSource(MigrationSource):
+    """Migration source for Chroma vector database."""
+
+    def __init__(self, path: Optional[str] = None, host: Optional[str] = None, port: Optional[int] = None,
+                 collection_name: str = "default", api_key: Optional[str] = None):
+        """Initialize Chroma migration source.
+
+        Args:
+            path: Path to persistent Chroma database (for PersistentClient)
+            host: Host for Chroma server (for HttpClient)
+            port: Port for Chroma server (for HttpClient)
+            collection_name: Name of the collection to migrate from
+            api_key: Optional API key for authentication
+        """
+        self.path = path
+        self.host = host
+        self.port = port
+        self.collection_name = collection_name
+        self.api_key = api_key
+        self.client = None
+        self.collection = None
+
+    @property
+    def source_type(self) -> str:
+        """Return the source type."""
+        return "chroma"
+
+    @property
+    def source_name(self) -> str:
+        """Return the source name."""
+        return self.collection_name
+
+    def connect(self) -> None:
+        """Connect to Chroma."""
+        if chromadb is None:
+            raise ImportError("chromadb library is required for Chroma migration")
+
+        # Choose client type based on configuration
+        if self.path:
+            # Use persistent client
+            self.client = chromadb.PersistentClient(path=self.path)
+        elif self.host and self.port:
+            # Use HTTP client
+            settings = {}
+            if self.api_key:
+                settings['chroma_api_impl'] = 'chromadb.api.fastapi.FastAPI'
+                settings['chroma_server_auth_credentials'] = self.api_key
+
+            self.client = chromadb.HttpClient(host=self.host, port=self.port, settings=settings)
+        else:
+            # Use in-memory client
+            self.client = chromadb.Client()
+
+        # Get or create collection
+        self.collection = self.client.get_collection(name=self.collection_name)
+
+    def estimate(self) -> MigrationEstimate:
+        """Estimate migration size and time."""
+        if not self.collection:
+            raise RuntimeError("Must call connect() first")
+
+        # Get collection count
+        total_vectors = self.collection.count()
+
+        # Peek at one record to get dimensions
+        dimensions = 768 # Default assumption
+        if total_vectors > 0:
+            try:
+                peek_result = self.collection.peek(limit=1)
+                if peek_result.get('embeddings') and len(peek_result['embeddings']) > 0:
+                    dimensions = len(peek_result['embeddings'][0])
+            except Exception:
+                pass # Use default
+
+        # Rough estimates
+        bytes_per_vector = dimensions * 4 + 1024 # 4 bytes per float + metadata overhead
+        estimated_mv2_bytes = total_vectors * bytes_per_vector
+        estimated_seconds = total_vectors / 1200.0 # Rough estimate of 1200 vectors/second
+
+        return MigrationEstimate(
+            total_vectors=total_vectors,
+            dimensions=dimensions,
+            estimated_mv2_bytes=estimated_mv2_bytes,
+            estimated_seconds=estimated_seconds
+        )
+
+    def fetch_batch(self, batch_size: int = 10000, cursor: Optional[str] = None) -> Tuple[List[VectorRecord], Optional[str]]:
+        """Fetch a batch of records from Chroma."""
+        if not self.collection:
+            raise RuntimeError("Must call connect() first")
+
+        # Parse cursor as offset if provided
+        offset = 0
+        if cursor:
+            try:
+                offset = int(cursor)
+            except (ValueError, TypeError):
+                offset = 0
+
+        # Get batch of records
+        result = self.collection.get(
+            limit=batch_size,
+            offset=offset,
+            include=['embeddings', 'metadatas', 'documents']
+        )
+
+        ids = result.get('ids', [])
+        embeddings = result.get('embeddings', [])
+        metadatas = result.get('metadatas', [])
+        documents = result.get('documents', [])
+
+        records = []
+        for i, record_id in enumerate(ids):
+            embedding = embeddings[i] if i < len(embeddings) else []
+            metadata = metadatas[i] if i < len(metadatas) else {}
+            document = documents[i] if i < len(documents) else None
+
+            # Extract text from document or metadata
+            text = document
+            if not text and metadata:
+                text = metadata.get('text') or metadata.get('content')
+
+            # Create clean metadata without text fields
+            clean_metadata = {}
+            if metadata:
+                clean_metadata = {k: v for k, v in metadata.items() if k not in ('text', 'content')}
+
+            record = VectorRecord(
+                id=str(record_id),
+                vector=embedding,
+                text=text,
+                metadata=clean_metadata
+            )
+            records.append(record)
+
+        # Determine if there are more results
+        # If we got any results and this is the first page, assume there might be more
+        # If we got fewer results than requested, we're at the end
+        if len(ids) == batch_size:
+            next_cursor = str(offset + batch_size)
+        elif len(ids) > 0 and offset == 0:
+            # First batch with some results - optimistically assume there might be more
+            next_cursor = str(offset + batch_size)
+        else:
+            next_cursor = None
+
+        return records, next_cursor
+
+    def close(self) -> None:
+        """Close the connection and clean up resources."""
+        # Chroma doesn't require explicit cleanup
+        self.client = None
+        self.collection = None
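And a short sketch of using the Chroma source above directly, without the pipeline, for example to inspect a collection before migrating; the path and collection name are hypothetical:

from ate.memory.migrate import ChromaMigrationSource

src = ChromaMigrationSource(path="./chroma_db", collection_name="docs")
src.connect()
try:
    est = src.estimate()
    print(f"~{est.total_vectors} vectors of {est.dimensions} dims, "
          f"~{est.estimated_mv2_bytes} bytes as .mv2")

    cursor = None
    while True:
        records, cursor = src.fetch_batch(batch_size=500, cursor=cursor)
        if not records:
            break
        for rec in records:
            print(rec.id, len(rec.vector), (rec.text or "")[:40])
        if cursor is None:
            break
finally:
    src.close()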