foodforthought-cli 0.2.7__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ate/__init__.py +6 -0
- ate/__main__.py +16 -0
- ate/auth/__init__.py +1 -0
- ate/auth/device_flow.py +141 -0
- ate/auth/token_store.py +96 -0
- ate/behaviors/__init__.py +100 -0
- ate/behaviors/approach.py +399 -0
- ate/behaviors/common.py +686 -0
- ate/behaviors/tree.py +454 -0
- ate/cli.py +855 -3995
- ate/client.py +90 -0
- ate/commands/__init__.py +168 -0
- ate/commands/auth.py +389 -0
- ate/commands/bridge.py +448 -0
- ate/commands/data.py +185 -0
- ate/commands/deps.py +111 -0
- ate/commands/generate.py +384 -0
- ate/commands/memory.py +907 -0
- ate/commands/parts.py +166 -0
- ate/commands/primitive.py +399 -0
- ate/commands/protocol.py +288 -0
- ate/commands/recording.py +524 -0
- ate/commands/repo.py +154 -0
- ate/commands/simulation.py +291 -0
- ate/commands/skill.py +303 -0
- ate/commands/skills.py +487 -0
- ate/commands/team.py +147 -0
- ate/commands/workflow.py +271 -0
- ate/detection/__init__.py +38 -0
- ate/detection/base.py +142 -0
- ate/detection/color_detector.py +399 -0
- ate/detection/trash_detector.py +322 -0
- ate/drivers/__init__.py +39 -0
- ate/drivers/ble_transport.py +405 -0
- ate/drivers/mechdog.py +942 -0
- ate/drivers/wifi_camera.py +477 -0
- ate/interfaces/__init__.py +187 -0
- ate/interfaces/base.py +273 -0
- ate/interfaces/body.py +267 -0
- ate/interfaces/detection.py +282 -0
- ate/interfaces/locomotion.py +422 -0
- ate/interfaces/manipulation.py +408 -0
- ate/interfaces/navigation.py +389 -0
- ate/interfaces/perception.py +362 -0
- ate/interfaces/sensors.py +247 -0
- ate/interfaces/types.py +371 -0
- ate/llm_proxy.py +239 -0
- ate/mcp_server.py +387 -0
- ate/memory/__init__.py +35 -0
- ate/memory/cloud.py +244 -0
- ate/memory/context.py +269 -0
- ate/memory/embeddings.py +184 -0
- ate/memory/export.py +26 -0
- ate/memory/merge.py +146 -0
- ate/memory/migrate/__init__.py +34 -0
- ate/memory/migrate/base.py +89 -0
- ate/memory/migrate/pipeline.py +189 -0
- ate/memory/migrate/sources/__init__.py +13 -0
- ate/memory/migrate/sources/chroma.py +170 -0
- ate/memory/migrate/sources/pinecone.py +120 -0
- ate/memory/migrate/sources/qdrant.py +110 -0
- ate/memory/migrate/sources/weaviate.py +160 -0
- ate/memory/reranker.py +353 -0
- ate/memory/search.py +26 -0
- ate/memory/store.py +548 -0
- ate/recording/__init__.py +83 -0
- ate/recording/demonstration.py +378 -0
- ate/recording/session.py +415 -0
- ate/recording/upload.py +304 -0
- ate/recording/visual.py +416 -0
- ate/recording/wrapper.py +95 -0
- ate/robot/__init__.py +221 -0
- ate/robot/agentic_servo.py +856 -0
- ate/robot/behaviors.py +493 -0
- ate/robot/ble_capture.py +1000 -0
- ate/robot/ble_enumerate.py +506 -0
- ate/robot/calibration.py +668 -0
- ate/robot/calibration_state.py +388 -0
- ate/robot/commands.py +3735 -0
- ate/robot/direction_calibration.py +554 -0
- ate/robot/discovery.py +441 -0
- ate/robot/introspection.py +330 -0
- ate/robot/llm_system_id.py +654 -0
- ate/robot/locomotion_calibration.py +508 -0
- ate/robot/manager.py +270 -0
- ate/robot/marker_generator.py +611 -0
- ate/robot/perception.py +502 -0
- ate/robot/primitives.py +614 -0
- ate/robot/profiles.py +281 -0
- ate/robot/registry.py +322 -0
- ate/robot/servo_mapper.py +1153 -0
- ate/robot/skill_upload.py +675 -0
- ate/robot/target_calibration.py +500 -0
- ate/robot/teach.py +515 -0
- ate/robot/types.py +242 -0
- ate/robot/visual_labeler.py +1048 -0
- ate/robot/visual_servo_loop.py +494 -0
- ate/robot/visual_servoing.py +570 -0
- ate/robot/visual_system_id.py +906 -0
- ate/transports/__init__.py +121 -0
- ate/transports/base.py +394 -0
- ate/transports/ble.py +405 -0
- ate/transports/hybrid.py +444 -0
- ate/transports/serial.py +345 -0
- ate/urdf/__init__.py +30 -0
- ate/urdf/capture.py +582 -0
- ate/urdf/cloud.py +491 -0
- ate/urdf/collision.py +271 -0
- ate/urdf/commands.py +708 -0
- ate/urdf/depth.py +360 -0
- ate/urdf/inertial.py +312 -0
- ate/urdf/kinematics.py +330 -0
- ate/urdf/lifting.py +415 -0
- ate/urdf/meshing.py +300 -0
- ate/urdf/models/__init__.py +110 -0
- ate/urdf/models/depth_anything.py +253 -0
- ate/urdf/models/sam2.py +324 -0
- ate/urdf/motion_analysis.py +396 -0
- ate/urdf/pipeline.py +468 -0
- ate/urdf/scale.py +256 -0
- ate/urdf/scan_session.py +411 -0
- ate/urdf/segmentation.py +299 -0
- ate/urdf/synthesis.py +319 -0
- ate/urdf/topology.py +336 -0
- ate/urdf/validation.py +371 -0
- {foodforthought_cli-0.2.7.dist-info → foodforthought_cli-0.3.0.dist-info}/METADATA +9 -1
- foodforthought_cli-0.3.0.dist-info/RECORD +166 -0
- {foodforthought_cli-0.2.7.dist-info → foodforthought_cli-0.3.0.dist-info}/WHEEL +1 -1
- foodforthought_cli-0.2.7.dist-info/RECORD +0 -44
- {foodforthought_cli-0.2.7.dist-info → foodforthought_cli-0.3.0.dist-info}/entry_points.txt +0 -0
- {foodforthought_cli-0.2.7.dist-info → foodforthought_cli-0.3.0.dist-info}/top_level.txt +0 -0

ate/memory/migrate/base.py
@@ -0,0 +1,89 @@
+"""Base classes and data structures for migration."""
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Optional, List, Dict, Any, Tuple
+
+
+@dataclass
+class VectorRecord:
+    """A vector record with metadata."""
+    id: str
+    vector: List[float]
+    text: Optional[str] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class MigrationEstimate:
+    """Estimation of migration resources and time."""
+    total_vectors: int
+    dimensions: int
+    estimated_mv2_bytes: int
+    estimated_seconds: float
+    source_size_bytes: Optional[int] = None
+    compression_ratio: Optional[float] = None
+
+
+@dataclass
+class MigrationResult:
+    """Result of a migration operation."""
+    source_type: str
+    source_name: str
+    output_path: str
+    total_migrated: int
+    total_skipped: int
+    duration_seconds: float
+    output_size_bytes: int
+    compression_ratio: Optional[float] = None
+    errors: List[str] = field(default_factory=list)
+
+
+@dataclass
+class MigrationCheckpoint:
+    """Migration checkpoint for resuming."""
+    source_type: str
+    source_name: str
+    output_path: str
+    last_cursor: Optional[str]
+    records_completed: int
+    started_at: str
+
+
+class MigrationSource(ABC):
+    """Abstract base class for migration sources."""
+
+    @property
+    @abstractmethod
+    def source_type(self) -> str:
+        """Return the source type (e.g., 'pinecone', 'qdrant')."""
+        pass
+
+    @property
+    @abstractmethod
+    def source_name(self) -> str:
+        """Return the source name (e.g., index name, collection name)."""
+        pass
+
+    @abstractmethod
+    def connect(self) -> None:
+        """Connect to the source."""
+        pass
+
+    @abstractmethod
+    def estimate(self) -> MigrationEstimate:
+        """Estimate migration size and time."""
+        pass
+
+    @abstractmethod
+    def fetch_batch(self, batch_size: int = 10000, cursor: Optional[str] = None) -> Tuple[List[VectorRecord], Optional[str]]:
+        """Fetch a batch of records.
+
+        Returns:
+            Tuple of (records, next_cursor). next_cursor is None if this is the last batch.
+        """
+        pass
+
+    @abstractmethod
+    def close(self) -> None:
+        """Close the connection and clean up resources."""
+        pass
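
To make the MigrationSource contract above concrete, here is a minimal sketch of a custom source backed by a local JSONL file. JsonlMigrationSource and its one-object-per-line layout are illustrative assumptions, not part of the package; only the MigrationSource, VectorRecord, and MigrationEstimate interfaces come from the hunk above, and the sizing heuristic mirrors the one the bundled sources use.

import json
from typing import List, Optional, Tuple

from ate.memory.migrate.base import MigrationSource, VectorRecord, MigrationEstimate


class JsonlMigrationSource(MigrationSource):
    """Illustrative source reading one {"id", "vector", "text", "metadata"} object per line."""

    def __init__(self, path: str):
        self.path = path
        self._records: List[VectorRecord] = []

    @property
    def source_type(self) -> str:
        return "jsonl"

    @property
    def source_name(self) -> str:
        return self.path

    def connect(self) -> None:
        # Load everything up front; fine for a sketch, not for huge files.
        with open(self.path) as f:
            self._records = [
                VectorRecord(
                    id=str(obj["id"]),
                    vector=obj["vector"],
                    text=obj.get("text"),
                    metadata=obj.get("metadata", {}),
                )
                for obj in (json.loads(line) for line in f if line.strip())
            ]

    def estimate(self) -> MigrationEstimate:
        dims = len(self._records[0].vector) if self._records else 0
        return MigrationEstimate(
            total_vectors=len(self._records),
            dimensions=dims,
            estimated_mv2_bytes=len(self._records) * (dims * 4 + 1024),
            estimated_seconds=len(self._records) / 1000.0,
        )

    def fetch_batch(self, batch_size: int = 10000,
                    cursor: Optional[str] = None) -> Tuple[List[VectorRecord], Optional[str]]:
        # Stringified offset as the cursor, like the Chroma source below.
        offset = int(cursor) if cursor else 0
        batch = self._records[offset:offset + batch_size]
        done = offset + batch_size >= len(self._records)
        return batch, None if done else str(offset + batch_size)

    def close(self) -> None:
        self._records = []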

ate/memory/migrate/pipeline.py
@@ -0,0 +1,189 @@
+"""Migration pipeline for orchestrating the migration process."""
+import json
+import os
+import time
+from typing import Optional, Callable
+
+from .base import MigrationSource, MigrationResult, MigrationCheckpoint
+
+
+class MigrationPipeline:
+    """Pipeline for orchestrating vector database migrations."""
+
+    def __init__(self, source: MigrationSource, output_path: str,
+                 batch_size: int = 10000, checkpoint_path: Optional[str] = None):
+        """Initialize the migration pipeline.
+
+        Args:
+            source: The migration source to read from
+            output_path: Path to write the .mv2 output file
+            batch_size: Number of records to fetch per batch
+            checkpoint_path: Optional path for checkpoint file
+        """
+        self.source = source
+        self.output_path = output_path
+        self.batch_size = batch_size
+        self.checkpoint_path = checkpoint_path
+        self.progress_callbacks = []
+
+    def on_progress(self, callback: Callable[[int, int], None]) -> None:
+        """Register a progress callback function.
+
+        Args:
+            callback: Function called with (records_done, total_estimate)
+        """
+        self.progress_callbacks.append(callback)
+
+    def run(self, dry_run: bool = False) -> MigrationResult:
+        """Run the migration.
+
+        Args:
+            dry_run: If True, only estimate without migrating
+
+        Returns:
+            MigrationResult with operation details
+        """
+        start_time = time.time()
+
+        try:
+            self.source.connect()
+
+            # Get estimate
+            estimate = self.source.estimate()
+
+            if dry_run:
+                # For dry run, return early with estimate-based result
+                return MigrationResult(
+                    source_type=self.source.source_type,
+                    source_name=self.source.source_name,
+                    output_path=self.output_path,
+                    total_migrated=0,
+                    total_skipped=0,
+                    duration_seconds=time.time() - start_time,
+                    output_size_bytes=0
+                )
+
+            # Create output .mv2 using MemoryStore
+            parent_dir = os.path.dirname(self.output_path)
+            if parent_dir:
+                os.makedirs(parent_dir, exist_ok=True)
+
+            from ..store import MemoryStore
+            output_store = MemoryStore.create(self.output_path)
+
+            total_migrated = 0
+            total_skipped = 0
+            errors = []
+            cursor = None
+
+            try:
+                while True:
+                    try:
+                        records, next_cursor = self.source.fetch_batch(self.batch_size, cursor)
+
+                        if not records:
+                            break
+
+                        # Write records to .mv2 via MemoryStore
+                        for record in records:
+                            text = record.text or f"[vector-only record id={record.id}]"
+                            metadata = dict(record.metadata) if record.metadata else {}
+                            metadata['_source_id'] = record.id
+                            metadata['_source_type'] = self.source.source_type
+
+                            try:
+                                output_store.add(
+                                    text=text,
+                                    metadata=metadata
+                                )
+                                total_migrated += 1
+                            except Exception as e:
+                                total_skipped += 1
+                                errors.append(f"Record {record.id}: {str(e)}")
+
+                        # Update checkpoint
+                        if self.checkpoint_path:
+                            checkpoint = MigrationCheckpoint(
+                                source_type=self.source.source_type,
+                                source_name=self.source.source_name,
+                                output_path=self.output_path,
+                                last_cursor=cursor,
+                                records_completed=total_migrated,
+                                started_at=time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(start_time))
+                            )
+                            self._write_checkpoint(checkpoint)
+
+                        # Call progress callbacks
+                        for callback in self.progress_callbacks:
+                            callback(total_migrated, estimate.total_vectors)
+
+                        cursor = next_cursor
+                        if cursor is None:
+                            break
+
+                    except Exception as e:
+                        errors.append(str(e))
+                        break
+            finally:
+                output_store.close()
+
+            # Calculate output file size
+            output_size = os.path.getsize(self.output_path) if os.path.exists(self.output_path) else 0
+
+            return MigrationResult(
+                source_type=self.source.source_type,
+                source_name=self.source.source_name,
+                output_path=self.output_path,
+                total_migrated=total_migrated,
+                total_skipped=total_skipped,
+                duration_seconds=time.time() - start_time,
+                output_size_bytes=output_size,
+                errors=errors
+            )
+
+        except Exception as e:
+            return MigrationResult(
+                source_type=self.source.source_type,
+                source_name=self.source.source_name,
+                output_path=self.output_path,
+                total_migrated=0,
+                total_skipped=0,
+                duration_seconds=time.time() - start_time,
+                output_size_bytes=0,
+                errors=[str(e)]
+            )
+        finally:
+            try:
+                self.source.close()
+            except Exception:
+                pass
+
+    def resume(self) -> MigrationResult:
+        """Resume migration from checkpoint.
+
+        Raises:
+            NotImplementedError: Resume is not yet implemented.
+        """
+        raise NotImplementedError(
+            "Resume not yet implemented; use run() to restart migration"
+        )
+
+    def _write_checkpoint(self, checkpoint: MigrationCheckpoint) -> None:
+        """Write checkpoint to file."""
+        if not self.checkpoint_path:
+            return
+
+        checkpoint_data = {
+            'source_type': checkpoint.source_type,
+            'source_name': checkpoint.source_name,
+            'output_path': checkpoint.output_path,
+            'last_cursor': checkpoint.last_cursor,
+            'records_completed': checkpoint.records_completed,
+            'started_at': checkpoint.started_at
+        }
+
+        parent = os.path.dirname(self.checkpoint_path)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+        with open(self.checkpoint_path, 'w') as f:
+            json.dump(checkpoint_data, f)
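
Putting base.py and pipeline.py together, the sketch below wires a source into the pipeline with a progress callback. ChromaMigrationSource comes from a later hunk in this diff; the paths and collection name are placeholders, and actually running it requires the optional chromadb dependency plus the package's MemoryStore.

from ate.memory.migrate.pipeline import MigrationPipeline
from ate.memory.migrate.sources import ChromaMigrationSource

source = ChromaMigrationSource(path="./chroma_db", collection_name="docs")  # placeholders
pipeline = MigrationPipeline(
    source,
    output_path="./docs.mv2",
    batch_size=1000,
    checkpoint_path="./docs.checkpoint.json",
)
pipeline.on_progress(lambda done, total: print(f"{done}/{total} records"))

result = pipeline.run()
if result.errors:
    print(f"completed with {len(result.errors)} errors, e.g. {result.errors[0]}")
print(f"migrated {result.total_migrated} records to {result.output_path}")

Note that run() never raises for per-record or source failures; everything is collected into MigrationResult.errors, so callers should inspect that list rather than rely on exceptions.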

ate/memory/migrate/sources/__init__.py
@@ -0,0 +1,13 @@
+"""Migration sources for various vector databases."""
+
+from .pinecone import PineconeMigrationSource
+from .qdrant import QdrantMigrationSource
+from .weaviate import WeaviateMigrationSource
+from .chroma import ChromaMigrationSource
+
+__all__ = [
+    'PineconeMigrationSource',
+    'QdrantMigrationSource',
+    'WeaviateMigrationSource',
+    'ChromaMigrationSource'
+]

ate/memory/migrate/sources/chroma.py
@@ -0,0 +1,170 @@
+"""Chroma migration source implementation."""
+from typing import Optional, List, Tuple
+
+try:
+    import chromadb
+except ImportError:
+    # Create a simple mock structure for testing
+    class ChromaDBMock:
+        PersistentClient = None
+        HttpClient = None
+        Client = None
+
+    chromadb = ChromaDBMock()
+
+from ..base import MigrationSource, VectorRecord, MigrationEstimate
+
+
+class ChromaMigrationSource(MigrationSource):
+    """Migration source for Chroma vector database."""
+
+    def __init__(self, path: Optional[str] = None, host: Optional[str] = None, port: Optional[int] = None,
+                 collection_name: str = "default", api_key: Optional[str] = None):
+        """Initialize Chroma migration source.
+
+        Args:
+            path: Path to persistent Chroma database (for PersistentClient)
+            host: Host for Chroma server (for HttpClient)
+            port: Port for Chroma server (for HttpClient)
+            collection_name: Name of the collection to migrate from
+            api_key: Optional API key for authentication
+        """
+        self.path = path
+        self.host = host
+        self.port = port
+        self.collection_name = collection_name
+        self.api_key = api_key
+        self.client = None
+        self.collection = None
+
+    @property
+    def source_type(self) -> str:
+        """Return the source type."""
+        return "chroma"
+
+    @property
+    def source_name(self) -> str:
+        """Return the source name."""
+        return self.collection_name
+
+    def connect(self) -> None:
+        """Connect to Chroma."""
+        if getattr(chromadb, 'Client', None) is None:
+            raise ImportError("chromadb library is required for Chroma migration")
+
+        # Choose client type based on configuration
+        if self.path:
+            # Use persistent client
+            self.client = chromadb.PersistentClient(path=self.path)
+        elif self.host and self.port:
+            # Use HTTP client
+            settings = {}
+            if self.api_key:
+                settings['chroma_api_impl'] = 'chromadb.api.fastapi.FastAPI'
+                settings['chroma_server_auth_credentials'] = self.api_key
+
+            self.client = chromadb.HttpClient(host=self.host, port=self.port, settings=settings)
+        else:
+            # Use in-memory client
+            self.client = chromadb.Client()
+
+        # Get the collection to migrate from
+        self.collection = self.client.get_collection(name=self.collection_name)
+
+    def estimate(self) -> MigrationEstimate:
+        """Estimate migration size and time."""
+        if not self.collection:
+            raise RuntimeError("Must call connect() first")
+
+        # Get collection count
+        total_vectors = self.collection.count()
+
+        # Peek at one record to get dimensions
+        dimensions = 768  # Default assumption
+        if total_vectors > 0:
+            try:
+                peek_result = self.collection.peek(limit=1)
+                if peek_result.get('embeddings') and len(peek_result['embeddings']) > 0:
+                    dimensions = len(peek_result['embeddings'][0])
+            except Exception:
+                pass  # Use default
+
+        # Rough estimates
+        bytes_per_vector = dimensions * 4 + 1024  # 4 bytes per float + metadata overhead
+        estimated_mv2_bytes = total_vectors * bytes_per_vector
+        estimated_seconds = total_vectors / 1200.0  # Rough estimate of 1200 vectors/second
+
+        return MigrationEstimate(
+            total_vectors=total_vectors,
+            dimensions=dimensions,
+            estimated_mv2_bytes=estimated_mv2_bytes,
+            estimated_seconds=estimated_seconds
+        )
+
+    def fetch_batch(self, batch_size: int = 10000, cursor: Optional[str] = None) -> Tuple[List[VectorRecord], Optional[str]]:
+        """Fetch a batch of records from Chroma."""
+        if not self.collection:
+            raise RuntimeError("Must call connect() first")
+
+        # Parse cursor as offset if provided
+        offset = 0
+        if cursor:
+            try:
+                offset = int(cursor)
+            except (ValueError, TypeError):
+                offset = 0
+
+        # Get batch of records
+        result = self.collection.get(
+            limit=batch_size,
+            offset=offset,
+            include=['embeddings', 'metadatas', 'documents']
+        )
+
+        ids = result.get('ids', [])
+        embeddings = result.get('embeddings', [])
+        metadatas = result.get('metadatas', [])
+        documents = result.get('documents', [])
+
+        records = []
+        for i, record_id in enumerate(ids):
+            embedding = embeddings[i] if i < len(embeddings) else []
+            metadata = metadatas[i] if i < len(metadatas) else {}
+            document = documents[i] if i < len(documents) else None
+
+            # Extract text from document or metadata
+            text = document
+            if not text and metadata:
+                text = metadata.get('text') or metadata.get('content')
+
+            # Create clean metadata without text fields
+            clean_metadata = {}
+            if metadata:
+                clean_metadata = {k: v for k, v in metadata.items() if k not in ('text', 'content')}
+
+            record = VectorRecord(
+                id=str(record_id),
+                vector=embedding,
+                text=text,
+                metadata=clean_metadata
+            )
+            records.append(record)
+
+        # Determine whether there are more results:
+        # a full batch means there are likely more; a short first page is
+        # still followed up once before we conclude we're at the end
+        if len(ids) == batch_size:
+            next_cursor = str(offset + batch_size)
+        elif len(ids) > 0 and offset == 0:
+            # First batch with some results - optimistically assume there might be more
+            next_cursor = str(offset + batch_size)
+        else:
+            next_cursor = None
+
+        return records, next_cursor
+
+    def close(self) -> None:
+        """Close the connection and clean up resources."""
+        # Chroma doesn't require explicit cleanup
+        self.client = None
+        self.collection = None
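
The source can also be driven by hand when you want the records themselves rather than an .mv2 file. A minimal sketch, assuming the same placeholder path and collection name as before:

from ate.memory.migrate.sources import ChromaMigrationSource

src = ChromaMigrationSource(path="./chroma_db", collection_name="docs")
src.connect()
try:
    cursor = None
    while True:
        batch, cursor = src.fetch_batch(batch_size=500, cursor=cursor)
        for rec in batch:
            print(rec.id, len(rec.vector), (rec.text or "")[:40])
        if cursor is None:
            break
finally:
    src.close()

Because a short first page still returns a cursor, this loop may make one final empty fetch before fetch_batch hands back a None cursor and the loop exits; the pipeline's own loop tolerates the same extra round trip.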

ate/memory/migrate/sources/pinecone.py
@@ -0,0 +1,120 @@
+"""Pinecone migration source implementation."""
+from typing import Optional, List, Tuple
+
+try:
+    import pinecone
+except ImportError:
+    pinecone = None
+
+from ..base import MigrationSource, VectorRecord, MigrationEstimate
+
+
+class PineconeMigrationSource(MigrationSource):
+    """Migration source for Pinecone vector database."""
+
+    def __init__(self, api_key: str, index_name: str, environment: str, namespace: Optional[str] = None):
+        """Initialize Pinecone migration source.
+
+        Args:
+            api_key: Pinecone API key
+            index_name: Name of the Pinecone index
+            environment: Pinecone environment
+            namespace: Optional namespace to migrate from
+        """
+        self.api_key = api_key
+        self.index_name = index_name
+        self.environment = environment
+        self.namespace = namespace
+        self.index = None
+
+    @property
+    def source_type(self) -> str:
+        """Return the source type."""
+        return "pinecone"
+
+    @property
+    def source_name(self) -> str:
+        """Return the source name."""
+        return self.index_name
+
+    def connect(self) -> None:
+        """Connect to Pinecone."""
+        if pinecone is None:
+            raise ImportError("pinecone library is required for Pinecone migration")
+
+        pinecone.init(api_key=self.api_key, environment=self.environment)
+        self.index = pinecone.Index(self.index_name)
+
+    def estimate(self) -> MigrationEstimate:
+        """Estimate migration size and time."""
+        if not self.index:
+            raise RuntimeError("Must call connect() first")
+
+        stats = self.index.describe_index_stats()
+
+        # If namespace is specified, use namespace count, otherwise use total
+        if self.namespace and 'namespaces' in stats and self.namespace in stats['namespaces']:
+            total_vectors = stats['namespaces'][self.namespace]['vector_count']
+        else:
+            total_vectors = stats.get('total_vector_count', 0)
+
+        dimensions = stats.get('dimension', 768)
+
+        # Rough estimates
+        bytes_per_vector = dimensions * 4 + 1024  # 4 bytes per float + metadata overhead
+        estimated_mv2_bytes = total_vectors * bytes_per_vector
+        estimated_seconds = total_vectors / 1000.0  # Rough estimate of 1000 vectors/second
+
+        return MigrationEstimate(
+            total_vectors=total_vectors,
+            dimensions=dimensions,
+            estimated_mv2_bytes=estimated_mv2_bytes,
+            estimated_seconds=estimated_seconds
+        )
+
+    def fetch_batch(self, batch_size: int = 10000, cursor: Optional[str] = None) -> Tuple[List[VectorRecord], Optional[str]]:
+        """Fetch a batch of records from Pinecone."""
+        if not self.index:
+            raise RuntimeError("Must call connect() first")
+
+        # List vector IDs with pagination
+        list_response = self.index.list(
+            namespace=self.namespace,
+            limit=batch_size,
+            pagination_token=cursor
+        )
+
+        vector_ids = [vec['id'] for vec in list_response.get('vectors', [])]
+
+        if not vector_ids:
+            return [], None
+
+        # Fetch the actual vectors
+        fetch_response = self.index.fetch(vector_ids, namespace=self.namespace)
+
+        records = []
+        for vector_id, vector_data in fetch_response.get('vectors', {}).items():
+            # Extract text from metadata if present
+            metadata = vector_data.get('metadata', {})
+            text = metadata.get('text')
+
+            # Create clean metadata without text (since text has its own field)
+            clean_metadata = {k: v for k, v in metadata.items() if k != 'text'}
+
+            record = VectorRecord(
+                id=vector_data['id'],
+                vector=vector_data['values'],
+                text=text,
+                metadata=clean_metadata
+            )
+            records.append(record)
+
+        # Get next cursor from pagination
+        next_cursor = list_response.get('pagination', {}).get('next')
+
+        return records, next_cursor
+
+    def close(self) -> None:
+        """Close the connection and clean up resources."""
+        # Pinecone doesn't require explicit cleanup
+        self.index = None
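
A sketch of using this source for estimation only. The environment argument and pinecone.init call follow the classic pinecone-client API, so this may not run against newer versions of that library; the key, index name, and environment below are placeholders. Also note that MigrationPipeline.run(dry_run=True) returns a zeroed MigrationResult rather than the estimate itself, so to see the numbers call estimate() directly:

from ate.memory.migrate.sources import PineconeMigrationSource

src = PineconeMigrationSource(
    api_key="pc-...",            # placeholder
    index_name="my-index",       # placeholder
    environment="us-west1-gcp",  # placeholder
)
src.connect()
try:
    est = src.estimate()
    print(f"{est.total_vectors} vectors x {est.dimensions} dims, "
          f"~{est.estimated_mv2_bytes / 1e6:.1f} MB, ~{est.estimated_seconds:.0f}s")
finally:
    src.close()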

ate/memory/migrate/sources/qdrant.py
@@ -0,0 +1,110 @@
+"""Qdrant migration source implementation."""
+from typing import Optional, List, Tuple
+
+try:
+    from qdrant_client import QdrantClient
+except ImportError:
+    QdrantClient = None
+
+from ..base import MigrationSource, VectorRecord, MigrationEstimate
+
+
+class QdrantMigrationSource(MigrationSource):
+    """Migration source for Qdrant vector database."""
+
+    def __init__(self, url: str, collection_name: str, api_key: Optional[str] = None):
+        """Initialize Qdrant migration source.
+
+        Args:
+            url: Qdrant server URL
+            collection_name: Name of the collection to migrate from
+            api_key: Optional API key for authentication
+        """
+        self.url = url
+        self.collection_name = collection_name
+        self.api_key = api_key
+        self.client = None
+
+    @property
+    def source_type(self) -> str:
+        """Return the source type."""
+        return "qdrant"
+
+    @property
+    def source_name(self) -> str:
+        """Return the source name."""
+        return self.collection_name
+
+    def connect(self) -> None:
+        """Connect to Qdrant."""
+        if QdrantClient is None:
+            raise ImportError("qdrant-client library is required for Qdrant migration")
+
+        self.client = QdrantClient(url=self.url, api_key=self.api_key)
+
+    def estimate(self) -> MigrationEstimate:
+        """Estimate migration size and time."""
+        if not self.client:
+            raise RuntimeError("Must call connect() first")
+
+        collection_info = self.client.get_collection(self.collection_name)
+
+        total_vectors = collection_info.points_count
+        dimensions = collection_info.config.params.vectors.size
+
+        # Rough estimates
+        bytes_per_vector = dimensions * 4 + 1024  # 4 bytes per float + metadata overhead
+        estimated_mv2_bytes = total_vectors * bytes_per_vector
+        estimated_seconds = total_vectors / 1000.0  # Rough estimate of 1000 vectors/second
+
+        return MigrationEstimate(
+            total_vectors=total_vectors,
+            dimensions=dimensions,
+            estimated_mv2_bytes=estimated_mv2_bytes,
+            estimated_seconds=estimated_seconds
+        )
+
+    def fetch_batch(self, batch_size: int = 10000, cursor: Optional[str] = None) -> Tuple[List[VectorRecord], Optional[str]]:
+        """Fetch a batch of records from Qdrant."""
+        if not self.client:
+            raise RuntimeError("Must call connect() first")
+
+        # Use cursor directly as offset (Qdrant handles different offset types)
+        offset = cursor
+
+        # Scroll through points
+        points, next_page_offset = self.client.scroll(
+            collection_name=self.collection_name,
+            limit=batch_size,
+            offset=offset,
+            with_payload=True,
+            with_vectors=True
+        )
+
+        records = []
+        for point in points:
+            # Extract text from payload if present
+            payload = point.payload or {}
+            text = payload.get('text')
+
+            # Create clean metadata without text
+            clean_metadata = {k: v for k, v in payload.items() if k != 'text'}
+
+            record = VectorRecord(
+                id=str(point.id),
+                vector=point.vector,
+                text=text,
+                metadata=clean_metadata
+            )
+            records.append(record)
+
+        # Convert next_page_offset to cursor string
+        next_cursor = str(next_page_offset) if next_page_offset is not None else None
+
+        return records, next_cursor
+
+    def close(self) -> None:
+        """Close the connection and clean up resources."""
+        if self.client:
+            self.client.close()
+            self.client = None
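
An end-to-end sketch for Qdrant with a checkpoint file; the URL, collection name, and paths are placeholders, and running it requires the optional qdrant-client dependency:

from ate.memory.migrate.pipeline import MigrationPipeline
from ate.memory.migrate.sources import QdrantMigrationSource

src = QdrantMigrationSource(
    url="http://localhost:6333",   # placeholder
    collection_name="docs",        # placeholder
)
pipeline = MigrationPipeline(
    src,
    output_path="./docs.mv2",
    checkpoint_path="./docs.checkpoint.json",
)
result = pipeline.run()
print(f"migrated={result.total_migrated} skipped={result.total_skipped} "
      f"size={result.output_size_bytes} bytes in {result.duration_seconds:.1f}s")

The checkpoint JSON mirrors the MigrationCheckpoint fields from base.py, but since resume() currently raises NotImplementedError, the file is informational for now and a failed run is simply restarted with run().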