foodforthought-cli 0.2.8-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. ate/__init__.py +6 -0
  2. ate/__main__.py +16 -0
  3. ate/auth/__init__.py +1 -0
  4. ate/auth/device_flow.py +141 -0
  5. ate/auth/token_store.py +96 -0
  6. ate/behaviors/__init__.py +12 -0
  7. ate/behaviors/approach.py +399 -0
  8. ate/cli.py +855 -4551
  9. ate/client.py +90 -0
  10. ate/commands/__init__.py +168 -0
  11. ate/commands/auth.py +389 -0
  12. ate/commands/bridge.py +448 -0
  13. ate/commands/data.py +185 -0
  14. ate/commands/deps.py +111 -0
  15. ate/commands/generate.py +384 -0
  16. ate/commands/memory.py +907 -0
  17. ate/commands/parts.py +166 -0
  18. ate/commands/primitive.py +399 -0
  19. ate/commands/protocol.py +288 -0
  20. ate/commands/recording.py +524 -0
  21. ate/commands/repo.py +154 -0
  22. ate/commands/simulation.py +291 -0
  23. ate/commands/skill.py +303 -0
  24. ate/commands/skills.py +487 -0
  25. ate/commands/team.py +147 -0
  26. ate/commands/workflow.py +271 -0
  27. ate/detection/__init__.py +38 -0
  28. ate/detection/base.py +142 -0
  29. ate/detection/color_detector.py +399 -0
  30. ate/detection/trash_detector.py +322 -0
  31. ate/drivers/__init__.py +18 -6
  32. ate/drivers/ble_transport.py +405 -0
  33. ate/drivers/mechdog.py +360 -24
  34. ate/drivers/wifi_camera.py +477 -0
  35. ate/interfaces/__init__.py +16 -0
  36. ate/interfaces/base.py +2 -0
  37. ate/interfaces/sensors.py +247 -0
  38. ate/llm_proxy.py +239 -0
  39. ate/memory/__init__.py +35 -0
  40. ate/memory/cloud.py +244 -0
  41. ate/memory/context.py +269 -0
  42. ate/memory/embeddings.py +184 -0
  43. ate/memory/export.py +26 -0
  44. ate/memory/merge.py +146 -0
  45. ate/memory/migrate/__init__.py +34 -0
  46. ate/memory/migrate/base.py +89 -0
  47. ate/memory/migrate/pipeline.py +189 -0
  48. ate/memory/migrate/sources/__init__.py +13 -0
  49. ate/memory/migrate/sources/chroma.py +170 -0
  50. ate/memory/migrate/sources/pinecone.py +120 -0
  51. ate/memory/migrate/sources/qdrant.py +110 -0
  52. ate/memory/migrate/sources/weaviate.py +160 -0
  53. ate/memory/reranker.py +353 -0
  54. ate/memory/search.py +26 -0
  55. ate/memory/store.py +548 -0
  56. ate/recording/__init__.py +42 -3
  57. ate/recording/session.py +12 -2
  58. ate/recording/visual.py +416 -0
  59. ate/robot/__init__.py +142 -0
  60. ate/robot/agentic_servo.py +856 -0
  61. ate/robot/behaviors.py +493 -0
  62. ate/robot/ble_capture.py +1000 -0
  63. ate/robot/ble_enumerate.py +506 -0
  64. ate/robot/calibration.py +88 -3
  65. ate/robot/calibration_state.py +388 -0
  66. ate/robot/commands.py +143 -11
  67. ate/robot/direction_calibration.py +554 -0
  68. ate/robot/discovery.py +104 -2
  69. ate/robot/llm_system_id.py +654 -0
  70. ate/robot/locomotion_calibration.py +508 -0
  71. ate/robot/marker_generator.py +611 -0
  72. ate/robot/perception.py +502 -0
  73. ate/robot/primitives.py +614 -0
  74. ate/robot/profiles.py +6 -0
  75. ate/robot/registry.py +5 -2
  76. ate/robot/servo_mapper.py +1153 -0
  77. ate/robot/skill_upload.py +285 -3
  78. ate/robot/target_calibration.py +500 -0
  79. ate/robot/teach.py +515 -0
  80. ate/robot/types.py +242 -0
  81. ate/robot/visual_labeler.py +9 -0
  82. ate/robot/visual_servo_loop.py +494 -0
  83. ate/robot/visual_servoing.py +570 -0
  84. ate/robot/visual_system_id.py +906 -0
  85. ate/transports/__init__.py +121 -0
  86. ate/transports/base.py +394 -0
  87. ate/transports/ble.py +405 -0
  88. ate/transports/hybrid.py +444 -0
  89. ate/transports/serial.py +345 -0
  90. ate/urdf/__init__.py +30 -0
  91. ate/urdf/capture.py +582 -0
  92. ate/urdf/cloud.py +491 -0
  93. ate/urdf/collision.py +271 -0
  94. ate/urdf/commands.py +708 -0
  95. ate/urdf/depth.py +360 -0
  96. ate/urdf/inertial.py +312 -0
  97. ate/urdf/kinematics.py +330 -0
  98. ate/urdf/lifting.py +415 -0
  99. ate/urdf/meshing.py +300 -0
  100. ate/urdf/models/__init__.py +110 -0
  101. ate/urdf/models/depth_anything.py +253 -0
  102. ate/urdf/models/sam2.py +324 -0
  103. ate/urdf/motion_analysis.py +396 -0
  104. ate/urdf/pipeline.py +468 -0
  105. ate/urdf/scale.py +256 -0
  106. ate/urdf/scan_session.py +411 -0
  107. ate/urdf/segmentation.py +299 -0
  108. ate/urdf/synthesis.py +319 -0
  109. ate/urdf/topology.py +336 -0
  110. ate/urdf/validation.py +371 -0
  111. {foodforthought_cli-0.2.8.dist-info → foodforthought_cli-0.3.0.dist-info}/METADATA +1 -1
  112. foodforthought_cli-0.3.0.dist-info/RECORD +166 -0
  113. {foodforthought_cli-0.2.8.dist-info → foodforthought_cli-0.3.0.dist-info}/WHEEL +1 -1
  114. foodforthought_cli-0.2.8.dist-info/RECORD +0 -73
  115. {foodforthought_cli-0.2.8.dist-info → foodforthought_cli-0.3.0.dist-info}/entry_points.txt +0 -0
  116. {foodforthought_cli-0.2.8.dist-info → foodforthought_cli-0.3.0.dist-info}/top_level.txt +0 -0
ate/memory/export.py ADDED
@@ -0,0 +1,26 @@
"""Export operations and info structures."""

from dataclasses import dataclass
from typing import Optional


@dataclass
class MemoryInfo:
    """Information about a memory store.

    Attributes:
        path: Path to the .mv2 file
        frame_count: Number of memory frames stored
        size_bytes: Total size in bytes
        has_lex_index: Whether lexical indexing is enabled
        has_vec_index: Whether vector indexing is enabled
        has_time_index: Whether time indexing is enabled
        created_at: ISO timestamp when created (optional)
    """
    path: str
    frame_count: int
    size_bytes: int
    has_lex_index: bool
    has_vec_index: bool
    has_time_index: bool
    created_at: Optional[str] = None
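
For orientation, a brief caller sketch of how these fields are typically consumed; it assumes MemoryStore.open() exposes the same info() accessor used elsewhere in this package, and the path is hypothetical:

from ate.memory.store import MemoryStore

store = MemoryStore.open("memories/session.mv2")  # hypothetical path
try:
    info = store.info()  # assumed to return a MemoryInfo for an opened store
    print(info.frame_count, info.size_bytes, info.has_vec_index)
finally:
    store.close()
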
ate/memory/merge.py ADDED
@@ -0,0 +1,146 @@
"""Memory merging operations."""

from typing import List
import json

from .store import MemoryStore
from .export import MemoryInfo


def merge_memories(source_paths: List[str], output_path: str, dedup: bool = True) -> MemoryInfo:
    """Merge multiple .mv2 files into a single output file.

    Args:
        source_paths: List of paths to source .mv2 files
        output_path: Path where merged .mv2 file will be created
        dedup: Whether to deduplicate identical content (default True)

    Returns:
        MemoryInfo about the merged output file
    """
    # Create the output memory store
    output_store = MemoryStore.create(output_path)

    seen_texts = set() if dedup else None
    all_items = []

    try:
        # Process each source file
        for source_path in source_paths:
            source_store = MemoryStore.open(source_path)

            try:
                # Try timeline-based iteration first (works with real memvid)
                items_from_source = []
                try:
                    timeline = source_store._mem.timeline()
                    if timeline:
                        for entry in timeline:
                            if isinstance(entry, dict):
                                uri = entry.get('uri', f"mv2://frames/{entry.get('frame_id', 0)}")
                                frame_data = source_store._mem.frame(uri)

                                title = None
                                tags = []
                                metadata = {}

                                # Get text from timeline preview (labels are just keywords)
                                text = entry.get('preview', '').split('\ntitle:')[0].split('\ntags:')[0].strip()

                                if isinstance(frame_data, dict):
                                    title = frame_data.get('title')
                                    tags = frame_data.get('tags', [])
                                    for key, value in frame_data.get('extra_metadata', {}).items():
                                        if key == 'extractous_metadata':
                                            continue
                                        try:
                                            if isinstance(value, str) and (value.startswith('{') or value.startswith('"')):
                                                metadata[key] = json.loads(value)
                                            else:
                                                metadata[key] = value
                                        except json.JSONDecodeError:
                                            metadata[key] = value

                                if text:
                                    items_from_source.append({
                                        'text': text,
                                        'title': title,
                                        'tags': tags,
                                        'metadata': metadata
                                    })
                except (AttributeError, TypeError):
                    pass  # Fall through to search-based approach

                # Fallback: use search (for mocked tests)
                if not items_from_source:
                    search_result = source_store.search("*", top_k=10000)

                    if hasattr(search_result, 'hits'):
                        search_results = search_result.hits
                    elif isinstance(search_result, list):
                        search_results = search_result
                    else:
                        search_results = []

                    for search_item in search_results:
                        text = getattr(search_item, 'snippet', getattr(search_item, 'text', ''))
                        title = getattr(search_item, 'title', None)
                        tags = getattr(search_item, 'tags', [])
                        frame_id = getattr(search_item, 'frame_id', 0)

                        metadata = {}
                        try:
                            frame_data = source_store._mem.frame(frame_id)
                            if hasattr(frame_data, 'metadata'):
                                metadata = frame_data.metadata or {}
                        except Exception:
                            try:
                                if hasattr(search_item, 'metadata'):
                                    metadata = search_item.metadata or {}
                            except Exception:
                                metadata = {}

                        items_from_source.append({
                            'text': text,
                            'title': title,
                            'tags': tags,
                            'metadata': metadata
                        })

                for item in items_from_source:
                    text = item['text']

                    if dedup and text in seen_texts:
                        continue

                    if dedup:
                        seen_texts.add(text)

                    all_items.append(item)

            finally:
                source_store.close()

        # Add all collected items to the output store
        output_store.add_batch(all_items)

        # Get info about the merged result
        info = output_store.info()

    finally:
        output_store.close()

    # Handle mocked info object vs real MemoryInfo
    if hasattr(info, 'frame_count') and not isinstance(info, MemoryInfo):
        # It's a mock, create actual MemoryInfo
        return MemoryInfo(
            path=info.path,
            frame_count=info.frame_count,
            size_bytes=info.size_bytes,
            has_lex_index=info.has_lex_index,
            has_vec_index=info.has_vec_index,
            has_time_index=info.has_time_index,
            created_at=getattr(info, 'created_at', None)
        )

    return info
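
A minimal usage sketch of merge_memories(); the file paths are hypothetical and the sources are assumed to be valid .mv2 stores:

from ate.memory.merge import merge_memories

info = merge_memories(
    source_paths=["runs/session_a.mv2", "runs/session_b.mv2"],  # hypothetical inputs
    output_path="runs/combined.mv2",
    dedup=True,  # drop items whose text already appeared in an earlier source
)
print(f"merged {info.frame_count} frames into {info.path} ({info.size_bytes} bytes)")
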
ate/memory/migrate/__init__.py ADDED
@@ -0,0 +1,34 @@
"""Migration module for vector database migrations."""

from .base import (
    VectorRecord,
    MigrationEstimate,
    MigrationResult,
    MigrationCheckpoint,
    MigrationSource
)
from .pipeline import MigrationPipeline
from .sources import (
    PineconeMigrationSource,
    QdrantMigrationSource,
    WeaviateMigrationSource,
    ChromaMigrationSource
)

__all__ = [
    # Base classes and data structures
    'VectorRecord',
    'MigrationEstimate',
    'MigrationResult',
    'MigrationCheckpoint',
    'MigrationSource',

    # Pipeline
    'MigrationPipeline',

    # Sources
    'PineconeMigrationSource',
    'QdrantMigrationSource',
    'WeaviateMigrationSource',
    'ChromaMigrationSource'
]
ate/memory/migrate/base.py ADDED
@@ -0,0 +1,89 @@
"""Base classes and data structures for migration."""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Optional, List, Dict, Any, Tuple


@dataclass
class VectorRecord:
    """A vector record with metadata."""
    id: str
    vector: List[float]
    text: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class MigrationEstimate:
    """Estimation of migration resources and time."""
    total_vectors: int
    dimensions: int
    estimated_mv2_bytes: int
    estimated_seconds: float
    source_size_bytes: Optional[int] = None
    compression_ratio: Optional[float] = None


@dataclass
class MigrationResult:
    """Result of a migration operation."""
    source_type: str
    source_name: str
    output_path: str
    total_migrated: int
    total_skipped: int
    duration_seconds: float
    output_size_bytes: int
    compression_ratio: Optional[float] = None
    errors: List[str] = field(default_factory=list)


@dataclass
class MigrationCheckpoint:
    """Migration checkpoint for resuming."""
    source_type: str
    source_name: str
    output_path: str
    last_cursor: Optional[str]
    records_completed: int
    started_at: str


class MigrationSource(ABC):
    """Abstract base class for migration sources."""

    @property
    @abstractmethod
    def source_type(self) -> str:
        """Return the source type (e.g., 'pinecone', 'qdrant')."""
        pass

    @property
    @abstractmethod
    def source_name(self) -> str:
        """Return the source name (e.g., index name, collection name)."""
        pass

    @abstractmethod
    def connect(self) -> None:
        """Connect to the source."""
        pass

    @abstractmethod
    def estimate(self) -> MigrationEstimate:
        """Estimate migration size and time."""
        pass

    @abstractmethod
    def fetch_batch(self, batch_size: int = 10000, cursor: Optional[str] = None) -> Tuple[List[VectorRecord], Optional[str]]:
        """Fetch a batch of records.

        Returns:
            Tuple of (records, next_cursor). next_cursor is None if this is the last batch.
        """
        pass

    @abstractmethod
    def close(self) -> None:
        """Close the connection and clean up resources."""
        pass
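
To illustrate the contract above, a toy in-memory source (not part of the package) that serves records from a list and pages with an integer-offset cursor, mirroring the (records, next_cursor) protocol:

from typing import List, Optional, Tuple

from ate.memory.migrate import MigrationEstimate, MigrationSource, VectorRecord


class ListMigrationSource(MigrationSource):
    """Illustrative source that pages over an in-memory list of VectorRecord."""

    def __init__(self, records: List[VectorRecord], name: str = "in-memory"):
        self._records = records
        self._name = name

    @property
    def source_type(self) -> str:
        return "list"

    @property
    def source_name(self) -> str:
        return self._name

    def connect(self) -> None:
        pass  # nothing to open for an in-memory list

    def estimate(self) -> MigrationEstimate:
        dims = len(self._records[0].vector) if self._records else 0
        return MigrationEstimate(
            total_vectors=len(self._records),
            dimensions=dims,
            estimated_mv2_bytes=len(self._records) * (dims * 4 + 1024),
            estimated_seconds=len(self._records) / 1200.0,
        )

    def fetch_batch(self, batch_size: int = 10000, cursor: Optional[str] = None) -> Tuple[List[VectorRecord], Optional[str]]:
        offset = int(cursor) if cursor else 0
        batch = self._records[offset:offset + batch_size]
        done = offset + batch_size >= len(self._records)
        return batch, None if done else str(offset + batch_size)

    def close(self) -> None:
        pass  # nothing to release
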
ate/memory/migrate/pipeline.py ADDED
@@ -0,0 +1,189 @@
"""Migration pipeline for orchestrating the migration process."""
import json
import os
import time
from typing import Optional, Callable, Any

from .base import MigrationSource, MigrationResult, MigrationCheckpoint


class MigrationPipeline:
    """Pipeline for orchestrating vector database migrations."""

    def __init__(self, source: MigrationSource, output_path: str,
                 batch_size: int = 10000, checkpoint_path: Optional[str] = None):
        """Initialize the migration pipeline.

        Args:
            source: The migration source to read from
            output_path: Path to write the .mv2 output file
            batch_size: Number of records to fetch per batch
            checkpoint_path: Optional path for checkpoint file
        """
        self.source = source
        self.output_path = output_path
        self.batch_size = batch_size
        self.checkpoint_path = checkpoint_path
        self.progress_callbacks = []

    def on_progress(self, callback: Callable[[int, int], None]) -> None:
        """Register a progress callback function.

        Args:
            callback: Function called with (records_done, total_estimate)
        """
        self.progress_callbacks.append(callback)

    def run(self, dry_run: bool = False) -> MigrationResult:
        """Run the migration.

        Args:
            dry_run: If True, only estimate without migrating

        Returns:
            MigrationResult with operation details
        """
        start_time = time.time()

        try:
            self.source.connect()

            # Get estimate
            estimate = self.source.estimate()

            if dry_run:
                # For dry run, return early with estimate-based result
                return MigrationResult(
                    source_type=self.source.source_type,
                    source_name=self.source.source_name,
                    output_path=self.output_path,
                    total_migrated=0,
                    total_skipped=0,
                    duration_seconds=time.time() - start_time,
                    output_size_bytes=0
                )

            # Create output .mv2 using MemoryStore
            parent_dir = os.path.dirname(self.output_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            from ..store import MemoryStore
            output_store = MemoryStore.create(self.output_path)

            total_migrated = 0
            total_skipped = 0
            errors = []
            cursor = None

            try:
                while True:
                    try:
                        records, next_cursor = self.source.fetch_batch(self.batch_size, cursor)

                        if not records:
                            break

                        # Write records to .mv2 via MemoryStore
                        for record in records:
                            text = record.text or f"[vector-only record id={record.id}]"
                            metadata = dict(record.metadata) if record.metadata else {}
                            metadata['_source_id'] = record.id
                            metadata['_source_type'] = self.source.source_type

                            try:
                                output_store.add(
                                    text=text,
                                    metadata=metadata
                                )
                                total_migrated += 1
                            except Exception as e:
                                total_skipped += 1
                                errors.append(f"Record {record.id}: {str(e)}")

                        # Update checkpoint
                        if self.checkpoint_path:
                            checkpoint = MigrationCheckpoint(
                                source_type=self.source.source_type,
                                source_name=self.source.source_name,
                                output_path=self.output_path,
                                last_cursor=cursor,
                                records_completed=total_migrated,
                                started_at=time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(start_time))
                            )
                            self._write_checkpoint(checkpoint)

                        # Call progress callbacks
                        for callback in self.progress_callbacks:
                            callback(total_migrated, estimate.total_vectors)

                        cursor = next_cursor
                        if cursor is None:
                            break

                    except Exception as e:
                        errors.append(str(e))
                        break
            finally:
                output_store.close()

            # Calculate output file size
            output_size = os.path.getsize(self.output_path) if os.path.exists(self.output_path) else 0

            return MigrationResult(
                source_type=self.source.source_type,
                source_name=self.source.source_name,
                output_path=self.output_path,
                total_migrated=total_migrated,
                total_skipped=total_skipped,
                duration_seconds=time.time() - start_time,
                output_size_bytes=output_size,
                errors=errors
            )

        except Exception as e:
            return MigrationResult(
                source_type=self.source.source_type,
                source_name=self.source.source_name,
                output_path=self.output_path,
                total_migrated=0,
                total_skipped=0,
                duration_seconds=time.time() - start_time,
                output_size_bytes=0,
                errors=[str(e)]
            )
        finally:
            try:
                self.source.close()
            except Exception:
                pass

    def resume(self) -> MigrationResult:
        """Resume migration from checkpoint.

        Raises:
            NotImplementedError: Resume is not yet implemented.
        """
        raise NotImplementedError(
            "Resume not yet implemented — use run() to restart migration"
        )

    def _write_checkpoint(self, checkpoint: MigrationCheckpoint) -> None:
        """Write checkpoint to file."""
        if not self.checkpoint_path:
            return

        checkpoint_data = {
            'source_type': checkpoint.source_type,
            'source_name': checkpoint.source_name,
            'output_path': checkpoint.output_path,
            'last_cursor': checkpoint.last_cursor,
            'records_completed': checkpoint.records_completed,
            'started_at': checkpoint.started_at
        }

        parent = os.path.dirname(self.checkpoint_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(self.checkpoint_path, 'w') as f:
            json.dump(checkpoint_data, f)
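
A usage sketch of the pipeline end to end; collection name, paths, and batch size are hypothetical, and the Chroma source defined later in this diff serves as the example source:

from ate.memory.migrate import MigrationPipeline
from ate.memory.migrate.sources import ChromaMigrationSource

source = ChromaMigrationSource(path="./chroma_db", collection_name="robot_notes")  # hypothetical
pipeline = MigrationPipeline(
    source,
    output_path="memories/robot_notes.mv2",
    batch_size=1000,
    checkpoint_path="memories/.robot_notes.checkpoint.json",
)
pipeline.on_progress(lambda done, total: print(f"{done}/{total} records"))

result = pipeline.run(dry_run=False)
print(f"migrated={result.total_migrated} skipped={result.total_skipped} "
      f"errors={len(result.errors)} size={result.output_size_bytes} bytes")
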
ate/memory/migrate/sources/__init__.py ADDED
@@ -0,0 +1,13 @@
"""Migration sources for various vector databases."""

from .pinecone import PineconeMigrationSource
from .qdrant import QdrantMigrationSource
from .weaviate import WeaviateMigrationSource
from .chroma import ChromaMigrationSource

__all__ = [
    'PineconeMigrationSource',
    'QdrantMigrationSource',
    'WeaviateMigrationSource',
    'ChromaMigrationSource'
]
ate/memory/migrate/sources/chroma.py ADDED
@@ -0,0 +1,170 @@
"""Chroma migration source implementation."""
from typing import Optional, List, Tuple

try:
    import chromadb
except ImportError:
    # Create a simple mock structure for testing
    class ChromaDBMock:
        PersistentClient = None
        HttpClient = None
        Client = None

    chromadb = ChromaDBMock()

from ..base import MigrationSource, VectorRecord, MigrationEstimate


class ChromaMigrationSource(MigrationSource):
    """Migration source for Chroma vector database."""

    def __init__(self, path: Optional[str] = None, host: Optional[str] = None, port: Optional[int] = None,
                 collection_name: str = "default", api_key: Optional[str] = None):
        """Initialize Chroma migration source.

        Args:
            path: Path to persistent Chroma database (for PersistentClient)
            host: Host for Chroma server (for HttpClient)
            port: Port for Chroma server (for HttpClient)
            collection_name: Name of the collection to migrate from
            api_key: Optional API key for authentication
        """
        self.path = path
        self.host = host
        self.port = port
        self.collection_name = collection_name
        self.api_key = api_key
        self.client = None
        self.collection = None

    @property
    def source_type(self) -> str:
        """Return the source type."""
        return "chroma"

    @property
    def source_name(self) -> str:
        """Return the source name."""
        return self.collection_name

    def connect(self) -> None:
        """Connect to Chroma."""
        if chromadb is None:
            raise ImportError("chromadb library is required for Chroma migration")

        # Choose client type based on configuration
        if self.path:
            # Use persistent client
            self.client = chromadb.PersistentClient(path=self.path)
        elif self.host and self.port:
            # Use HTTP client
            settings = {}
            if self.api_key:
                settings['chroma_api_impl'] = 'chromadb.api.fastapi.FastAPI'
                settings['chroma_server_auth_credentials'] = self.api_key

            self.client = chromadb.HttpClient(host=self.host, port=self.port, settings=settings)
        else:
            # Use in-memory client
            self.client = chromadb.Client()

        # Get or create collection
        self.collection = self.client.get_collection(name=self.collection_name)

    def estimate(self) -> MigrationEstimate:
        """Estimate migration size and time."""
        if not self.collection:
            raise RuntimeError("Must call connect() first")

        # Get collection count
        total_vectors = self.collection.count()

        # Peek at one record to get dimensions
        dimensions = 768  # Default assumption
        if total_vectors > 0:
            try:
                peek_result = self.collection.peek(limit=1)
                if peek_result.get('embeddings') and len(peek_result['embeddings']) > 0:
                    dimensions = len(peek_result['embeddings'][0])
            except Exception:
                pass  # Use default

        # Rough estimates
        bytes_per_vector = dimensions * 4 + 1024  # 4 bytes per float + metadata overhead
        estimated_mv2_bytes = total_vectors * bytes_per_vector
        estimated_seconds = total_vectors / 1200.0  # Rough estimate of 1200 vectors/second

        return MigrationEstimate(
            total_vectors=total_vectors,
            dimensions=dimensions,
            estimated_mv2_bytes=estimated_mv2_bytes,
            estimated_seconds=estimated_seconds
        )

    def fetch_batch(self, batch_size: int = 10000, cursor: Optional[str] = None) -> Tuple[List[VectorRecord], Optional[str]]:
        """Fetch a batch of records from Chroma."""
        if not self.collection:
            raise RuntimeError("Must call connect() first")

        # Parse cursor as offset if provided
        offset = 0
        if cursor:
            try:
                offset = int(cursor)
            except (ValueError, TypeError):
                offset = 0

        # Get batch of records
        result = self.collection.get(
            limit=batch_size,
            offset=offset,
            include=['embeddings', 'metadatas', 'documents']
        )

        ids = result.get('ids', [])
        embeddings = result.get('embeddings', [])
        metadatas = result.get('metadatas', [])
        documents = result.get('documents', [])

        records = []
        for i, record_id in enumerate(ids):
            embedding = embeddings[i] if i < len(embeddings) else []
            metadata = metadatas[i] if i < len(metadatas) else {}
            document = documents[i] if i < len(documents) else None

            # Extract text from document or metadata
            text = document
            if not text and metadata:
                text = metadata.get('text') or metadata.get('content')

            # Create clean metadata without text fields
            clean_metadata = {}
            if metadata:
                clean_metadata = {k: v for k, v in metadata.items() if k not in ('text', 'content')}

            record = VectorRecord(
                id=str(record_id),
                vector=embedding,
                text=text,
                metadata=clean_metadata
            )
            records.append(record)

        # Determine if there are more results
        # If we got any results and this is the first page, assume there might be more
        # If we got fewer results than requested, we're at the end
        if len(ids) == batch_size:
            next_cursor = str(offset + batch_size)
        elif len(ids) > 0 and offset == 0:
            # First batch with some results - optimistically assume there might be more
            next_cursor = str(offset + batch_size)
        else:
            next_cursor = None

        return records, next_cursor

    def close(self) -> None:
        """Close the connection and clean up resources."""
        # Chroma doesn't require explicit cleanup
        self.client = None
        self.collection = None
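
Outside the pipeline, the source can also be paged manually; a sketch of the cursor loop, assuming a local Chroma server and a hypothetical collection name:

from ate.memory.migrate.sources import ChromaMigrationSource

source = ChromaMigrationSource(host="localhost", port=8000, collection_name="docs")  # hypothetical
source.connect()
try:
    cursor = None
    while True:
        records, cursor = source.fetch_batch(batch_size=500, cursor=cursor)
        for rec in records:
            print(rec.id, len(rec.vector), (rec.text or "")[:40])
        if cursor is None:
            break
finally:
    source.close()
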