featrixsphere 0.2.5566__py3-none-any.whl → 0.2.5978__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,227 @@
1
+ """
2
+ ReferenceRecord class for FeatrixSphere API.
3
+
4
+ Represents a reference point in the embedding space for similarity search.
5
+ Useful when you only have positive examples and want to find similar records.
6
+ """
7
+
8
+ import logging
9
+ from dataclasses import dataclass, field
10
+ from datetime import datetime
11
+ from typing import Dict, Any, Optional, List, TYPE_CHECKING
12
+
13
+ if TYPE_CHECKING:
14
+ from .http_client import ClientContext
15
+ from .foundational_model import FoundationalModel
16
+ from .vector_database import VectorDatabase
17
+ import numpy as np
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
@dataclass
class ReferenceRecord:
    """
    Represents a reference record in the embedding space.

    A ReferenceRecord is useful when you only have positive examples but no
    negative examples. Instead of training a classifier, you can create a
    reference from a positive example and find similar records.

    Attributes:
        id: ReferenceRecord ID
        name: Optional name
        session_id: Parent session ID
        record: The record this reference represents
        embedding: Cached embedding vector (None until computed)
        created_at: Creation timestamp

    Usage:
        # Create from foundational model
        ref = fm.create_reference_record(
            record={"age": 35, "income": 50000},
            name="target_profile"
        )

        # Find similar records
        similar = ref.find_similar(k=10, vector_database=vdb)

        # Get embedding
        embedding = ref.get_embedding()
    """

    id: str
    session_id: str
    record: Dict[str, Any]
    name: Optional[str] = None
    embedding: Optional[List[float]] = None
    created_at: Optional[datetime] = None

    # Internal handles; excluded from the dataclass field repr.
    _ctx: Optional['ClientContext'] = field(default=None, repr=False)
    _foundational_model: Optional['FoundationalModel'] = field(default=None, repr=False)

    @classmethod
    def from_record(
        cls,
        record: Dict[str, Any],
        session_id: str,
        name: Optional[str] = None,
        ctx: Optional['ClientContext'] = None,
        foundational_model: Optional['FoundationalModel'] = None
    ) -> 'ReferenceRecord':
        """
        Create a ReferenceRecord from a record.

        A fresh UUID4 string is generated for the ID and ``created_at`` is set
        to the current local time. When ``ctx`` is provided the embedding is
        computed eagerly; a failure there is logged as a warning and the
        reference is still returned (with ``embedding`` left as None).

        Args:
            record: The record to create reference from
            session_id: Parent session ID
            name: Optional name
            ctx: Client context for API calls
            foundational_model: Parent FM

        Returns:
            ReferenceRecord instance
        """
        import uuid

        ref = cls(
            id=str(uuid.uuid4()),
            session_id=session_id,
            record=record,
            name=name,
            created_at=datetime.now(),
            _ctx=ctx,
            _foundational_model=foundational_model,
        )

        # Pre-compute embedding if we have context. Best-effort: the
        # embedding can still be fetched lazily via get_embedding().
        if ctx:
            try:
                ref._compute_embedding()
            except Exception as e:
                logger.warning(
                    "Failed to compute embedding for reference record: %s", e
                )

        return ref

    @property
    def foundational_model(self) -> Optional['FoundationalModel']:
        """Get the parent foundational model."""
        return self._foundational_model

    def find_similar(
        self,
        k: int = 10,
        vector_database: Optional['VectorDatabase'] = None
    ) -> List[Dict[str, Any]]:
        """
        Find k records similar to this reference.

        Args:
            k: Number of similar records to return
            vector_database: Optional VectorDatabase to search in.
                If None, searches in the session's default records.

        Returns:
            List of similar records with similarity scores

        Raises:
            ValueError: If this reference has no client context. The check
                is applied even when ``vector_database`` is supplied.

        Example:
            similar = ref.find_similar(k=10)
            for record in similar:
                print(f"Score: {record['similarity']}")
                print(f"Data: {record['record']}")
        """
        if not self._ctx:
            raise ValueError("ReferenceRecord not connected to client")

        # If vector database provided, delegate the search to it.
        if vector_database is not None:
            return vector_database.similarity_search(self.record, k=k)

        # Otherwise use session-level similarity search.
        cleaned_record = self._clean_record(self.record)

        response = self._ctx.post_json(
            f"/session/{self.session_id}/similarity_search",
            data={
                "query_record": cleaned_record,
                "k": k
            }
        )

        # The response may carry the hits under either key.
        return response.get('similar_records', response.get('results', []))

    def get_embedding(self) -> List[float]:
        """
        Get the embedding vector for this reference record.

        Returns the cached embedding when present; otherwise computes it via
        the API first. If the API returns no embeddings, an empty list is
        returned.

        Returns:
            List of floats representing the embedding vector

        Raises:
            ValueError: If the embedding is not cached and this reference has
                no client context.

        Example:
            embedding = ref.get_embedding()
            print(f"Embedding dimension: {len(embedding)}")
        """
        if self.embedding is not None:
            return self.embedding

        self._compute_embedding()
        return self.embedding or []

    def _compute_embedding(self) -> None:
        """
        Compute and cache the embedding for this reference's record.

        Raises:
            ValueError: If this reference has no client context.
        """
        if not self._ctx:
            raise ValueError("ReferenceRecord not connected to client")

        cleaned_record = self._clean_record(self.record)

        response = self._ctx.post_json(
            f"/session/{self.session_id}/encode_records",
            data={"records": [cleaned_record]}
        )

        # One record was sent, so only the first returned vector is kept.
        embeddings = response.get('embeddings', [])
        if embeddings:
            self.embedding = embeddings[0]

    def delete(self) -> None:
        """
        Delete this reference record.

        Note: ReferenceRecords are lightweight and exist only in memory on the client.
        This method clears the reference's data.
        """
        self.record = {}
        self.embedding = None
        self._ctx = None
        self._foundational_model = None

    def _clean_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """
        Clean a record for API submission.

        Values exposing an ``item()`` method (e.g. NumPy scalars) are
        unwrapped to plain Python values, and non-finite floats (NaN, +/-inf)
        are replaced by None.

        Args:
            record: Mapping of field name to raw value

        Returns:
            A new dict with cleaned values; the input is not modified
        """
        import math

        cleaned = {}
        for key, value in record.items():
            # Unwrap first: boxed scalars that are not float subclasses would
            # otherwise skip the non-finite check below and leak NaN/inf.
            if hasattr(value, 'item'):
                value = value.item()
            if isinstance(value, float) and not math.isfinite(value):
                value = None
            cleaned[key] = value
        return cleaned

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary representation.

        The embedding itself is not included, only its length
        (``embedding_dimension``), which is None when no non-empty embedding
        is cached.
        """
        return {
            'id': self.id,
            'session_id': self.session_id,
            'name': self.name,
            'record': self.record,
            'embedding_dimension': len(self.embedding) if self.embedding else None,
            'created_at': self.created_at.isoformat() if self.created_at else None,
        }

    def __repr__(self) -> str:
        """Return a compact repr: ID plus name and dimension when available."""
        name_str = f", name='{self.name}'" if self.name else ""
        emb_str = f", dim={len(self.embedding)}" if self.embedding else ""
        return f"ReferenceRecord(id='{self.id}'{name_str}{emb_str})"
@@ -0,0 +1,269 @@
1
+ """
2
+ VectorDatabase class for FeatrixSphere API.
3
+
4
+ Represents a vector database for similarity search operations.
5
+ """
6
+
7
+ import logging
8
+ from dataclasses import dataclass, field
9
+ from datetime import datetime
10
+ from typing import Dict, Any, Optional, List, Union, TYPE_CHECKING
11
+
12
+ if TYPE_CHECKING:
13
+ from .http_client import ClientContext
14
+ from .foundational_model import FoundationalModel
15
+ import pandas as pd
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@dataclass
class VectorDatabase:
    """
    Represents a vector database for similarity search.

    Attributes:
        id: Vector database ID (same as session_id)
        name: Database name
        session_id: Parent session ID
        record_count: Number of records in database (client-side cached value)
        created_at: Creation timestamp

    Usage:
        # Create from foundational model
        vdb = fm.create_vector_database(
            name="customer_search",
            records=customer_records
        )

        # Similarity search
        similar = vdb.similarity_search(
            {"age": 35, "income": 50000},
            k=5
        )

        # Add more records
        vdb.add_records(new_customers)
    """

    id: str
    session_id: str
    name: Optional[str] = None
    record_count: int = 0
    created_at: Optional[datetime] = None

    # Internal handles; excluded from the dataclass field repr.
    _ctx: Optional['ClientContext'] = field(default=None, repr=False)
    _foundational_model: Optional['FoundationalModel'] = field(default=None, repr=False)

    @classmethod
    def from_session(
        cls,
        session_id: str,
        name: Optional[str] = None,
        ctx: Optional['ClientContext'] = None,
        foundational_model: Optional['FoundationalModel'] = None
    ) -> 'VectorDatabase':
        """
        Create VectorDatabase from session ID.

        The session ID doubles as the database ID and ``created_at`` is set to
        the current local time. When ``ctx`` is provided the record count is
        fetched from the API; a failure there is logged at debug level and
        the count stays at 0.

        Args:
            session_id: Parent session ID
            name: Optional database name
            ctx: Client context for API calls
            foundational_model: Parent FM

        Returns:
            VectorDatabase instance
        """
        vdb = cls(
            id=session_id,
            session_id=session_id,
            name=name,
            created_at=datetime.now(),
            _ctx=ctx,
            _foundational_model=foundational_model,
        )

        # Try to get record count. Best-effort: construction must not fail
        # just because the size lookup did, but leave a trace for debugging.
        if ctx:
            try:
                size = ctx.get_json(f"/session/{session_id}/vectordb_size")
                vdb.record_count = size.get('size', 0)
            except Exception as e:
                logger.debug(
                    "Could not fetch vector database size for session %s: %s",
                    session_id,
                    e,
                )

        return vdb

    @property
    def foundational_model(self) -> Optional['FoundationalModel']:
        """Get the parent foundational model."""
        return self._foundational_model

    def similarity_search(
        self,
        query_record: Dict[str, Any],
        k: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Find k most similar records to the query.

        Args:
            query_record: Query record dictionary
            k: Number of similar records to return

        Returns:
            List of similar records with similarity scores

        Raises:
            ValueError: If this database has no client context.

        Example:
            similar = vdb.similarity_search(
                {"age": 35, "income": 50000},
                k=5
            )
            for record in similar:
                print(f"Score: {record['similarity']}")
                print(f"Data: {record['record']}")
        """
        if not self._ctx:
            raise ValueError("VectorDatabase not connected to client")

        cleaned_query = self._clean_record(query_record)

        response = self._ctx.post_json(
            f"/session/{self.session_id}/similarity_search",
            data={
                "query_record": cleaned_query,
                "k": k
            }
        )

        # The response may carry the hits under either key.
        return response.get('similar_records', response.get('results', []))

    def add_records(
        self,
        records: Union[List[Dict[str, Any]], 'pd.DataFrame'],
        batch_size: int = 500
    ) -> 'VectorDatabase':
        """
        Add records to the vector database.

        Records are cleaned and uploaded in batches. ``record_count`` is
        updated after every successful batch, so if a later batch raises, the
        cached count still reflects the batches that were already uploaded.

        Args:
            records: List of record dictionaries or DataFrame
            batch_size: Batch size for adding records (must be >= 1)

        Returns:
            Self (updated record count)

        Raises:
            ValueError: If this database has no client context, or if
                ``batch_size`` is less than 1.
        """
        if not self._ctx:
            raise ValueError("VectorDatabase not connected to client")

        # A zero step makes range() fail and a negative one silently uploads
        # nothing; reject both up front with a clear message.
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Convert DataFrame to list if needed
        if hasattr(records, 'to_dict'):
            records = records.to_dict('records')

        # Clean records
        cleaned_records = [self._clean_record(r) for r in records]

        # Add in batches
        for start in range(0, len(cleaned_records), batch_size):
            batch = cleaned_records[start:start + batch_size]

            response = self._ctx.post_json(
                f"/session/{self.session_id}/add_records",
                data={"records": batch}
            )

            # Fall back to the batch length when the response does not
            # report how many records were added.
            self.record_count += response.get('added', len(batch))

        return self

    def remove_records(
        self,
        record_ids: List[str]
    ) -> 'VectorDatabase':
        """
        Remove records from the vector database by ID.

        Args:
            record_ids: List of record IDs to remove

        Returns:
            Self (updated record count)

        Raises:
            ValueError: If this database has no client context.

        Note:
            This operation may not be supported by all backends. Any error
            raised by the request is logged as a warning and otherwise
            ignored, leaving ``record_count`` unchanged.
        """
        if not self._ctx:
            raise ValueError("VectorDatabase not connected to client")

        # This endpoint may not exist yet - placeholder for future
        try:
            response = self._ctx.post_json(
                f"/session/{self.session_id}/remove_records",
                data={"record_ids": record_ids}
            )
            removed = response.get('removed', len(record_ids))
            # Clamp so the cached count never goes negative.
            self.record_count = max(0, self.record_count - removed)
        except Exception as e:
            logger.warning(f"remove_records not supported: {e}")

        return self

    def size(self) -> int:
        """
        Get the current number of records in the database.

        Queries the API and refreshes the cached ``record_count``.

        Raises:
            ValueError: If this database has no client context.
        """
        if not self._ctx:
            raise ValueError("VectorDatabase not connected to client")

        response = self._ctx.get_json(f"/session/{self.session_id}/vectordb_size")
        self.record_count = response.get('size', 0)
        return self.record_count

    def encode(
        self,
        records: Union[Dict[str, Any], List[Dict[str, Any]], 'pd.DataFrame']
    ) -> List[List[float]]:
        """
        Encode records to embedding vectors.

        Args:
            records: Single record, list of records, or DataFrame

        Returns:
            List of embedding vectors (empty if the response has none)

        Raises:
            ValueError: If this database has no client context.
        """
        if not self._ctx:
            raise ValueError("VectorDatabase not connected to client")

        # Normalize input to list
        if isinstance(records, dict):
            records = [records]
        elif hasattr(records, 'to_dict'):
            records = records.to_dict('records')

        cleaned_records = [self._clean_record(r) for r in records]

        response = self._ctx.post_json(
            f"/session/{self.session_id}/encode_records",
            data={"records": cleaned_records}
        )

        return response.get('embeddings', [])

    def _clean_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """
        Clean a record for API submission.

        Values exposing an ``item()`` method (e.g. NumPy scalars) are
        unwrapped to plain Python values, and non-finite floats (NaN, +/-inf)
        are replaced by None.

        Args:
            record: Mapping of field name to raw value

        Returns:
            A new dict with cleaned values; the input is not modified
        """
        import math

        cleaned = {}
        for key, value in record.items():
            # Unwrap first: boxed scalars that are not float subclasses would
            # otherwise skip the non-finite check below and leak NaN/inf.
            if hasattr(value, 'item'):
                value = value.item()
            if isinstance(value, float) and not math.isfinite(value):
                value = None
            cleaned[key] = value
        return cleaned

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary representation."""
        return {
            'id': self.id,
            'session_id': self.session_id,
            'name': self.name,
            'record_count': self.record_count,
            'created_at': self.created_at.isoformat() if self.created_at else None,
        }

    def __repr__(self) -> str:
        """Return a compact repr with ID, name and cached record count."""
        return f"VectorDatabase(id='{self.id}', name='{self.name}', records={self.record_count})"