featrixsphere 0.2.5566__py3-none-any.whl → 0.2.5978__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featrixsphere/__init__.py +37 -18
- featrixsphere/api/__init__.py +50 -0
- featrixsphere/api/api_endpoint.py +280 -0
- featrixsphere/api/client.py +396 -0
- featrixsphere/api/foundational_model.py +658 -0
- featrixsphere/api/http_client.py +209 -0
- featrixsphere/api/notebook_helper.py +584 -0
- featrixsphere/api/prediction_result.py +231 -0
- featrixsphere/api/predictor.py +537 -0
- featrixsphere/api/reference_record.py +227 -0
- featrixsphere/api/vector_database.py +269 -0
- featrixsphere/client.py +211 -8
- {featrixsphere-0.2.5566.dist-info → featrixsphere-0.2.5978.dist-info}/METADATA +1 -1
- featrixsphere-0.2.5978.dist-info/RECORD +17 -0
- featrixsphere-0.2.5566.dist-info/RECORD +0 -7
- {featrixsphere-0.2.5566.dist-info → featrixsphere-0.2.5978.dist-info}/WHEEL +0 -0
- {featrixsphere-0.2.5566.dist-info → featrixsphere-0.2.5978.dist-info}/entry_points.txt +0 -0
- {featrixsphere-0.2.5566.dist-info → featrixsphere-0.2.5978.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ReferenceRecord class for FeatrixSphere API.
|
|
3
|
+
|
|
4
|
+
Represents a reference point in the embedding space for similarity search.
|
|
5
|
+
Useful when you only have positive examples and want to find similar records.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from typing import Dict, Any, Optional, List, TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from .http_client import ClientContext
|
|
15
|
+
from .foundational_model import FoundationalModel
|
|
16
|
+
from .vector_database import VectorDatabase
|
|
17
|
+
import numpy as np
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class ReferenceRecord:
    """
    Represents a reference record in the embedding space.

    A ReferenceRecord is useful when you only have positive examples but no
    negative examples. Instead of training a classifier, you can create a
    reference from a positive example and find similar records.

    Attributes:
        id: ReferenceRecord ID
        name: Optional name
        session_id: Parent session ID
        record: The record this reference represents
        embedding: Cached embedding vector
        created_at: Creation timestamp

    Usage:
        # Create from foundational model
        ref = fm.create_reference_record(
            record={"age": 35, "income": 50000},
            name="target_profile"
        )

        # Find similar records
        similar = ref.find_similar(k=10, vector_database=vdb)

        # Get embedding
        embedding = ref.get_embedding()
    """

    id: str
    session_id: str
    record: Dict[str, Any]
    name: Optional[str] = None
    embedding: Optional[List[float]] = None
    created_at: Optional[datetime] = None

    # Internal handles; excluded from the generated repr.
    _ctx: Optional['ClientContext'] = field(default=None, repr=False)
    _foundational_model: Optional['FoundationalModel'] = field(default=None, repr=False)

    @classmethod
    def from_record(
        cls,
        record: Dict[str, Any],
        session_id: str,
        name: Optional[str] = None,
        ctx: Optional['ClientContext'] = None,
        foundational_model: Optional['FoundationalModel'] = None
    ) -> 'ReferenceRecord':
        """
        Create a ReferenceRecord from a record.

        A fresh UUID4 string is generated for the ID and ``created_at`` is
        set to the current local time. When ``ctx`` is given, the embedding
        is pre-computed on a best-effort basis: a failure is logged as a
        warning and the reference is still returned without an embedding.

        Args:
            record: The record to create reference from
            session_id: Parent session ID
            name: Optional name
            ctx: Client context for API calls
            foundational_model: Parent FM

        Returns:
            ReferenceRecord instance
        """
        import uuid

        ref = cls(
            id=str(uuid.uuid4()),
            session_id=session_id,
            record=record,
            name=name,
            created_at=datetime.now(),
            _ctx=ctx,
            _foundational_model=foundational_model,
        )

        # Pre-compute embedding if we have context
        if ctx:
            try:
                ref._compute_embedding()
            except Exception as e:
                # Best effort: get_embedding() retries lazily later on.
                logger.warning(
                    "Failed to compute embedding for reference record: %s", e
                )

        return ref

    @property
    def foundational_model(self) -> Optional['FoundationalModel']:
        """Get the parent foundational model."""
        return self._foundational_model

    def find_similar(
        self,
        k: int = 10,
        vector_database: Optional['VectorDatabase'] = None
    ) -> List[Dict[str, Any]]:
        """
        Find k records similar to this reference.

        Args:
            k: Number of similar records to return
            vector_database: Optional VectorDatabase to search in.
                If None, searches in the session's default records.

        Returns:
            List of similar records with similarity scores

        Raises:
            ValueError: If this reference is not connected to a client.

        Example:
            similar = ref.find_similar(k=10)
            for record in similar:
                print(f"Score: {record['similarity']}")
                print(f"Data: {record['record']}")
        """
        if not self._ctx:
            raise ValueError("ReferenceRecord not connected to client")

        # If vector database provided, delegate the search to it
        if vector_database is not None:
            return vector_database.similarity_search(self.record, k=k)

        # Otherwise use session-level similarity search
        cleaned_record = self._clean_record(self.record)

        response = self._ctx.post_json(
            f"/session/{self.session_id}/similarity_search",
            data={
                "query_record": cleaned_record,
                "k": k
            }
        )

        # The response may carry the hits under either key.
        return response.get('similar_records', response.get('results', []))

    def get_embedding(self) -> List[float]:
        """
        Get the embedding vector for this reference record.

        The cached embedding is returned when present; otherwise it is
        computed through the API first. An empty list is returned when the
        API response contained no embedding.

        Returns:
            List of floats representing the embedding vector

        Raises:
            ValueError: If no embedding is cached and this reference is not
                connected to a client.

        Example:
            embedding = ref.get_embedding()
            print(f"Embedding dimension: {len(embedding)}")
        """
        if self.embedding is not None:
            return self.embedding

        self._compute_embedding()
        return self.embedding or []

    def _compute_embedding(self) -> None:
        """Compute and cache the embedding for this reference's record.

        Raises:
            ValueError: If this reference is not connected to a client.
        """
        if not self._ctx:
            raise ValueError("ReferenceRecord not connected to client")

        cleaned_record = self._clean_record(self.record)

        response = self._ctx.post_json(
            f"/session/{self.session_id}/encode_records",
            data={"records": [cleaned_record]}
        )

        # One record was submitted, so only the first embedding is relevant.
        embeddings = response.get('embeddings', [])
        if embeddings:
            self.embedding = embeddings[0]

    def delete(self) -> None:
        """
        Delete this reference record.

        Note: ReferenceRecords are lightweight and exist only in memory on the client.
        This method clears the reference's data.
        """
        self.record = {}
        self.embedding = None
        self._ctx = None
        self._foundational_model = None

    def _clean_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Clean a record for API submission.

        Values exposing an ``item()`` method (presumably numpy scalars --
        verify against callers) are unwrapped first; afterwards any float
        that is NaN or infinite is replaced with ``None``. Unwrapping before
        the check matters: a scalar wrapper that is not itself a ``float``
        would otherwise leak NaN/inf into the payload.

        Args:
            record: Mapping of field name to value.

        Returns:
            A new dict with the cleaned values; ``record`` is not modified.
        """
        import math

        cleaned: Dict[str, Any] = {}
        for key, value in record.items():
            if hasattr(value, 'item'):
                value = value.item()
            if isinstance(value, float) and not math.isfinite(value):
                value = None
            cleaned[key] = value
        return cleaned

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary representation.

        The embedding itself is not included, only its length
        (``None`` when no embedding is cached or it is empty).
        """
        return {
            'id': self.id,
            'session_id': self.session_id,
            'name': self.name,
            'record': self.record,
            'embedding_dimension': len(self.embedding) if self.embedding else None,
            'created_at': self.created_at.isoformat() if self.created_at else None,
        }

    def __repr__(self) -> str:
        """Return a compact repr with the name and dimension when available."""
        name_str = f", name='{self.name}'" if self.name else ""
        emb_str = f", dim={len(self.embedding)}" if self.embedding else ""
        return f"ReferenceRecord(id='{self.id}'{name_str}{emb_str})"
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""
|
|
2
|
+
VectorDatabase class for FeatrixSphere API.
|
|
3
|
+
|
|
4
|
+
Represents a vector database for similarity search operations.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from typing import Dict, Any, Optional, List, Union, TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from .http_client import ClientContext
|
|
14
|
+
from .foundational_model import FoundationalModel
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class VectorDatabase:
    """
    Represents a vector database for similarity search.

    Attributes:
        id: Vector database ID (same as session_id)
        name: Database name
        session_id: Parent session ID
        record_count: Number of records in database
        created_at: Creation timestamp

    Usage:
        # Create from foundational model
        vdb = fm.create_vector_database(
            name="customer_search",
            records=customer_records
        )

        # Similarity search
        similar = vdb.similarity_search(
            {"age": 35, "income": 50000},
            k=5
        )

        # Add more records
        vdb.add_records(new_customers)
    """

    id: str
    session_id: str
    name: Optional[str] = None
    record_count: int = 0
    created_at: Optional[datetime] = None

    # Internal handles; excluded from the generated repr.
    _ctx: Optional['ClientContext'] = field(default=None, repr=False)
    _foundational_model: Optional['FoundationalModel'] = field(default=None, repr=False)

    @classmethod
    def from_session(
        cls,
        session_id: str,
        name: Optional[str] = None,
        ctx: Optional['ClientContext'] = None,
        foundational_model: Optional['FoundationalModel'] = None
    ) -> 'VectorDatabase':
        """Create VectorDatabase from session ID.

        The session ID doubles as the database ID and ``created_at`` is set
        to the current local time. When ``ctx`` is given, the record count
        is fetched on a best-effort basis; a failure is logged at debug
        level and the count stays at its default.

        Args:
            session_id: Parent session ID
            name: Optional database name
            ctx: Client context for API calls
            foundational_model: Parent FM

        Returns:
            VectorDatabase instance
        """
        vdb = cls(
            id=session_id,
            session_id=session_id,
            name=name,
            created_at=datetime.now(),
            _ctx=ctx,
            _foundational_model=foundational_model,
        )

        # Try to get record count
        if ctx:
            try:
                size = ctx.get_json(f"/session/{session_id}/vectordb_size")
                vdb.record_count = size.get('size', 0)
            except Exception as e:
                # Best effort only, but leave a trace instead of hiding it.
                logger.debug(
                    "Could not fetch record count for session %s: %s",
                    session_id, e
                )

        return vdb

    @property
    def foundational_model(self) -> Optional['FoundationalModel']:
        """Get the parent foundational model."""
        return self._foundational_model

    def similarity_search(
        self,
        query_record: Dict[str, Any],
        k: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Find k most similar records to the query.

        Args:
            query_record: Query record dictionary
            k: Number of similar records to return

        Returns:
            List of similar records with similarity scores

        Raises:
            ValueError: If this database is not connected to a client.

        Example:
            similar = vdb.similarity_search(
                {"age": 35, "income": 50000},
                k=5
            )
            for record in similar:
                print(f"Score: {record['similarity']}")
                print(f"Data: {record['record']}")
        """
        if not self._ctx:
            raise ValueError("VectorDatabase not connected to client")

        cleaned_query = self._clean_record(query_record)

        response = self._ctx.post_json(
            f"/session/{self.session_id}/similarity_search",
            data={
                "query_record": cleaned_query,
                "k": k
            }
        )

        # The response may carry the hits under either key.
        return response.get('similar_records', response.get('results', []))

    def add_records(
        self,
        records: Union[List[Dict[str, Any]], 'pd.DataFrame'],
        batch_size: int = 500
    ) -> 'VectorDatabase':
        """
        Add records to the vector database.

        Records are cleaned and posted in batches. ``record_count`` is
        advanced after every successful batch, so if a later batch raises,
        the count still reflects what was already submitted.

        Args:
            records: List of record dictionaries or DataFrame
            batch_size: Batch size for adding records (must be >= 1)

        Returns:
            Self (updated record count)

        Raises:
            ValueError: If this database is not connected to a client, or
                if ``batch_size`` is less than 1.
        """
        if not self._ctx:
            raise ValueError("VectorDatabase not connected to client")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Convert DataFrame to list if needed (duck-typed on ``to_dict``)
        if hasattr(records, 'to_dict'):
            records = records.to_dict('records')

        # Clean records
        cleaned_records = [self._clean_record(r) for r in records]

        # Add in batches
        for start in range(0, len(cleaned_records), batch_size):
            batch = cleaned_records[start:start + batch_size]

            response = self._ctx.post_json(
                f"/session/{self.session_id}/add_records",
                data={"records": batch}
            )

            # Fall back to the batch length when the response has no count.
            self.record_count += response.get('added', len(batch))

        return self

    def remove_records(
        self,
        record_ids: List[str]
    ) -> 'VectorDatabase':
        """
        Remove records from the vector database by ID.

        Any failure of the API call is logged as a warning and otherwise
        ignored; ``record_count`` is left unchanged in that case.

        Args:
            record_ids: List of record IDs to remove

        Returns:
            Self (updated record count)

        Raises:
            ValueError: If this database is not connected to a client.

        Note:
            This operation may not be supported by all backends.
        """
        if not self._ctx:
            raise ValueError("VectorDatabase not connected to client")

        # This endpoint may not exist yet - placeholder for future
        try:
            response = self._ctx.post_json(
                f"/session/{self.session_id}/remove_records",
                data={"record_ids": record_ids}
            )
            removed = response.get('removed', len(record_ids))
            # Never let the local count go negative.
            self.record_count = max(0, self.record_count - removed)
        except Exception as e:
            logger.warning("remove_records not supported: %s", e)

        return self

    def size(self) -> int:
        """Get the current number of records in the database.

        Queries the API and refreshes ``record_count`` with the result.

        Raises:
            ValueError: If this database is not connected to a client.
        """
        if not self._ctx:
            raise ValueError("VectorDatabase not connected to client")

        response = self._ctx.get_json(f"/session/{self.session_id}/vectordb_size")
        self.record_count = response.get('size', 0)
        return self.record_count

    def encode(
        self,
        records: Union[Dict[str, Any], List[Dict[str, Any]], 'pd.DataFrame']
    ) -> List[List[float]]:
        """
        Encode records to embedding vectors.

        Args:
            records: Single record, list of records, or DataFrame

        Returns:
            List of embedding vectors

        Raises:
            ValueError: If this database is not connected to a client.
        """
        if not self._ctx:
            raise ValueError("VectorDatabase not connected to client")

        # Normalize input to list
        if isinstance(records, dict):
            records = [records]
        elif hasattr(records, 'to_dict'):
            records = records.to_dict('records')

        cleaned_records = [self._clean_record(r) for r in records]

        response = self._ctx.post_json(
            f"/session/{self.session_id}/encode_records",
            data={"records": cleaned_records}
        )

        return response.get('embeddings', [])

    def _clean_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Clean a record for API submission.

        Values exposing an ``item()`` method (presumably numpy scalars --
        verify against callers) are unwrapped first; afterwards any float
        that is NaN or infinite is replaced with ``None``. Unwrapping before
        the check matters: a scalar wrapper that is not itself a ``float``
        would otherwise leak NaN/inf into the payload.

        Args:
            record: Mapping of field name to value.

        Returns:
            A new dict with the cleaned values; ``record`` is not modified.
        """
        import math

        cleaned: Dict[str, Any] = {}
        for key, value in record.items():
            if hasattr(value, 'item'):
                value = value.item()
            if isinstance(value, float) and not math.isfinite(value):
                value = None
            cleaned[key] = value
        return cleaned

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary representation."""
        return {
            'id': self.id,
            'session_id': self.session_id,
            'name': self.name,
            'record_count': self.record_count,
            'created_at': self.created_at.isoformat() if self.created_at else None,
        }

    def __repr__(self) -> str:
        """Return a compact repr with the ID, name and record count."""
        return f"VectorDatabase(id='{self.id}', name='{self.name}', records={self.record_count})"
|