featrixsphere 0.2.1830__py3-none-any.whl → 0.2.2279__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featrixsphere/__init__.py +1 -1
- featrixsphere/client.py +218 -65
- {featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/METADATA +1 -1
- featrixsphere-0.2.2279.dist-info/RECORD +8 -0
- featrixsphere-0.2.1830.dist-info/RECORD +0 -8
- {featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/WHEEL +0 -0
- {featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/entry_points.txt +0 -0
- {featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/top_level.txt +0 -0
featrixsphere/__init__.py CHANGED
featrixsphere/client.py CHANGED
@@ -565,7 +565,7 @@ class FeatrixSphereClient:
         """Make a DELETE request and return JSON response."""
         response = self._make_request("DELETE", endpoint, max_retries=max_retries, **kwargs)
         return self._unwrap_response(response.json())
-
+
     # =========================================================================
     # Session Management
     # =========================================================================
@@ -893,6 +893,113 @@ class FeatrixSphereClient:
         """
        response_data = self._post_json(f"/compute/session/{session_id}/unpublish", {})
         return response_data
+
+    def publish_partial_foundation(
+        self,
+        source_session_id: str,
+        name: str,
+        checkpoint_epoch: int = None,
+        session_name_prefix: str = None,
+        publish: bool = True,
+        verbose: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Publish a checkpoint from in-progress training as a standalone foundation model.
+
+        Takes a checkpoint from ongoing ES training and creates a NEW foundation model
+        session with full provenance tracking. Perfect for snapshotting good intermediate
+        models while training continues.
+
+        The new foundation model can be used with:
+        - train_on_foundational_model() - Train predictors on it
+        - Any standard foundation model operations
+        - Available across all compute nodes via backplane
+
+        Args:
+            source_session_id: Session with ES training (in-progress or completed)
+            name: Name for the new foundation model (REQUIRED)
+            checkpoint_epoch: Which epoch checkpoint to use (None = best/latest)
+            session_name_prefix: Optional prefix for new session ID
+            publish: Move to /sphere/published/ directory (default: True)
+            verbose: Print status updates
+
+        Returns:
+            dict with:
+            - foundation_session_id: New foundation session ID
+            - checkpoint_epoch: Epoch used
+            - provenance: Full metadata about source and training progress
+            - published_path: Path if published
+
+        Example:
+            ```python
+            # Snapshot epoch 50 as foundation v0.5 while training continues
+            result = client.publish_partial_foundation(
+                source_session_id="abc-123",
+                name="My Foundation v0.5",
+                checkpoint_epoch=50,
+                session_name_prefix="foundation-v0.5",
+                publish=True
+            )
+
+            foundation_id = result['foundation_session_id']
+            print(f"Published foundation: {foundation_id}")
+            print(f"Source was {result['provenance']['training_progress_percent']}% trained")
+
+            # Use immediately like any foundation model
+            client.train_on_foundational_model(
+                foundation_model_id=foundation_id,
+                target_column="price",
+                target_column_type="scalar"
+            )
+            # Available on all compute nodes automatically via backplane
+            ```
+        """
+        if verbose:
+            print(f"📦 Publishing partial foundation from {source_session_id}")
+            print(f"   Name: {name}")
+            if checkpoint_epoch is not None:
+                print(f"   Checkpoint epoch: {checkpoint_epoch}")
+            else:
+                print(f"   Checkpoint epoch: best/latest available")
+            print(f"   Publish to /sphere/published/: {publish}")
+
+        data = {
+            'name': name,
+            'publish': publish
+        }
+
+        if checkpoint_epoch is not None:
+            data['checkpoint_epoch'] = checkpoint_epoch
+        if session_name_prefix:
+            data['session_name_prefix'] = session_name_prefix
+
+        try:
+            response_data = self._post_json(
+                f"/compute/session/{source_session_id}/publish_partial_foundation",
+                data
+            )
+
+            foundation_id = response_data.get('foundation_session_id')
+            checkpoint_used = response_data.get('checkpoint_epoch')
+            provenance = response_data.get('provenance', {})
+
+            if verbose:
+                print(f"✅ {response_data.get('message')}")
+                print(f"   Foundation session ID: {foundation_id}")
+                print(f"   Checkpoint epoch: {checkpoint_used}")
+                if provenance.get('training_progress_percent'):
+                    print(f"   Source training progress: {provenance['training_progress_percent']}%")
+                if provenance.get('validation_loss_at_checkpoint'):
+                    print(f"   Val loss at checkpoint: {provenance['validation_loss_at_checkpoint']:.4f}")
+                if response_data.get('published_path'):
+                    print(f"   Published to: {response_data['published_path']}")
+
+            return response_data
+
+        except Exception as e:
+            if verbose:
+                print(f"❌ Error publishing partial foundation: {e}")
+            raise
 
     def get_sessions_for_org(self, name_prefix: str, max_retries: int = None) -> Dict[str, Any]:
         """
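The docstring's example takes a single snapshot; the same call also composes into a periodic snapshot loop while training runs. A minimal sketch, assuming an already-constructed `client` and that `get_session_status()` returns a dict reporting a status string and current epoch under the field names used below (those names are assumptions, not confirmed by this diff):

```python
import time

SNAPSHOT_EVERY = 25        # snapshot cadence in epochs (illustrative)
last_snapshot_epoch = 0

while True:
    status = client.get_session_status("abc-123")   # source ES session
    epoch = status.get('current_epoch', 0)          # assumed field name
    if status.get('status') == 'done':              # assumed field name
        break
    if epoch - last_snapshot_epoch >= SNAPSHOT_EVERY:
        result = client.publish_partial_foundation(
            source_session_id="abc-123",
            name=f"My Foundation @ epoch {epoch}",
            checkpoint_epoch=epoch,
            publish=True,
            verbose=False,
        )
        print(f"Snapshot: {result['foundation_session_id']} at epoch {result['checkpoint_epoch']}")
        last_snapshot_epoch = epoch
    time.sleep(60)
```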
@@ -1974,9 +2081,30 @@ class FeatrixSphereClient:
         - Category split: Use full data for ES, specific categories for predictor
         - Label completeness: Include unlabeled rows in ES, exclude from predictor
         - Test/holdout: Keep test data in ES context but exclude from predictor training
+
+        Special Input: Dictionary of Datasets
+        --------------------------------------
+        You can pass a dictionary of datasets instead of a single DataFrame. Each key is a dataset name,
+        and each value is a list of DataFrames/tables to include in that dataset.
+
+        When using this format:
+        - A __featrix_dataset_name column is automatically added to track which dataset each row came from
+        - All tables from all datasets are concatenated into a single DataFrame before upload
+        - The concatenated DataFrame is uploaded as normal
+
+        Example - Upload multiple datasets with labels:
+
+            datasets = {
+                'training_data': [df1, df2, df3],
+                'validation_data': [df4, df5],
+                'test_data': [df6]
+            }
+
+            session = client.upload_df_and_create_session(df=datasets)
+            # Uploads a single DataFrame with __featrix_dataset_name column indicating source
 
         Args:
-            df: pandas DataFrame to upload (optional if file_path is provided)
+            df: pandas DataFrame OR dict of {dataset_name: [DataFrames]} to upload (optional if file_path is provided)
             filename: Name to give the uploaded file (default: "data.csv")
             file_path: Path to CSV, Parquet, JSON, or JSONL file to upload (optional if df is provided)
             column_overrides: Dict mapping column names to types ("scalar", "set", "free_string", "free_string_list")
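Because the loader in the next hunk also accepts file paths as list entries, a dataset's list can mix in-memory DataFrames with on-disk files. A sketch under that assumption (the file names are placeholders):

```python
import pandas as pd

# One dataset mixes a DataFrame with a Parquet file; another is file-only.
# The client loads each file (.csv, .csv.gz, .parquet, .json, .jsonl), tags
# every row with __featrix_dataset_name, and concatenates before upload.
train_df = pd.DataFrame({'price': [1.0, 2.0], 'sku': ['a', 'b']})

datasets = {
    'training_data': [train_df, 'extra_train.parquet'],  # DataFrame + path
    'holdout_data': ['holdout.csv'],                     # path only
}

session = client.upload_df_and_create_session(df=datasets)
```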
@@ -2005,6 +2133,80 @@ class FeatrixSphereClient:
         if column_types is not None:
             column_overrides = column_types
 
+        # Handle dictionary of datasets input
+        if df is not None and isinstance(df, dict):
+            print("Detected dictionary of datasets - concatenating with __featrix_dataset_name labels")
+            all_dataframes = []
+            total_rows = 0
+
+            for dataset_name, tables in df.items():
+                if not isinstance(tables, list):
+                    raise ValueError(f"Value for dataset '{dataset_name}' must be a list of DataFrames/file paths, got {type(tables)}")
+
+                for i, table in enumerate(tables):
+                    # Handle file path (string)
+                    if isinstance(table, str):
+                        file_path_to_load = str(table)
+
+                        if not os.path.exists(file_path_to_load):
+                            raise FileNotFoundError(f"File not found in dataset '{dataset_name}': {file_path_to_load}")
+
+                        # Determine file type and load
+                        file_ext = file_path_to_load.lower()
+                        print(f"  - {dataset_name} loading file: {os.path.basename(file_path_to_load)}")
+
+                        if file_ext.endswith('.parquet'):
+                            loaded_df = pd.read_parquet(file_path_to_load)
+                        elif file_ext.endswith(('.json', '.jsonl')):
+                            try:
+                                from featrix.neural.input_data_file import featrix_wrap_read_json_file
+                                loaded_df = featrix_wrap_read_json_file(file_path_to_load)
+                                if loaded_df is None:
+                                    raise ValueError(f"Failed to parse {'JSONL' if file_ext.endswith('.jsonl') else 'JSON'} file")
+                            except ImportError:
+                                # Fallback to pandas
+                                if file_ext.endswith('.jsonl'):
+                                    import json
+                                    records = []
+                                    with open(file_path_to_load, 'r', encoding='utf-8') as f:
+                                        for line in f:
+                                            if line.strip():
+                                                records.append(json.loads(line))
+                                    loaded_df = pd.DataFrame(records)
+                                else:
+                                    loaded_df = pd.read_json(file_path_to_load)
+                        elif file_ext.endswith(('.csv', '.csv.gz')):
+                            loaded_df = pd.read_csv(file_path_to_load)
+                        else:
+                            raise ValueError(f"Unsupported file type in dataset '{dataset_name}': {file_path_to_load}. "
+                                             f"Supported: .csv, .csv.gz, .parquet, .json, .jsonl")
+
+                        labeled_table = loaded_df
+                        print(f"    Loaded {len(loaded_df)} rows, {len(loaded_df.columns)} columns")
+
+                    # Handle DataFrame
+                    elif isinstance(table, pd.DataFrame):
+                        # Create a copy to avoid modifying the original
+                        labeled_table = table.copy()
+                        print(f"  - {dataset_name} DataFrame {i+1}: {len(labeled_table)} rows, {len(labeled_table.columns)} columns")
+
+                    else:
+                        raise ValueError(f"Table {i} in dataset '{dataset_name}' must be a pandas DataFrame or file path (str), got {type(table)}")
+
+                    # Add the dataset name label column
+                    labeled_table['__featrix_dataset_name'] = dataset_name
+
+                    all_dataframes.append(labeled_table)
+                    total_rows += len(labeled_table)
+
+            if not all_dataframes:
+                raise ValueError("No DataFrames found in the provided dictionary")
+
+            # Concatenate all dataframes
+            print(f"Concatenating {len(all_dataframes)} tables from {len(df)} datasets ({total_rows} total rows)")
+            df = pd.concat(all_dataframes, ignore_index=True)
+            print(f"Combined DataFrame: {len(df)} rows, {len(df.columns)} columns (includes __featrix_dataset_name)")
+
         # Validate inputs
         if df is None and file_path is None:
             raise ValueError("Either df or file_path must be provided")
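For intuition about what the block above does to the input, the label-and-concatenate step reduces to a few lines of plain pandas. This standalone sketch mirrors the client-side transformation:

```python
import pandas as pd

df_a = pd.DataFrame({'x': [1, 2]})
df_b = pd.DataFrame({'x': [3]})

labeled = []
for dataset_name, tables in {'train': [df_a], 'test': [df_b]}.items():
    for table in tables:
        t = table.copy()                       # avoid mutating the caller's frame
        t['__featrix_dataset_name'] = dataset_name
        labeled.append(t)

combined = pd.concat(labeled, ignore_index=True)
print(combined)
#    x __featrix_dataset_name
# 0  1                  train
# 1  2                  train
# 2  3                   test
```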
@@ -4236,25 +4438,21 @@ class FeatrixSphereClient:
     def clone_in_progress_embedding_space(self, session_id: str, from_compute: str, to_compute: str,
                                           es_id: str = None, new_session_name: str = None) -> Dict[str, Any]:
         """
-        Clone
+        INTERNAL: Clone embedding space between compute nodes.
 
-
-
-
+        Note: With the backplane system, users generally don't need to manually clone.
+        Sessions are automatically available across all compute nodes.
+        This method is kept for backward compatibility and special cases.
 
         Args:
-            session_id: Source session ID
-            from_compute: Source
-            to_compute: Destination
-            es_id: Optional ES ID
-            new_session_name: Optional name for
+            session_id: Source session ID
+            from_compute: Source node name
+            to_compute: Destination node name
+            es_id: Optional ES ID (if session has multiple)
+            new_session_name: Optional name for cloned session
 
         Returns:
-            Dict with new_session_id
-
-        Raises:
-            ValueError: If multiple ES found in session and es_id not provided
-            HTTPException: If cloning fails
+            Dict with new_session_id
         """
         # Prepare request data
         request_data = {
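Given the INTERNAL note, direct calls should be rare; for the backward-compatibility cases the docstring reserves, usage follows straight from the signature. A sketch with placeholder node names:

```python
# Rarely needed now that the backplane shares sessions across nodes.
# "node-a"/"node-b" are placeholders for your deployment's compute nodes.
result = client.clone_in_progress_embedding_space(
    session_id="abc-123",
    from_compute="node-a",
    to_compute="node-b",
    new_session_name="abc-123-clone",
)
print(result['new_session_id'])
```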
@@ -4322,40 +4520,6 @@ class FeatrixSphereClient:
         print(f"Training predictor on foundation model {foundation_model_id}...")
         print(f"   Target: {target_column} ({target_column_type})")
 
-        # Get the compute cluster from the foundation model session
-        # This ensures we upload files to the same node where the foundation model lives
-        # If the foundation session doesn't exist (404), we'll proceed with current compute cluster
-        foundation_compute_cluster = None
-        try:
-            foundation_session = self.get_session_status(foundation_model_id)
-            foundation_compute_cluster = self.get_last_server_metadata()
-            foundation_compute_cluster = foundation_compute_cluster.get('compute_cluster') if foundation_compute_cluster else None
-        except Exception as e:
-            # Foundation session might not exist or be accessible - that's okay
-            # The server will validate it when we submit the training request
-            if verbose:
-                # Check if it's a 404 HTTP error
-                is_404 = False
-                if isinstance(e, requests.exceptions.HTTPError):
-                    if hasattr(e, 'response') and e.response.status_code == 404:
-                        is_404 = True
-
-                if is_404:
-                    print(f"   ⚠️ Foundation session not found (404) - will use current compute cluster")
-                    print(f"   Server will validate foundation model when training starts")
-                else:
-                    print(f"   ⚠️ Could not fetch foundation session: {e}")
-                    print(f"   Will proceed with current compute cluster")
-
-        # Temporarily set compute cluster for file uploads if we found one
-        original_compute_cluster = self.compute_cluster
-        original_headers = self.session.headers.copy()
-        if foundation_compute_cluster:
-            self.set_compute_cluster(foundation_compute_cluster)
-            if verbose:
-                print(f"   Using compute cluster: {foundation_compute_cluster}")
-        elif verbose and self.compute_cluster:
-            print(f"   Using current compute cluster: {self.compute_cluster}")
 
         try:
             # Validate that only one data source is provided
@@ -4453,20 +4617,9 @@ class FeatrixSphereClient:
 
             new_session_id = response_data.get('session_id')
             print(f"✅ Predictor training session created: {new_session_id}")
-
-
-
-            if original_compute_cluster:
-                self.set_compute_cluster(original_compute_cluster)
-            else:
-                self.session.headers = original_headers
-        finally:
-            # Ensure we restore headers even if there's an error
-            if original_compute_cluster != self.compute_cluster:
-                if original_compute_cluster:
-                    self.set_compute_cluster(original_compute_cluster)
-                else:
-                    self.session.headers = original_headers
+
+        except Exception as e:
+            raise
 
         if verbose:
             print(f"⏳ Waiting for training to complete...")
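The net effect of the last two hunks: `train_on_foundational_model()` no longer pins uploads to the foundation model's node or restores headers afterward, since the backplane makes the foundation session visible everywhere and the server validates `foundation_model_id` when training starts. Calls stay as simple as the earlier docstring example (the ID below is a placeholder):

```python
# No compute-cluster setup or teardown around the call anymore.
client.train_on_foundational_model(
    foundation_model_id="my-foundation-id",
    target_column="price",
    target_column_type="scalar",
)
```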
@@ -5079,7 +5232,7 @@ class FeatrixSphereClient:
         Extend embedding space training with new data.
 
         This function:
-        1.
+        1. Creates a new session with the existing embedding space
         2. Uploads/processes the new data
         3. Continues training from where the previous training left off
         4. Trains for the specified number of additional epochs (data_passes)
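The four steps suggest a single call that bundles them. The method's real name and signature are not visible in this hunk, so the sketch below is purely illustrative, with a hypothetical name and parameters:

```python
# Hypothetical name and parameters - only data_passes is attested by the
# docstring above. Illustrates the documented four-step flow.
new_session = client.extend_embedding_space_training(
    session_id="abc-123",     # session holding the existing embedding space
    df=new_data_df,           # new data to upload/process
    data_passes=10,           # additional epochs to train
)
```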
featrixsphere-0.2.2279.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+featrixsphere/__init__.py,sha256=CdXrzec_MfZTUpRyajYawHBu5iEWAe-flVrlXjiLLN8,1888
+featrixsphere/client.py,sha256=ez8ML3OMHMMO0ewzOGNgF1tg3KYygLG2NbEw33sf3AA,408591
+featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
+featrixsphere-0.2.2279.dist-info/METADATA,sha256=2hEITDtsSYmMLgl0RfqvcBzac4EBAH3DN4TSVf_1rPY,16232
+featrixsphere-0.2.2279.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+featrixsphere-0.2.2279.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
+featrixsphere-0.2.2279.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
+featrixsphere-0.2.2279.dist-info/RECORD,,
featrixsphere-0.2.1830.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
-featrixsphere/__init__.py,sha256=I27lMJL_tBPzKyo_79loiIS83AAC-vuoz1kA3ZY2fhc,1888
-featrixsphere/client.py,sha256=L97tRb-6pCvP7lKYOsK4iYHfsFt3V0URu4RO1mQzFoQ,401468
-featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
-featrixsphere-0.2.1830.dist-info/METADATA,sha256=IFITUpYkfYT2s7WXDX0-5Xl-iiUBt6bb69-Mr9_w6O8,16232
-featrixsphere-0.2.1830.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-featrixsphere-0.2.1830.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
-featrixsphere-0.2.1830.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
-featrixsphere-0.2.1830.dist-info/RECORD,,
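For reference, each RECORD line is `path,sha256=<urlsafe-base64 SHA-256 digest, padding stripped>,size-in-bytes` per the wheel spec (PEP 427). A small sketch that recomputes an entry for a file from an unpacked wheel:

```python
import base64
import hashlib

def record_entry(path: str) -> str:
    """Build a wheel RECORD-style line: path,sha256=<digest>,<size>."""
    with open(path, 'rb') as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=')
    return f"{path},sha256={digest.decode('ascii')},{len(data)}"

# After unpacking the 0.2.2279 wheel, this should reproduce the line above:
# featrixsphere/client.py,sha256=ez8ML3OMHMMO0ewzOGNgF1tg3KYygLG2NbEw33sf3AA,408591
print(record_entry("featrixsphere/client.py"))
```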
{featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/WHEEL
File without changes
{featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/entry_points.txt
File without changes
{featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/top_level.txt
File without changes