featrixsphere 0.2.3737__py3-none-any.whl → 0.2.4983__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featrixsphere/__init__.py +1 -1
- featrixsphere/client.py +426 -123
- {featrixsphere-0.2.3737.dist-info → featrixsphere-0.2.4983.dist-info}/METADATA +1 -1
- featrixsphere-0.2.4983.dist-info/RECORD +8 -0
- featrixsphere-0.2.3737.dist-info/RECORD +0 -8
- {featrixsphere-0.2.3737.dist-info → featrixsphere-0.2.4983.dist-info}/WHEEL +0 -0
- {featrixsphere-0.2.3737.dist-info → featrixsphere-0.2.4983.dist-info}/entry_points.txt +0 -0
- {featrixsphere-0.2.3737.dist-info → featrixsphere-0.2.4983.dist-info}/top_level.txt +0 -0
featrixsphere/__init__.py
CHANGED
featrixsphere/client.py
CHANGED
@@ -11,7 +11,7 @@ import time
 import requests
 from pathlib import Path
 from typing import Dict, Any, Optional, List, Tuple, Union
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import gzip
 import os
 import random
@@ -65,6 +65,7 @@ class SessionInfo:
     status: str
     jobs: Dict[str, Any]
     job_queue_positions: Dict[str, Any]
+    job_plan: List[Dict[str, Any]] = field(default_factory=list)
     _client: Optional['FeatrixSphereClient'] = None

     def predictors(self) -> List[Dict[str, Any]]:
@@ -82,6 +83,51 @@ class SessionInfo:
             return list(predictors_dict.values())
         except Exception:
             return []
+
+    def embedding_space_info(self) -> Optional[Dict[str, Any]]:
+        """
+        Get embedding space information for this session.
+
+        Returns:
+            Dictionary with ES info (dimensions, epochs, etc.) or None if not available
+        """
+        if not self._client:
+            return None
+
+        try:
+            # Get session details from the client
+            session_data = self._client._get_json(f"/compute/session/{self.session_id}")
+
+            es_info = {}
+
+            # Extract embedding space path
+            embedding_space_path = session_data.get('embedding_space')
+            if embedding_space_path:
+                es_info['embedding_space_path'] = embedding_space_path
+
+            # Extract model architecture info
+            model_info = session_data.get('model_info', {}) or session_data.get('embedding_space', {})
+            if isinstance(model_info, dict):
+                es_info['d_model'] = model_info.get('d_model') or model_info.get('embedding_dim')
+                es_info['dimensions'] = es_info.get('d_model')  # Alias for compatibility
+                es_info['parameter_count'] = model_info.get('parameter_count') or model_info.get('num_parameters')
+                es_info['layer_count'] = model_info.get('layer_count') or model_info.get('num_layers')
+
+            # Extract training statistics
+            training_stats = session_data.get('training_stats', {}) or session_data.get('stats', {})
+            if isinstance(training_stats, dict):
+                es_info['epochs'] = training_stats.get('final_epoch') or training_stats.get('epochs_trained') or training_stats.get('epochs')
+                es_info['final_loss'] = training_stats.get('final_loss') or training_stats.get('loss')
+                es_info['final_val_loss'] = training_stats.get('final_val_loss') or training_stats.get('validation_loss')
+                es_info['training_time_seconds'] = training_stats.get('training_time') or training_stats.get('elapsed_seconds')
+
+            # If we have any info, return it
+            if es_info:
+                return es_info
+
+            return None
+        except Exception:
+            return None


 class PredictionBatch:
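A minimal usage sketch for the new `SessionInfo.embedding_space_info()` helper. The base URL and session ID are placeholders, and any of the returned keys may be None; this is not part of the diff itself.

```python
# Sketch only: assumes a reachable Sphere API and a real session ID.
client = FeatrixSphereClient("https://sphere-api.featrix.com")
session_info = client.get_session_status("YOUR-SESSION-ID")

es_info = session_info.embedding_space_info()
if es_info:
    print("dimensions:", es_info.get("dimensions"))
    print("epochs trained:", es_info.get("epochs"))
    print("final validation loss:", es_info.get("final_val_loss"))
else:
    print("Embedding space info not available yet")
```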
@@ -617,6 +663,7 @@ class FeatrixSphereClient:
             status=response_data.get('status', 'unknown'),
             jobs={},
             job_queue_positions={},
+            job_plan=[],
             _client=self
         )
@@ -640,6 +687,7 @@ class FeatrixSphereClient:
         session = response_data.get('session', {})
         jobs = response_data.get('jobs', {})
         positions = response_data.get('job_queue_positions', {})
+        job_plan = session.get('job_plan', [])

         return SessionInfo(
             session_id=session.get('session_id', session_id),
@@ -647,6 +695,7 @@ class FeatrixSphereClient:
             status=session.get('status', 'unknown'),
             jobs=jobs,
             job_queue_positions=positions,
+            job_plan=job_plan,
             _client=self
         )
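With `job_plan` now threaded through `get_session_status`, callers can inspect the planned jobs directly. A small sketch follows; the key names (`job_type`, `spec`, `target_column`) are assumptions taken from the fallback logic added later in this diff, and `client`/`session_id` are assumed to already exist.

```python
# Sketch: walk the job plan attached to a session (keys are assumptions, see lead-in).
session_info = client.get_session_status(session_id)

for job in session_info.job_plan:
    job_type = job.get('job_type')   # e.g. 'train_single_predictor'
    spec = job.get('spec', {})       # per-job parameters
    print(job_type, spec.get('target_column'))
```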
@@ -1370,6 +1419,20 @@ class FeatrixSphereClient:

         start_time = time.time()

+        # Initial wait for job dispatch (jobs are dispatched asynchronously after session creation)
+        # Wait up to 10 seconds for jobs to appear before starting main monitoring loop
+        initial_wait_timeout = 10
+        initial_wait_start = time.time()
+        jobs_appeared = False
+
+        while time.time() - initial_wait_start < initial_wait_timeout:
+            session_info = self.get_session_status(session_id)
+            if session_info.jobs:
+                jobs_appeared = True
+                break
+            time.sleep(0.5)  # Check every 500ms during initial wait
+
+        # Main monitoring loop
         while time.time() - start_time < max_wait_time:
             session_info = self.get_session_status(session_id)
             elapsed = time.time() - start_time
@@ -1842,7 +1905,8 @@ class FeatrixSphereClient:
             session_type=response_data.get('session_type', 'embedding_space'),
             status=response_data.get('status', 'ready'),
             jobs={},
-            job_queue_positions={}
+            job_queue_positions={},
+            job_plan=[]
         )

     def fine_tune_embedding_space(
@@ -1972,7 +2036,144 @@ class FeatrixSphereClient:
             session_type=response_data.get('session_type', 'embedding_space_finetune'),
             status=response_data.get('status', 'ready'),
             jobs={},
-            job_queue_positions={}
+            job_queue_positions={},
+            job_plan=[]
+        )
+
+    def extend_embedding_space(
+        self,
+        name: str,
+        parent_session_id: str = None,
+        parent_embedding_space_path: str = None,
+        s3_training_dataset: str = None,
+        s3_validation_dataset: str = None,
+        n_epochs: int = None,
+        webhooks: Dict[str, str] = None,
+        user_metadata: Dict[str, Any] = None
+    ) -> SessionInfo:
+        """
+        Extend an existing embedding space with new feature columns.
+
+        This method takes a pre-trained embedding space and extends it with new feature columns
+        from enriched training/validation data. The extended ES preserves existing encoder weights
+        and creates new codecs for the new columns.
+
+        **When to Use Extend vs Fine-Tune:**
+        - **Extend**: When you've added NEW COLUMNS (features) to your dataset
+        - **Fine-Tune**: When you have new rows with the SAME COLUMNS
+
+        **How It Works:**
+        1. Loads the parent embedding space
+        2. Identifies new columns in the enriched dataset
+        3. Creates codecs for the new columns
+        4. Copies existing encoder weights (preserves learned representations)
+        5. Fine-tunes for a shorter duration (default: original_epochs / 4)
+        6. Returns extended embedding space with all columns
+
+        Args:
+            name: Name for the extended embedding space
+            parent_session_id: Session ID of the parent embedding space (optional)
+            parent_embedding_space_path: Direct path to parent embedding space pickle file (optional)
+            s3_training_dataset: S3 URL for enriched training dataset with new columns (must start with 's3://')
+            s3_validation_dataset: S3 URL for enriched validation dataset with new columns (must start with 's3://')
+            n_epochs: Number of epochs for extension training (default: original_epochs / 4)
+            webhooks: Optional dict with webhook configuration keys
+            user_metadata: Optional metadata dict to attach to the session
+
+        Returns:
+            SessionInfo for the newly created extension session
+
+        Raises:
+            ValueError: If S3 URLs are invalid or neither parent identifier is provided
+
+        Example:
+            ```python
+            # Extend an existing embedding space with new feature columns
+            client = FeatrixSphereClient("https://sphere-api.featrix.com")
+
+            # Original ES was trained on: age, income, credit_score
+            # New data includes engineered features: debt_to_income_ratio, age_bin
+            extended = client.extend_embedding_space(
+                name="customer_model_with_features",
+                parent_session_id="abc123-20240101-120000",
+                s3_training_dataset="s3://my-bucket/enriched_training.csv",
+                s3_validation_dataset="s3://my-bucket/enriched_validation.csv",
+                n_epochs=25  # Optional: specify epochs (defaults to original/4)
+            )
+
+            # Wait for extension to complete
+            client.wait_for_session_completion(extended.session_id)
+
+            # The extended ES now includes the new feature columns
+            # Use it for predictions with enriched data
+            result = client.predict(extended.session_id, {
+                "age": 35,
+                "income": 75000,
+                "credit_score": 720,
+                "debt_to_income_ratio": 0.25,  # New feature!
+                "age_bin": "30-40"  # New feature!
+            })
+            ```
+        """
+        # Validate S3 URLs
+        if s3_training_dataset and not s3_training_dataset.startswith('s3://'):
+            raise ValueError("s3_training_dataset must be a valid S3 URL (s3://...)")
+        if s3_validation_dataset and not s3_validation_dataset.startswith('s3://'):
+            raise ValueError("s3_validation_dataset must be a valid S3 URL (s3://...)")
+
+        # Validate that we have either parent_session_id or parent_embedding_space_path
+        if not parent_session_id and not parent_embedding_space_path:
+            raise ValueError("Either parent_session_id or parent_embedding_space_path must be provided")
+
+        print(f"Extending embedding space '{name}' with new features...")
+        if parent_session_id:
+            print(f"   Parent session: {parent_session_id}")
+        if parent_embedding_space_path:
+            print(f"   Parent embedding space: {parent_embedding_space_path}")
+        print(f"   Enriched training data: {s3_training_dataset}")
+        print(f"   Enriched validation data: {s3_validation_dataset}")
+        if n_epochs:
+            print(f"   Extension epochs: {n_epochs}")
+        else:
+            print(f"   Extension epochs: auto (original/4)")
+
+        data = {
+            "name": name,
+            "s3_file_data_set_training": s3_training_dataset,
+            "s3_file_data_set_validation": s3_validation_dataset
+        }
+
+        if parent_session_id:
+            data["parent_session_id"] = parent_session_id
+        if parent_embedding_space_path:
+            data["parent_embedding_space_path"] = parent_embedding_space_path
+        if n_epochs is not None:
+            data["n_epochs"] = n_epochs
+
+        if webhooks:
+            data['webhooks'] = webhooks
+        if user_metadata:
+            import json
+            data['user_metadata'] = json.dumps(user_metadata)
+            print(f"User metadata: {user_metadata}")
+
+        response_data = self._post_json("/compute/extend-embedding-space", data)
+
+        session_id = response_data.get('session_id')
+        extend_info = response_data.get('extend_es_info', {})
+
+        print(f"Extension session created: {session_id}")
+        if extend_info:
+            print(f"   Original epochs: {extend_info.get('original_epochs', 'N/A')}")
+            print(f"   Extension epochs: {extend_info.get('extension_epochs', 'N/A')}")
+
+        return SessionInfo(
+            session_id=session_id,
+            session_type=response_data.get('session_type', 'embedding_space_extend'),
+            status=response_data.get('status', 'ready'),
+            jobs={},
+            job_queue_positions={},
+            job_plan=[]
         )

     # =========================================================================
@@ -2033,7 +2234,8 @@ class FeatrixSphereClient:
             session_type=response_data.get('session_type', 'sphere'),
             status=response_data.get('status', 'ready'),
             jobs={},
-            job_queue_positions={}
+            job_queue_positions={},
+            job_plan=[]
         )

     def upload_df_and_create_session(self, df=None, filename: str = "data.csv", file_path: str = None,
@@ -2309,23 +2511,57 @@ class FeatrixSphereClient:
                 compression_ratio = (1 - compressed_size / original_size) * 100
                 print(f"Converted Parquet to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
             else:
-                # Regular CSV file -
+                # Regular CSV file - check size and suggest Parquet for large files
                 with open(file_path, 'rb') as f:
                     csv_content = f.read()

-                compressed_buffer = io.BytesIO()
-                with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
-                    gz.write(csv_content)
-                file_content = compressed_buffer.getvalue()
-                upload_filename = os.path.basename(file_path) + '.gz'
-                content_type = 'application/gzip'
+                csv_size_mb = len(csv_content) / (1024 * 1024)
+                CSV_WARNING_THRESHOLD_MB = 1.0  # Warn if CSV > 1MB

+                if csv_size_mb > CSV_WARNING_THRESHOLD_MB:
+                    print(f"\n⚠️ Warning: CSV file is {csv_size_mb:.1f} MB")
+                    print(f"   Parquet format is more efficient for large files (smaller size, faster upload).")
+                    print(f"   Converting to Parquet format for better performance...")
+
+                    # Read CSV as DataFrame
+                    csv_df = pd.read_csv(file_path)
+
+                    # Convert to Parquet in memory
+                    parquet_buffer = io.BytesIO()
+                    try:
+                        # Try pyarrow first (faster), fallback to fastparquet
+                        csv_df.to_parquet(parquet_buffer, index=False, engine='pyarrow')
+                    except (ImportError, ValueError):
+                        # Fallback to fastparquet or default engine
+                        try:
+                            csv_df.to_parquet(parquet_buffer, index=False, engine='fastparquet')
+                        except (ImportError, ValueError):
+                            # Last resort: use default engine
+                            csv_df.to_parquet(parquet_buffer, index=False)
+                    parquet_content = parquet_buffer.getvalue()
+                    parquet_size_mb = len(parquet_content) / (1024 * 1024)
+
+                    # Use Parquet instead of compressed CSV
+                    file_content = parquet_content
+                    upload_filename = os.path.basename(file_path).replace('.csv', '.parquet')
+                    content_type = 'application/octet-stream'
+
+                    size_reduction = (1 - len(parquet_content) / len(csv_content)) * 100
+                    print(f"   ✅ Converted to Parquet: {csv_size_mb:.1f} MB → {parquet_size_mb:.1f} MB ({size_reduction:.1f}% reduction)")
+                else:
+                    # Small CSV - compress as before
+                    print("Compressing CSV file...")
+                    compressed_buffer = io.BytesIO()
+                    with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+                        gz.write(csv_content)
+                    file_content = compressed_buffer.getvalue()
+                    upload_filename = os.path.basename(file_path) + '.gz'
+                    content_type = 'application/gzip'
+
+                    original_size = len(csv_content)
+                    compressed_size = len(file_content)
+                    compression_ratio = (1 - compressed_size / original_size) * 100
+                    print(f"Compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")

         # Handle DataFrame input
         else:
|
|
|
2334
2570
|
|
|
2335
2571
|
print(f"Uploading DataFrame ({len(df)} rows, {len(df.columns)} columns)")
|
|
2336
2572
|
|
|
2337
|
-
# Clean NaN values in DataFrame before
|
|
2573
|
+
# Clean NaN values in DataFrame before conversion
|
|
2338
2574
|
# This prevents JSON encoding issues when the server processes the data
|
|
2339
2575
|
# Use pandas.notna() with where() for compatibility with all pandas versions
|
|
2340
2576
|
cleaned_df = df.where(pd.notna(df), None) # Replace NaN with None for JSON compatibility
|
|
2341
2577
|
|
|
2342
|
-
#
|
|
2343
|
-
|
|
2344
|
-
|
|
2345
|
-
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
|
|
2354
|
-
|
|
2355
|
-
|
|
2356
|
-
|
|
2357
|
-
|
|
2358
|
-
|
|
2359
|
-
|
|
2578
|
+
# Always use Parquet format for DataFrames (smaller, faster than CSV.gz)
|
|
2579
|
+
print("Converting DataFrame to Parquet format...")
|
|
2580
|
+
parquet_buffer = io.BytesIO()
|
|
2581
|
+
try:
|
|
2582
|
+
# Try pyarrow first (faster), fallback to fastparquet
|
|
2583
|
+
cleaned_df.to_parquet(parquet_buffer, index=False, engine='pyarrow')
|
|
2584
|
+
except (ImportError, ValueError):
|
|
2585
|
+
# Fallback to fastparquet or default engine
|
|
2586
|
+
try:
|
|
2587
|
+
cleaned_df.to_parquet(parquet_buffer, index=False, engine='fastparquet')
|
|
2588
|
+
except (ImportError, ValueError):
|
|
2589
|
+
# Last resort: use default engine
|
|
2590
|
+
cleaned_df.to_parquet(parquet_buffer, index=False)
|
|
2591
|
+
|
|
2592
|
+
file_content = parquet_buffer.getvalue()
|
|
2593
|
+
parquet_size_mb = len(file_content) / (1024 * 1024)
|
|
2594
|
+
upload_filename = filename.replace('.csv', '.parquet') if filename.endswith('.csv') else filename + '.parquet'
|
|
2595
|
+
content_type = 'application/octet-stream'
|
|
2596
|
+
|
|
2597
|
+
print(f"✅ Saved as Parquet: {parquet_size_mb:.2f} MB")
|
|
2360
2598
|
|
|
2361
2599
|
# Upload the compressed file with optional column overrides
|
|
2362
2600
|
files = {'file': (upload_filename, file_content, content_type)}
|
|
@@ -2403,9 +2641,18 @@ class FeatrixSphereClient:
|
|
|
2403
2641
|
file_size_mb = len(file_content) / (1024 * 1024)
|
|
2404
2642
|
CHUNK_SIZE_MB = 512 # 512 MB chunk size
|
|
2405
2643
|
CHUNK_SIZE_BYTES = CHUNK_SIZE_MB * 1024 * 1024
|
|
2644
|
+
LARGE_FILE_WARNING_MB = 10 # Warn if file > 10 MB
|
|
2645
|
+
|
|
2646
|
+
if file_size_mb > LARGE_FILE_WARNING_MB:
|
|
2647
|
+
print(f"\n⚠️ Warning: File size ({file_size_mb:.1f} MB) is quite large")
|
|
2648
|
+
print(f" For very large files (>10 MB), consider using S3 uploads:")
|
|
2649
|
+
print(f" 1. Upload your file to S3 (or your cloud storage)")
|
|
2650
|
+
print(f" 2. Generate a signed/private URL with read access")
|
|
2651
|
+
print(f" 3. Contact Featrix support to configure S3-based uploads")
|
|
2652
|
+
print(f" This can be more reliable than direct uploads for large datasets.")
|
|
2406
2653
|
|
|
2407
2654
|
if file_size_mb > CHUNK_SIZE_MB:
|
|
2408
|
-
print(f"⚠️ Warning: File size ({file_size_mb:.1f} MB) exceeds {CHUNK_SIZE_MB} MB threshold")
|
|
2655
|
+
print(f"\n⚠️ Warning: File size ({file_size_mb:.1f} MB) exceeds {CHUNK_SIZE_MB} MB threshold")
|
|
2409
2656
|
print(f" Large uploads may timeout. Consider splitting the data or using smaller batches.")
|
|
2410
2657
|
|
|
2411
2658
|
# Try upload with retry on 504
|
|
@@ -2451,7 +2698,8 @@ class FeatrixSphereClient:
|
|
|
2451
2698
|
session_type=response_data.get('session_type', 'sphere'),
|
|
2452
2699
|
status=response_data.get('status', 'ready'),
|
|
2453
2700
|
jobs={},
|
|
2454
|
-
job_queue_positions={}
|
|
2701
|
+
job_queue_positions={},
|
|
2702
|
+
job_plan=[]
|
|
2455
2703
|
)
|
|
2456
2704
|
|
|
2457
2705
|
|
|
@@ -2564,7 +2812,8 @@ class FeatrixSphereClient:
|
|
|
2564
2812
|
# =========================================================================
|
|
2565
2813
|
|
|
2566
2814
|
def predict(self, session_id: str, record: Dict[str, Any], target_column: str = None,
|
|
2567
|
-
predictor_id: str = None,
|
|
2815
|
+
predictor_id: str = None, best_metric_preference: str = None,
|
|
2816
|
+
max_retries: int = None, queue_batches: bool = False) -> Dict[str, Any]:
|
|
2568
2817
|
"""
|
|
2569
2818
|
Make a single prediction for a record.
|
|
2570
2819
|
|
|
@@ -2573,6 +2822,7 @@ class FeatrixSphereClient:
|
|
|
2573
2822
|
record: Record dictionary (without target column)
|
|
2574
2823
|
target_column: Specific target column predictor to use (required if multiple predictors exist and predictor_id not specified)
|
|
2575
2824
|
predictor_id: Specific predictor ID to use (recommended - more precise than target_column)
|
|
2825
|
+
best_metric_preference: Which metric checkpoint to use: "roc_auc", "pr_auc", or None (use default checkpoint) (default: None)
|
|
2576
2826
|
max_retries: Number of retries for errors (default: uses client default)
|
|
2577
2827
|
queue_batches: If True, queue this prediction for batch processing instead of immediate API call
|
|
2578
2828
|
|
|
@@ -2595,41 +2845,24 @@ class FeatrixSphereClient:
             queue_id = self._add_to_prediction_queue(session_id, record, target_column, predictor_id)
             return {"queued": True, "queue_id": queue_id}

-        #
-        predictors = self._get_available_predictors(session_id, debug=False)
-        if len(predictors) > 1 and not target_column and not predictor_id:
-            available_targets = [p.get('target_column') for p in predictors.values() if p.get('target_column')]
-            available_ids = list(predictors.keys())
-            raise ValueError(
-                f"Session {session_id} has {len(predictors)} predictors. "
-                f"You must specify either 'target_column' or 'predictor_id'.\n"
-                f"Available target columns: {available_targets}\n"
-                f"Available predictor IDs: {available_ids}\n"
-                f"Use client.list_predictors('{session_id}') to see details."
-            )
-
-        # Resolve predictor information (handles both predictor_id and target_column)
-        predictor_info = self._resolve_predictor_id(session_id, predictor_id, target_column)
-        validated_target_column = predictor_info['target_column']
-        resolved_predictor_id = predictor_info['predictor_id']
-
-        # Clean NaN/Inf values and remove target column
+        # Clean NaN/Inf values
         cleaned_record = self._clean_numpy_values(record)
-        # Additional NaN cleaning for JSON encoding
         cleaned_record = self.replace_nans_with_nulls(cleaned_record)
-        cleaned_records = self._remove_target_columns(session_id, [cleaned_record], validated_target_column)
-        final_record = cleaned_records[0] if cleaned_records else cleaned_record

-        #
+        # Build request payload - let the server handle predictor resolution
         request_payload = {
-            "query_record":
-            "target_column": validated_target_column
+            "query_record": cleaned_record,
         }

-        # Include
-        if
-        request_payload["
+        # Include whatever the caller provided - server will figure it out
+        if target_column:
+            request_payload["target_column"] = target_column
+        if predictor_id:
+            request_payload["predictor_id"] = predictor_id
+        if best_metric_preference:
+            request_payload["best_metric_preference"] = best_metric_preference

+        # Just send it to the server - it has all the smart fallback logic
         response_data = self._post_json(f"/session/{session_id}/predict", request_payload, max_retries=max_retries)
         return response_data
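A hedged usage sketch for the simplified `predict()` call; the session and predictor IDs are placeholders, and `client` is assumed to exist.

```python
# The server resolves the predictor; target_column, predictor_id, and
# best_metric_preference are all optional.
result = client.predict(
    "YOUR-SESSION-ID",
    {"age": 35, "income": 75000, "credit_score": 720},
    predictor_id="YOUR-PREDICTOR-ID",   # optional, more precise than target_column
    best_metric_preference="roc_auc",   # or "pr_auc"; omit to use the default checkpoint
)
print(result)
```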
@@ -2712,7 +2945,7 @@ class FeatrixSphereClient:
     def plot_training_loss(self, session_id: str, figsize: Tuple[int, int] = (12, 8),
                            style: str = 'notebook', save_path: Optional[str] = None,
                            show_learning_rate: bool = True, smooth: bool = True,
-                           title: Optional[str] = None)
+                           title: Optional[str] = None):
         """
         Plot comprehensive training loss curves for a session (both embedding space and single predictor).

@@ -2800,7 +3033,7 @@ class FeatrixSphereClient:
     def plot_embedding_space_training(self, session_id: str, figsize: Tuple[int, int] = (10, 6),
                                       style: str = 'notebook', save_path: Optional[str] = None,
-                                      show_mutual_info: bool = False)
+                                      show_mutual_info: bool = False):
         """
         Plot detailed embedding space training metrics.

@@ -2876,7 +3109,7 @@ class FeatrixSphereClient:
     def plot_single_predictor_training(self, session_id: str, figsize: Tuple[int, int] = (10, 6),
                                        style: str = 'notebook', save_path: Optional[str] = None,
-                                       show_metrics: bool = True)
+                                       show_metrics: bool = True):
         """
         Plot detailed single predictor training metrics.

@@ -2952,7 +3185,7 @@ class FeatrixSphereClient:
     def plot_training_comparison(self, session_ids: List[str], labels: Optional[List[str]] = None,
                                  figsize: Tuple[int, int] = (12, 8), style: str = 'notebook',
-                                 save_path: Optional[str] = None)
+                                 save_path: Optional[str] = None):
         """
         Compare training curves across multiple sessions.

@@ -3205,7 +3438,7 @@ class FeatrixSphereClient:
     def plot_embedding_space_3d(self, session_id: str, sample_size: int = 2000,
                                 color_by: Optional[str] = None, size_by: Optional[str] = None,
                                 interactive: bool = True, style: str = 'notebook',
-                                title: Optional[str] = None, save_path: Optional[str] = None)
+                                title: Optional[str] = None, save_path: Optional[str] = None):
         """
         Create interactive 3D visualization of the embedding space.

@@ -3281,7 +3514,7 @@ class FeatrixSphereClient:
                                style: str = 'notebook', save_path: Optional[str] = None,
                                show_embedding_evolution: bool = True,
                                show_loss_evolution: bool = True,
-                               fps: int = 2, notebook_mode: bool = True)
+                               fps: int = 2, notebook_mode: bool = True):
         """
         Create an animated training movie showing loss curves and embedding evolution.

@@ -3335,7 +3568,7 @@ class FeatrixSphereClient:
     def plot_embedding_evolution(self, session_id: str, epoch_range: Optional[Tuple[int, int]] = None,
                                  interactive: bool = True, sample_size: int = 1000,
-                                 color_by: Optional[str] = None)
+                                 color_by: Optional[str] = None):
         """
         Show how embedding space evolves during training across epochs.

@@ -3771,7 +4004,18 @@ class FeatrixSphereClient:
         available_predictors = self._get_available_predictors(session_id, debug=debug)

         if not available_predictors:
-
+            # Don't fail here - let the server try to find/auto-discover the predictor
+            # The server's /predict endpoint has smart fallback logic to find checkpoint files
+            # even if the session file wasn't properly updated (e.g., training crashed)
+            if debug:
+                print(f"⚠️ No predictors found via models endpoint, letting server handle discovery")
+            return {
+                'target_column': target_column,
+                'predictor_id': predictor_id,
+                'path': None,
+                'type': None,
+                'server_discovery': True  # Flag that server should auto-discover
+            }

         # If predictor_id is provided, find it directly (since it's now the key)
         if predictor_id:
@@ -4649,6 +4893,7 @@ class FeatrixSphereClient:
             status="running",
             jobs={},
             job_queue_positions={},
+            job_plan=[],
             _client=self
         )

@@ -5500,6 +5745,7 @@ class FeatrixSphereClient:
             status="running",
             jobs={},
             job_queue_positions={},
+            job_plan=[],
             _client=self
         )

@@ -6083,7 +6329,8 @@ class FeatrixSphereClient:
     # =========================================================================

     def predict_table(self, session_id: str, table_data: Dict[str, Any],
-                      target_column: str = None, predictor_id: str = None,
+                      target_column: str = None, predictor_id: str = None,
+                      best_metric_preference: str = None, max_retries: int = None) -> Dict[str, Any]:
         """
         Make batch predictions using JSON Tables format.

@@ -6124,6 +6371,8 @@ class FeatrixSphereClient:
             table_data['target_column'] = target_column
         if predictor_id:
             table_data['predictor_id'] = predictor_id
+        if best_metric_preference:
+            table_data['best_metric_preference'] = best_metric_preference

         try:
             response_data = self._post_json(f"/session/{session_id}/predict_table", table_data, max_retries=max_retries)
@@ -6136,7 +6385,8 @@ class FeatrixSphereClient:
             raise

     def predict_records(self, session_id: str, records: List[Dict[str, Any]],
-                        target_column: str = None, predictor_id: str = None,
+                        target_column: str = None, predictor_id: str = None, best_metric_preference: str = None,
+                        batch_size: int = 2500, use_async: bool = False,
                         show_progress_bar: bool = True, print_target_column_warning: bool = True) -> Dict[str, Any]:
         """
         Make batch predictions on a list of records with automatic client-side batching.
@@ -6183,7 +6433,8 @@ class FeatrixSphereClient:
             table_data = JSONTablesEncoder.from_records(cleaned_records)

             try:
-                result = self.predict_table(session_id, table_data
+                result = self.predict_table(session_id, table_data, target_column=target_column,
+                                            predictor_id=predictor_id, best_metric_preference=best_metric_preference)

                 # Check if server returned an async job
                 if result.get('async') and result.get('job_id'):
@@ -6220,7 +6471,8 @@ class FeatrixSphereClient:
             table_data = JSONTablesEncoder.from_records(cleaned_records)

             try:
-                return self.predict_table(session_id, table_data
+                return self.predict_table(session_id, table_data, target_column=target_column,
+                                          predictor_id=predictor_id, best_metric_preference=best_metric_preference)
             except Exception as e:
                 if "404" in str(e) and "Single predictor not found" in str(e):
                     self._raise_predictor_not_found_error(session_id, "predict_records")
@@ -6250,7 +6502,8 @@ class FeatrixSphereClient:
                 table_data = JSONTablesEncoder.from_records(chunk_records)

                 # Make prediction
-                chunk_result = self.predict_table(session_id, table_data
+                chunk_result = self.predict_table(session_id, table_data, target_column=target_column,
+                                                  predictor_id=predictor_id, best_metric_preference=best_metric_preference)
                 chunk_predictions = chunk_result.get('predictions', [])

                 # Adjust row indices to match original dataset
@@ -6611,7 +6864,8 @@ class FeatrixSphereClient:
             print(f"\n⏰ Timeout after {max_wait_time} seconds")
             return {'status': 'timeout', 'message': f'Job did not complete within {max_wait_time} seconds'}

-    def predict_df(self, session_id: str, df, target_column: str = None, predictor_id: str = None,
+    def predict_df(self, session_id: str, df, target_column: str = None, predictor_id: str = None,
+                   best_metric_preference: str = None, show_progress_bar: bool = True, print_target_column_warning: bool = True) -> Dict[str, Any]:
         """
         Make batch predictions on a pandas DataFrame.

@@ -6636,7 +6890,8 @@ class FeatrixSphereClient:
         records = df.to_dict(orient='records')
         # Clean NaNs for JSON encoding
         cleaned_records = self.replace_nans_with_nulls(records)
-        return self.predict_records(session_id, cleaned_records, target_column=target_column, predictor_id=predictor_id,
+        return self.predict_records(session_id, cleaned_records, target_column=target_column, predictor_id=predictor_id,
+                                    best_metric_preference=best_metric_preference, show_progress_bar=show_progress_bar, print_target_column_warning=print_target_column_warning)

     def _raise_predictor_not_found_error(self, session_id: str, method_name: str):
         """
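A small sketch of the DataFrame batch path with the new parameter. The column names and the `'predictions'` key are assumptions (other batch calls in this diff read a `'predictions'` list from the result); `client` is assumed to exist.

```python
import pandas as pd

scoring_df = pd.DataFrame([
    {"age": 35, "income": 75000, "credit_score": 720},
    {"age": 52, "income": 91000, "credit_score": 680},
])

results = client.predict_df(
    "YOUR-SESSION-ID",
    scoring_df,
    best_metric_preference="pr_auc",  # optional; forwarded through predict_records/predict_table
)
# Response shape depends on the server; 'predictions' is an assumption here.
print(results.get("predictions", [])[:2])
```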
@@ -6767,28 +7022,54 @@ class FeatrixSphereClient:
         training_metrics = models.get('training_metrics', {})
         if debug:
             print(f"🔍 Debug: training_metrics available = {training_metrics.get('available')}")
+        target_column = None
+        metadata = {}
+
         if training_metrics.get('available'):
-
+            try:
+                metrics_data = self.get_training_metrics(session_id)
+                if debug:
+                    print(f"🔍 Debug: metrics_data keys = {list(metrics_data.keys())}")
+                training_metrics_inner = metrics_data.get('training_metrics', {})
+                if debug:
+                    print(f"🔍 Debug: training_metrics_inner keys = {list(training_metrics_inner.keys()) if training_metrics_inner else 'None'}")
+                target_column = training_metrics_inner.get('target_column')
+                if debug:
+                    print(f"🔍 Debug: extracted target_column = {target_column}")
+                if target_column:
+                    # Extract metadata from training metrics
+                    metadata = self._extract_predictor_metadata(metrics_data, debug)
+            except Exception as e:
+                if debug:
+                    print(f"⚠️ Could not get training metrics: {e}")
+
+        # Fallback: try to get target column from job_plan
+        if not target_column:
+            job_plan = session.get('job_plan', [])
+            for job in job_plan:
+                if job.get('job_type') == 'train_single_predictor':
+                    spec = job.get('spec', {})
+                    target_column = spec.get('target_column')
+                    if target_column:
+                        if debug:
+                            print(f"🔍 Debug: extracted target_column from job_plan: {target_column}")
+                        break
+
+        # If predictor is available, add it even without target_column (can be None)
+        if single_predictor.get('available') or single_predictor.get('predictors'):
+            # Generate unique predictor ID
+            predictor_path = single_predictor.get('path', '')
+            if not predictor_path and single_predictor.get('predictors'):
+                # Use first predictor from new format
+                predictor_path = single_predictor.get('predictors', [{}])[0].get('path', '')
+
+            if predictor_path:
                 predictor_id = self._generate_predictor_id(predictor_path, 'single_predictor')

                 predictors[predictor_id] = {
                     'predictor_id': predictor_id,
                     'path': predictor_path,
-                    'target_column': target_column,
+                    'target_column': target_column,  # Can be None
                     'available': True,
                     'type': 'single_predictor',
                     **metadata  # Include epochs, validation_loss, job_status, etc.
@@ -6811,6 +7092,9 @@ class FeatrixSphereClient:
             if debug:
                 print(f"🔍 Debug: single_predictors array = {single_predictors_paths}")
             if single_predictors_paths:
+                target_column = None
+                metadata = {}
+
                 # Try to get target column info from training metrics
                 training_metrics = models.get('training_metrics', {})
                 if training_metrics.get('available'):
@@ -6820,30 +7104,44 @@ class FeatrixSphereClient:
                         if target_column:
                             # Extract metadata from training metrics
                             metadata = self._extract_predictor_metadata(metrics_data, debug)
-
-                # Add each predictor individually with its own predictor_id key
-                for i, path in enumerate(single_predictors_paths):
-                    predictor_id = self._generate_predictor_id(path, f'multiple_predictor_{i}')
-
-                    predictors[predictor_id] = {
-                        'predictor_id': predictor_id,
-                        'path': path,
-                        'target_column': target_column,
-                        'available': True,
-                        'type': 'single_predictor',  # Each is treated as individual predictor
-                        'predictor_index': i,  # Track original index for compatibility
-                        **metadata  # Include epochs, validation_loss, job_status, etc.
-                    }
-                    if debug:
-                        print(f"✅ Added predictor {i} for target_column: {target_column}")
-                        print(f"   Predictor ID: {predictor_id}")
-                        print(f"   Path: {path}")
-
-                if debug:
-                    print(f"   Total predictors added: {len(single_predictors_paths)}")
-                    print(f"   Shared metadata: {metadata}")
                     except Exception as e:
-
+                        if debug:
+                            print(f"⚠️ Could not get training metrics: {e}")
+
+                # Fallback: try to get target column from job_plan
+                if not target_column:
+                    job_plan = session.get('job_plan', [])
+                    for job in job_plan:
+                        if job.get('job_type') == 'train_single_predictor':
+                            spec = job.get('spec', {})
+                            target_column = spec.get('target_column')
+                            if target_column:
+                                if debug:
+                                    print(f"🔍 Debug: extracted target_column from job_plan: {target_column}")
+                                break
+
+                # Add each predictor even if target_column is None
+                for i, path in enumerate(single_predictors_paths):
+                    predictor_id = self._generate_predictor_id(path, f'multiple_predictor_{i}')
+
+                    predictors[predictor_id] = {
+                        'predictor_id': predictor_id,
+                        'path': path,
+                        'target_column': target_column,  # Can be None
+                        'available': True,
+                        'type': 'single_predictor',  # Each is treated as individual predictor
+                        'predictor_index': i,  # Track original index for compatibility
+                        **metadata  # Include epochs, validation_loss, job_status, etc.
+                    }
+                    if debug:
+                        print(f"✅ Added predictor {i} for target_column: {target_column}")
+                        print(f"   Predictor ID: {predictor_id}")
+                        print(f"   Path: {path}")
+
+                if debug:
+                    print(f"   Total predictors added: {len(single_predictors_paths)}")
+                    if metadata:
+                        print(f"   Shared metadata: {metadata}")

                 # Fallback: check old format single_predictor field
                 single_predictor_path = session.get('single_predictor')
@@ -6920,7 +7218,7 @@ class FeatrixSphereClient:
             target_column: Specific target column to validate, or None for auto-detect

         Returns:
-            Validated target column name
+            Validated target column name (or None if can't determine, server will handle)

         Raises:
             ValueError: If target_column is invalid or multiple predictors exist without specification
@@ -6928,7 +7226,8 @@ class FeatrixSphereClient:
         available_predictors = self._get_available_predictors(session_id)

         if not available_predictors:
-
+            # Don't fail - let server handle discovery. Return provided target_column or None.
+            return target_column

         if target_column is None:
             # Auto-detect: only valid if there's exactly one predictor
@@ -6982,6 +7281,10 @@ class FeatrixSphereClient:
                 # Re-raise validation errors
                 raise e

+        # If we couldn't determine target column (server will handle), just return records as-is
+        if validated_target_column is None:
+            return records
+
         if validated_target_column in records[0]:
             if print_warning:
                 print(f"⚠️ Warning: Removing target column '{validated_target_column}' from prediction data")
featrixsphere-0.2.4983.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+featrixsphere/__init__.py,sha256=IRi4Di6zgujpSsWTJi0VL5b8yhjUGd9z_XFXJT2uWi8,1888
+featrixsphere/client.py,sha256=kWGR7cYH0IDWNZDX5w8yPP9-2wA-3KfXxFlYl6w01wE,431295
+featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
+featrixsphere-0.2.4983.dist-info/METADATA,sha256=UxwPQet3orfLfAsNhHh1Bvn_QfV2ZXpuBL9AWYyjvxg,16232
+featrixsphere-0.2.4983.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+featrixsphere-0.2.4983.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
+featrixsphere-0.2.4983.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
+featrixsphere-0.2.4983.dist-info/RECORD,,

featrixsphere-0.2.3737.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-featrixsphere/__init__.py,sha256=rPPiD3URmePTVO31B5XHkaGVl6zd-r6OIbpBxjqi9Yg,1888
-featrixsphere/client.py,sha256=XaNFHfjogpj3exISZG1Q2SIMn-NewVsUELpzN7-5I-A,416085
-featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
-featrixsphere-0.2.3737.dist-info/METADATA,sha256=3gUOu7cpZPdzBkw5Z_8H6StVxotvIBCkOXmOWfW951c,16232
-featrixsphere-0.2.3737.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-featrixsphere-0.2.3737.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
-featrixsphere-0.2.3737.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
-featrixsphere-0.2.3737.dist-info/RECORD,,
{featrixsphere-0.2.3737.dist-info → featrixsphere-0.2.4983.dist-info}/WHEEL
File without changes
{featrixsphere-0.2.3737.dist-info → featrixsphere-0.2.4983.dist-info}/entry_points.txt
File without changes
{featrixsphere-0.2.3737.dist-info → featrixsphere-0.2.4983.dist-info}/top_level.txt
File without changes