featrixsphere 0.2.1141__py3-none-any.whl → 0.2.1235__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
featrixsphere/__init__.py CHANGED
@@ -38,7 +38,7 @@ Example:
     ... labels=['Experiment A', 'Experiment B'])
 """
 
-__version__ = "0.2.1141"
+__version__ = "0.2.1235"
 __author__ = "Featrix"
 __email__ = "support@featrix.com"
 __license__ = "MIT"
featrixsphere/client.py CHANGED
@@ -633,13 +633,97 @@ class FeatrixSphereClient:
             _client=self
         )
 
-    def get_model_card(self, session_id: str, max_retries: int = None) -> Dict[str, Any]:
+    def update_user_metadata(self, session_id: str, metadata: Dict[str, Any], write_mode: str = "merge") -> Dict[str, Any]:
+        """
+        Update user metadata for a session.
+
+        Args:
+            session_id: The session ID to update metadata for
+            metadata: Dictionary of metadata to update (max 32KB total)
+            write_mode: How to update metadata:
+                - "merge" (default): Merge new metadata with existing (existing keys are updated, new keys are added)
+                - "overwrite": Replace all user_metadata with the new dictionary
+
+        Returns:
+            Dictionary containing the updated session information
+
+        Raises:
+            requests.exceptions.HTTPError: If the request fails
+            ValueError: If write_mode is not "merge" or "overwrite"
+
+        Example:
+            >>> # Merge new metadata with existing
+            >>> client.update_user_metadata(
+            ...     session_id="abc123",
+            ...     metadata={"new_key": "value", "existing_key": "updated_value"},
+            ...     write_mode="merge"
+            ... )
+
+            >>> # Replace all metadata
+            >>> client.update_user_metadata(
+            ...     session_id="abc123",
+            ...     metadata={"only_key": "only_value"},
+            ...     write_mode="overwrite"
+            ... )
+        """
+        if write_mode not in ["merge", "overwrite"]:
+            raise ValueError(f"write_mode must be 'merge' or 'overwrite', got '{write_mode}'")
+
+        request_data = {
+            "user_metadata": metadata,
+            "write_mode": write_mode
+        }
+
+        response_data = self._post_json(f"/session/{session_id}/update_user_metadata", request_data)
+        return response_data
+
+    def is_foundation_model_ready(self, session_id: str, max_retries: int = None) -> Tuple[bool, str]:
+        """
+        Check if a foundation model session is ready to use (training completed).
+
+        Args:
+            session_id: The session ID to check
+            max_retries: Maximum number of retries (defaults to client default)
+
+        Returns:
+            Tuple of (is_ready: bool, status_message: str)
+            - is_ready: True if session is done and model card is available
+            - status_message: Human-readable status message
+
+        Example:
+            >>> is_ready, message = client.is_foundation_model_ready("session_123")
+            >>> if not is_ready:
+            ...     print(f"Foundation model not ready: {message}")
+        """
+        try:
+            session_status = self.get_session_status(session_id, max_retries=max_retries)
+
+            if session_status.status in ["done", "DONE"]:
+                # Check if model card exists
+                try:
+                    self.get_model_card(session_id, max_retries=max_retries, check_status_first=False)
+                    return True, "Foundation model is ready"
+                except (requests.exceptions.HTTPError, FileNotFoundError):
+                    return False, "Session is done but model card is not available yet"
+            else:
+                return False, f"Session is still {session_status.status}. Training may still be in progress."
+
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 404:
+                return False, f"Session {session_id} not found"
+            return False, f"Error checking session status: {e}"
+        except Exception as e:
+            return False, f"Error checking foundation model: {e}"
+
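The `Example` in the docstring above shows a single readiness check; in practice the helper lends itself to a polling loop. A minimal sketch, assuming a configured `FeatrixSphereClient` named `client`; the `wait_until_foundation_model_ready` helper and the session ID are illustrative, not part of the package:

```python
import time

def wait_until_foundation_model_ready(client, session_id, poll_seconds=30, timeout=3600):
    """Illustrative helper (not part of featrixsphere): poll until ready or timeout."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        is_ready, message = client.is_foundation_model_ready(session_id)
        if is_ready:
            return True
        print(f"Not ready yet: {message}")
        time.sleep(poll_seconds)
    return False

# wait_until_foundation_model_ready(client, "session_123")
```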
+    def get_model_card(self, session_id: str, max_retries: int = None, check_status_first: bool = True) -> Dict[str, Any]:
         """
         Get the model card JSON for a given session.
 
         Args:
             session_id: The session ID to get the model card for
             max_retries: Maximum number of retries (defaults to client default)
+            check_status_first: If True, check session status before fetching the model card.
+                Provides better error messages if the session is still training.
 
         Returns:
             Dictionary containing the model card JSON data
@@ -647,12 +731,31 @@ class FeatrixSphereClient:
         Raises:
             requests.exceptions.HTTPError: If the request fails
             FileNotFoundError: If the model card doesn't exist (404)
+            ValueError: If the session is not ready and check_status_first is True
 
         Example:
             >>> client = FeatrixSphereClient()
             >>> model_card = client.get_model_card("session_123")
             >>> print(model_card["model_details"]["name"])
         """
+        # Check session status first to provide better error messages
+        if check_status_first:
+            try:
+                session_status = self.get_session_status(session_id, max_retries=max_retries)
+                if session_status.status not in ["done", "DONE"]:
+                    raise ValueError(
+                        f"Session {session_id} is not ready (status: {session_status.status}). "
+                        f"Model card is only available after training completes. "
+                        f"Use wait_for_session_completion() to wait for training to finish."
+                    )
+            except requests.exceptions.HTTPError as e:
+                # If we can't get status, continue and let the model_card request fail
+                # This handles cases where the session doesn't exist
+                if e.response.status_code == 404:
+                    raise FileNotFoundError(f"Session {session_id} not found") from e
+                # For other HTTP errors, continue to try the model_card request
+                pass
+
         response = self._make_request(
             "GET",
             f"/session/{session_id}/model_card",
@@ -660,6 +763,77 @@ class FeatrixSphereClient:
         )
         return response.json()
 
+    def publish_session(self, session_id: str) -> Dict[str, Any]:
+        """
+        Publish a session by moving it to /sphere/published/<sessionId>.
+        Moves both the session file and the output directory.
+
+        Args:
+            session_id: Session ID to publish
+
+        Returns:
+            Response with published_path, output_path, and status
+
+        Example:
+            ```python
+            result = client.publish_session("abc123")
+            print(f"Published to: {result['published_path']}")
+            ```
+        """
+        response_data = self._post_json(f"/compute/session/{session_id}/publish", {})
+        return response_data
+
+    def deprecate_session(self, session_id: str, warning_message: str, expiration_date: str) -> Dict[str, Any]:
+        """
+        Deprecate a published session with a warning message and expiration date.
+        The session remains available until the expiration date.
+
+        Args:
+            session_id: Session ID to deprecate
+            warning_message: Warning message to display about the deprecation
+            expiration_date: ISO format date string for when the session will be removed (e.g., "2025-12-31T23:59:59Z")
+
+        Returns:
+            Response with deprecation status
+
+        Example:
+            ```python
+            from datetime import datetime, timedelta, timezone
+
+            expiration = (datetime.now(timezone.utc) + timedelta(days=90)).isoformat()
+            result = client.deprecate_session(
+                session_id="abc123",
+                warning_message="This session will be removed on 2025-12-31",
+                expiration_date=expiration
+            )
+            ```
+        """
+        data = {
+            "warning_message": warning_message,
+            "expiration_date": expiration_date
+        }
+        response_data = self._post_json(f"/compute/session/{session_id}/deprecate", data)
+        return response_data
+
+    def unpublish_session(self, session_id: str) -> Dict[str, Any]:
+        """
+        Unpublish a session by moving it back out of /sphere/published/<sessionId>.
+
+        Args:
+            session_id: Session ID to unpublish
+
+        Returns:
+            Response with unpublish status
+
+        Example:
+            ```python
+            result = client.unpublish_session("abc123")
+            print(f"Status: {result['status']}")
+            ```
+        """
+        response_data = self._post_json(f"/compute/session/{session_id}/unpublish", {})
+        return response_data
+
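Taken together, the three methods above form a publish lifecycle. A minimal end-to-end sketch, assuming a configured `client`; the session ID and warning text are hypothetical:

```python
from datetime import datetime, timedelta, timezone

session_id = "abc123"  # hypothetical

# Move the session to /sphere/published/<sessionId>.
result = client.publish_session(session_id)
print(f"Published to: {result['published_path']}")

# Later: mark it deprecated but keep it available for 90 more days.
expiration = (datetime.now(timezone.utc) + timedelta(days=90)).isoformat()
client.deprecate_session(
    session_id,
    warning_message="This session is deprecated and will be removed.",
    expiration_date=expiration,
)

# Or withdraw it from the published area entirely.
client.unpublish_session(session_id)
```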
     def get_sessions_for_org(self, name_prefix: str, max_retries: int = None) -> Dict[str, Any]:
         """
         Get all sessions matching a name prefix across all compute nodes.
@@ -703,8 +877,8 @@ class FeatrixSphereClient:
             >>> print(f"Model card recreated: {model_card['model_info']['name']}")
         """
         response = self._make_request(
-            "POST",
-            f"/session/{session_id}/model_card",
+            "GET",
+            f"/compute/session/{session_id}/model_card",
             max_retries=max_retries
         )
         return response.json()
@@ -1553,12 +1727,54 @@ class FeatrixSphereClient:
     # File Upload
     # =========================================================================
 
+    def upload_file(self, file_path: str) -> str:
+        """
+        Upload a file to the server without creating a session.
+        Returns the filename that can be used in training requests.
+
+        Args:
+            file_path: Path to the file to upload
+
+        Returns:
+            Filename (relative path) that can be used in training requests
+        """
+        from pathlib import Path as PathLib
+        file_path_obj = PathLib(file_path)
+        if not file_path_obj.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        with open(file_path_obj, 'rb') as f:
+            files = {'file': (file_path_obj.name, f, 'text/csv' if file_path_obj.suffix == '.csv' else 'application/gzip')}
+            response = self._make_request("POST", "/compute/upload_file", files=files)
+
+        response_data = response.json()
+
+        # Handle S3 upload response (returns s3_url and filename)
+        if 's3_url' in response_data:
+            # S3 upload - extract filename from key or use returned filename
+            filename = response_data.get('filename')
+            if not filename:
+                # Extract from S3 key if filename not provided
+                s3_key = response_data.get('key', '')
+                if s3_key:
+                    filename = PathLib(s3_key).name
+            if not filename:
+                raise ValueError("Server did not return filename in S3 upload response")
+            return filename
+
+        # Handle local file upload response (returns filename)
+        filename = response_data.get('filename')
+        if not filename:
+            raise ValueError("Server did not return filename in upload response")
+
+        return filename
+
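`upload_file()` ships without a docstring example, so here is a minimal usage sketch, assuming a configured `client`; the path, foundation model ID, and column names are hypothetical:

```python
# Upload once; the server returns a relative filename to reference later.
filename = client.upload_file("/data/training_rows.csv")

# Reuse the returned filename in a training request, e.g. when training a
# predictor on a foundation model (see train_on_foundational_model below).
result = client.train_on_foundational_model(
    foundation_model_id="fm_abc123",
    target_column="label",
    target_column_type="set",
    input_filename=filename,
)
```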
     def upload_file_and_create_session(self, file_path: Path, session_name_prefix: str = None, name: str = None, webhooks: Dict[str, str] = None) -> SessionInfo:
         """
-        Upload a CSV file and create a new session.
+        Upload a CSV, Parquet, JSON, or JSONL file and create a new session.
 
         Args:
-            file_path: Path to the CSV file to upload
+            file_path: Path to the CSV, Parquet, JSON, or JSONL file to upload
             session_name_prefix: Optional prefix for the session ID. Session will be named <prefix>-<full-uuid>
             name: Optional name for the embedding space/model (for identification and metadata)
             webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
@@ -1622,7 +1838,7 @@
                                      webhooks: Dict[str, str] = None,
                                      epochs: int = None) -> SessionInfo:
         """
-        Upload a pandas DataFrame or CSV file and create a new session.
+        Upload a pandas DataFrame, CSV file, Parquet file, JSON file, or JSONL file and create a new session.
 
         Special Column: __featrix_train_predictor
         ------------------------------------------
@@ -1630,7 +1846,7 @@
         which rows are used for single predictor training.
 
         How it works:
-        - Add a boolean column "__featrix_train_predictor" to your DataFrame/CSV before upload
+        - Add a boolean column "__featrix_train_predictor" to your DataFrame/CSV/Parquet/JSON/JSONL before upload
         - Set it to True for rows you want to use for predictor training
        - Set it to False (or any other value) for rows to exclude from predictor training
        - Embedding space training uses ALL rows (ignores this column)
@@ -1664,7 +1880,7 @@
         Args:
             df: pandas DataFrame to upload (optional if file_path is provided)
             filename: Name to give the uploaded file (default: "data.csv")
-            file_path: Path to CSV file to upload (optional if df is provided)
+            file_path: Path to CSV, Parquet, JSON, or JSONL file to upload (optional if df is provided)
             column_overrides: Dict mapping column names to types ("scalar", "set", "free_string", "free_string_list")
             column_types: Alias for column_overrides (for backward compatibility)
             string_list_delimiter: Delimiter for free_string_list columns (default: "|")
@@ -1705,21 +1921,90 @@
             if not os.path.exists(file_path):
                 raise FileNotFoundError(f"File not found: {file_path}")
 
-            # Check if it's a CSV file
-            if not file_path.lower().endswith(('.csv', '.csv.gz')):
-                raise ValueError("File must be a CSV file (with .csv or .csv.gz extension)")
+            # Check if it's a supported file type
+            file_ext = file_path.lower()
+            if not file_ext.endswith(('.csv', '.csv.gz', '.parquet', '.json', '.jsonl')):
+                raise ValueError("File must be a CSV, Parquet, JSON, or JSONL file (with .csv, .csv.gz, .parquet, .json, or .jsonl extension)")
 
             print(f"Uploading file: {file_path}")
 
             # Read the file content
             if file_path.endswith('.gz'):
-                # Already gzipped
+                # Already gzipped CSV
                 with gzip.open(file_path, 'rb') as f:
                     file_content = f.read()
                 upload_filename = os.path.basename(file_path)
                 content_type = 'application/gzip'
+            elif file_path.lower().endswith(('.json', '.jsonl')):
+                # JSON/JSONL file - read as DataFrame, convert to CSV, then compress
+                print(f"Reading {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} file...")
+                try:
+                    from featrix.neural.input_data_file import featrix_wrap_read_json_file
+                    json_df = featrix_wrap_read_json_file(file_path)
+                    if json_df is None:
+                        raise ValueError(f"Failed to parse {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} file")
+                except ImportError:
+                    # Fallback to pandas if featrix wrapper not available
+                    if file_path.lower().endswith('.jsonl'):
+                        # JSONL: one JSON object per line
+                        import json
+                        records = []
+                        with open(file_path, 'r', encoding='utf-8') as f:
+                            for line in f:
+                                if line.strip():
+                                    records.append(json.loads(line))
+                        json_df = pd.DataFrame(records)
+                    else:
+                        # Regular JSON
+                        json_df = pd.read_json(file_path)
+
+                # Clean NaN values before CSV conversion
+                cleaned_df = json_df.where(pd.notna(json_df), None)
+
+                # Convert to CSV and compress
+                csv_buffer = io.StringIO()
+                cleaned_df.to_csv(csv_buffer, index=False)
+                csv_data = csv_buffer.getvalue().encode('utf-8')
+
+                print(f"Compressing {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} (converted to CSV)...")
+                compressed_buffer = io.BytesIO()
+                with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+                    gz.write(csv_data)
+                file_content = compressed_buffer.getvalue()
+                upload_filename = os.path.basename(file_path).replace('.jsonl', '.csv.gz').replace('.json', '.csv.gz')
+                content_type = 'application/gzip'
+
+                original_size = len(csv_data)
+                compressed_size = len(file_content)
+                compression_ratio = (1 - compressed_size / original_size) * 100
+                print(f"Converted {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
+            elif file_path.lower().endswith('.parquet'):
+                # Parquet file - read as DataFrame, convert to CSV, then compress
+                print("Reading Parquet file...")
+                parquet_df = pd.read_parquet(file_path)
+
+                # Clean NaN values before CSV conversion
+                cleaned_df = parquet_df.where(pd.notna(parquet_df), None)
+
+                # Convert to CSV and compress
+                csv_buffer = io.StringIO()
+                cleaned_df.to_csv(csv_buffer, index=False)
+                csv_data = csv_buffer.getvalue().encode('utf-8')
+
+                print("Compressing Parquet (converted to CSV)...")
+                compressed_buffer = io.BytesIO()
+                with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+                    gz.write(csv_data)
+                file_content = compressed_buffer.getvalue()
+                upload_filename = os.path.basename(file_path).replace('.parquet', '.csv.gz')
+                content_type = 'application/gzip'
+
+                original_size = len(csv_data)
+                compressed_size = len(file_content)
+                compression_ratio = (1 - compressed_size / original_size) * 100
+                print(f"Converted Parquet to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
             else:
-                # Read CSV and compress it
+                # Regular CSV file - read and compress it
                 with open(file_path, 'rb') as f:
                     csv_content = f.read()
 
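With this change the single-file upload path accepts Parquet/JSON/JSONL and converts them to gzipped CSV on the client before upload. A short usage sketch, assuming a configured `client`; the path and name are hypothetical:

```python
from pathlib import Path

# .parquet, .json, and .jsonl now work alongside .csv / .csv.gz; non-CSV
# inputs are read into a DataFrame, converted to CSV, and gzip-compressed
# locally before being sent to the server.
session = client.upload_file_and_create_session(
    file_path=Path("/data/events.parquet"),
    name="events-embedding-space",
)
```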
@@ -3868,9 +4153,10 @@
 
     def train_on_foundational_model(self, foundation_model_id: str, target_column: str, target_column_type: str,
                                     input_filename: str = None,
+                                    df = None,
                                     name: str = None,
                                     session_name_prefix: str = None,
-                                    epochs: int = 0, batch_size: int = 0, learning_rate: float = 0.001,
+                                    epochs: int = 0,
                                     rare_label_value: str = None,
                                     class_imbalance: dict = None,
                                     optimize_for: str = "balanced",
@@ -3889,11 +4175,11 @@
             target_column: Name of the target column to predict
             target_column_type: Type of target column ("set" or "scalar")
             input_filename: Optional input data file (uses foundation model's data if not provided)
+            df: Optional pandas DataFrame with training data (uses foundation model's data if not provided).
+                Use input_filename OR df (not both) to train the predictor on different data than the foundation model.
             name: Optional name for the new session
             session_name_prefix: Optional prefix for session ID. Session will be named <prefix>-<uuid>
             epochs: Number of training epochs (default: 0; automatic)
-            batch_size: Training batch size (default: 0; automatic)
-            learning_rate: Learning rate for training (default: 0.001)
             rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
             class_imbalance: Expected class ratios/counts from real world for sampled data (default: None)
             optimize_for: Optimization target - "balanced" (F1 score), "precision", or "recall" (default: "balanced")
@@ -3908,44 +4194,122 @@
         print(f"Training predictor on foundation model {foundation_model_id}...")
         print(f" Target: {target_column} ({target_column_type})")
 
-        data = {
-            "foundation_model_id": foundation_model_id,
-            "target_column": target_column,
-            "target_column_type": target_column_type,
-            "epochs": epochs,
-            "batch_size": batch_size,
-            "learning_rate": learning_rate,
-            "optimize_for": optimize_for,
-        }
-
-        if input_filename:
-            # Clean up input_filename: extract just the filename if it's an absolute path
-            # The file should be uploaded first or already exist on the server
-            from pathlib import Path
-            input_path = Path(input_filename)
-            if input_path.is_absolute():
-                # Extract just the filename - client should upload file first
-                cleaned_filename = input_path.name
-                print(f"⚠️ Note: Extracted filename '{cleaned_filename}' from absolute path '{input_filename}'")
-                print(f" Make sure the file has been uploaded to the server first")
-                data["input_filename"] = cleaned_filename
-            else:
-                data["input_filename"] = input_filename
-        if name:
-            data["name"] = name
-        if session_name_prefix:
-            data["session_name_prefix"] = session_name_prefix
-        if rare_label_value:
-            data["rare_label_value"] = rare_label_value
-        if class_imbalance:
-            data["class_imbalance"] = class_imbalance
-        if webhooks:
-            data["webhooks"] = webhooks
+        # Get the compute cluster from the foundation model session
+        # This ensures we upload files to the same node where the foundation model lives
+        foundation_session = self.get_session_status(foundation_model_id)
+        foundation_compute_cluster = self.get_last_server_metadata()
+        foundation_compute_cluster = foundation_compute_cluster.get('compute_cluster') if foundation_compute_cluster else None
+
+        # Temporarily set compute cluster for file uploads if we found one
+        original_compute_cluster = self.compute_cluster
+        original_headers = self.session.headers.copy()
+        if foundation_compute_cluster:
+            self.set_compute_cluster(foundation_compute_cluster)
+            if verbose:
+                print(f" Using compute cluster: {foundation_compute_cluster}")
 
-        response_data = self._post_json("/compute/train_on_foundational_model", data)
+        try:
+            # Validate that only one data source is provided
+            if input_filename and df is not None:
+                raise ValueError("Provide either input_filename or df, not both")
+
+            # If DataFrame provided, save to temp file and upload it
+            temp_file = None
+            if df is not None:
+                import pandas as pd
+                import tempfile
+                import os
+
+                if not isinstance(df, pd.DataFrame):
+                    raise ValueError("df must be a pandas DataFrame")
+
+                if verbose:
+                    print(f"📊 Using provided DataFrame ({len(df)} rows, {len(df.columns)} columns)")
+
+                # Create temporary CSV file
+                temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False)
+                temp_file_path = temp_file.name
+                temp_file.close()
+
+                # Save DataFrame to temp file
+                df.to_csv(temp_file_path, index=False)
+
+                if verbose:
+                    print(f"📁 Saved to temporary file: {os.path.basename(temp_file_path)}")
+                    print(f"📤 Uploading file to server...")
+
+                # Upload the file
+                uploaded_filename = self.upload_file(temp_file_path)
+                input_filename = uploaded_filename
+
+                if verbose:
+                    print(f"✅ File uploaded: {input_filename}")
+
+                # Clean up temp file
+                try:
+                    os.unlink(temp_file_path)
+                except Exception:
+                    pass  # Ignore cleanup errors
 
-        new_session_id = response_data.get('session_id')
-        print(f"✅ Predictor training session created: {new_session_id}")
+            data = {
+                "foundation_model_id": foundation_model_id,
+                "target_column": target_column,
+                "target_column_type": target_column_type,
+                "epochs": epochs,
+                "optimize_for": optimize_for,
+            }
+
+            if input_filename:
+                # If absolute path provided, upload the file first
+                from pathlib import Path
+                input_path = Path(input_filename)
+                if input_path.is_absolute():
+                    # Upload the file first, then use the uploaded filename
+                    if not input_path.exists():
+                        raise FileNotFoundError(f"Input file not found: {input_filename}")
+
+                    if verbose:
+                        print(f"📤 Uploading file from absolute path: {input_filename}")
+
+                    # Upload the file
+                    uploaded_filename = self.upload_file(str(input_path))
+
+                    if verbose:
+                        print(f"✅ File uploaded as: {uploaded_filename}")
+
+                    data["input_filename"] = uploaded_filename
+                else:
+                    # Relative filename - assume it's already on the server
+                    data["input_filename"] = input_filename
+            if name:
+                data["name"] = name
+            if session_name_prefix:
+                data["session_name_prefix"] = session_name_prefix
+            if rare_label_value:
+                data["rare_label_value"] = rare_label_value
+            if class_imbalance:
+                data["class_imbalance"] = class_imbalance
+            if webhooks:
+                data["webhooks"] = webhooks
+
+            response_data = self._post_json("/compute/train_on_foundational_model", data)
+
+            new_session_id = response_data.get('session_id')
+            print(f"✅ Predictor training session created: {new_session_id}")
+
+            # Restore original compute cluster setting
+            if original_compute_cluster != self.compute_cluster:
+                if original_compute_cluster:
+                    self.set_compute_cluster(original_compute_cluster)
+                else:
+                    self.session.headers = original_headers
+        finally:
+            # Ensure we restore headers even if there's an error
+            if original_compute_cluster != self.compute_cluster:
+                if original_compute_cluster:
+                    self.set_compute_cluster(original_compute_cluster)
+                else:
+                    self.session.headers = original_headers
 
         if verbose:
             print(f"⏳ Waiting for training to complete...")
@@ -4751,9 +5115,8 @@
 
     def train_predictor_more(self, session_id: str, epochs: int = 50,
                              predictor_id: str = None, target_column: str = None,
-                             batch_size: int = 0, learning_rate: float = None,
                              poll_interval: int = 30, max_poll_time: int = 3600,
-                             verbose: bool = True) -> Dict[str, Any]:
+                             verbose: bool = True, webhooks: Dict[str, str] = None) -> Dict[str, Any]:
         """
         Continue training an existing single predictor for more epochs.
         Loads the existing predictor and resumes training from where it left off.
@@ -4763,11 +5126,10 @@
             epochs: Additional epochs to train (required)
             predictor_id: Predictor ID to continue training (optional, highest priority)
             target_column: Target column name to find predictor (optional, alternative to predictor_id)
-            batch_size: Batch size for continuation (0 = use existing from predictor)
-            learning_rate: Learning rate for continuation (None = use existing from predictor)
             poll_interval: Seconds between status checks (default: 30)
             max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
             verbose: Whether to print status updates (default: True)
+            webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
 
         Returns:
             Response with continuation start confirmation or completion status
@@ -4790,15 +5152,14 @@
 
         data = {
             "epochs": epochs,
-            "batch_size": batch_size,
         }
 
         if predictor_id:
             data["predictor_id"] = predictor_id
         if target_column:
             data["target_column"] = target_column
-        if learning_rate is not None:
-            data["learning_rate"] = learning_rate
+        if webhooks:
+            data["webhooks"] = webhooks
 
         if verbose:
             print(f"🔄 Continuing training for predictor on session {session_id}")
@@ -4888,6 +5249,139 @@
             print(f"❌ Error starting predictor continuation: {e}")
             raise
 
+    def foundation_model_train_more(self, session_id: str, es_id: str = None, data_passes: int = None,
+                                    epochs: int = None, poll_interval: int = 30, max_poll_time: int = 3600,
+                                    verbose: bool = True, webhooks: Dict[str, str] = None) -> Dict[str, Any]:
+        """
+        Continue training an existing foundation model (embedding space) for more epochs.
+        Loads the existing embedding space and resumes training from where it left off.
+
+        Args:
+            session_id: Session ID containing the trained foundation model
+            es_id: Embedding space ID (optional, uses the session's ES if not provided)
+            data_passes: Additional epochs to train (preferred, default: 50)
+            epochs: Deprecated alias for data_passes (kept for backward compatibility)
+            poll_interval: Seconds between status checks (default: 30)
+            max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
+            verbose: Whether to print status updates (default: True)
+            webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
+
+        Returns:
+            Response with continuation start confirmation or completion status
+
+        Example:
+            ```python
+            # Continue training for 50 more epochs
+            result = client.foundation_model_train_more(
+                session_id="abc123",
+                data_passes=50
+            )
+            ```
+        """
+        # Support both data_passes and epochs for compatibility
+        if data_passes is None and epochs is None:
+            data_passes = 50  # Default
+        elif data_passes is None:
+            data_passes = epochs  # Use epochs if data_passes not provided
+        # If both are provided, data_passes takes precedence
+
+        if data_passes <= 0:
+            raise ValueError("data_passes (or epochs) must be > 0 (specify additional epochs to train)")
+
+        data = {
+            "data_passes": data_passes,
+        }
+
+        if es_id:
+            data["es_id"] = es_id
+        if webhooks:
+            data["webhooks"] = webhooks
+
+        if verbose:
+            print(f"🔄 Continuing training for foundation model on session {session_id}")
+            print(f" Additional epochs: {data_passes}")
+            if es_id:
+                print(f" ES ID: {es_id}")
+
+        try:
+            response_data = self._post_json(f"/compute/session/{session_id}/train_foundation_model_more", data)
+
+            if verbose:
+                print(f"✅ Foundation model continuation started: {response_data.get('message')}")
+
+            # Poll for completion if requested
+            if poll_interval > 0 and max_poll_time > 0:
+                import time
+                start_time = time.time()
+                last_status = ""
+
+                while time.time() - start_time < max_poll_time:
+                    try:
+                        session_info = self.get_session_status(session_id)
+                        jobs = session_info.jobs if hasattr(session_info, 'jobs') else {}
+
+                        # Find continuation jobs
+                        es_jobs = {j_id: j for j_id, j in jobs.items()
+                                   if j.get('type') == 'train_es'}
+
+                        if not es_jobs:
+                            if verbose:
+                                print("✅ No continuation jobs found - training may have completed")
+                            break
+
+                        # Check job statuses
+                        running_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'running']
+                        completed_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'done']
+                        failed_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'failed']
+
+                        current_status = f"Running: {len(running_jobs)}, Done: {len(completed_jobs)}, Failed: {len(failed_jobs)}"
+                        if current_status != last_status and verbose:
+                            print(f"📊 Status: {current_status}")
+                        last_status = current_status
+
+                        if not running_jobs and (completed_jobs or failed_jobs):
+                            if completed_jobs:
+                                if verbose:
+                                    print(f"✅ Foundation model continuation completed successfully!")
+                                return {
+                                    "message": "Foundation model continuation completed successfully",
+                                    "session_id": session_id,
+                                    "status": "completed",
+                                    "additional_epochs": data_passes
+                                }
+                            else:
+                                if verbose:
+                                    print(f"❌ Foundation model continuation failed")
+                                return {
+                                    "message": "Foundation model continuation failed",
+                                    "session_id": session_id,
+                                    "status": "failed",
+                                    "failed_jobs": failed_jobs
+                                }
+
+                        time.sleep(poll_interval)
+                    except Exception as poll_error:
+                        if verbose:
+                            print(f"⚠️ Error during polling: {poll_error}")
+                        time.sleep(poll_interval)
+
+                # Timeout
+                if verbose:
+                    print(f"⏱️ Polling timeout reached ({max_poll_time}s)")
+                return {
+                    "message": "Polling timeout",
+                    "session_id": session_id,
+                    "status": "timeout",
+                    "additional_epochs": data_passes
+                }
+
+            return response_data
+
+        except Exception as e:
+            if verbose:
+                print(f"❌ Error starting foundation model continuation: {e}")
+            raise
+
     def _train_single_predictor_with_file(
         self,
         session_id: str,
@@ -5965,7 +6459,24 @@
         if not file_path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
 
-        df = pd.read_csv(file_path)
+        # Support CSV, Parquet, JSON, and JSONL files
+        file_path_str = str(file_path).lower()
+        if file_path_str.endswith('.parquet'):
+            df = pd.read_parquet(file_path)
+        elif file_path_str.endswith('.jsonl'):
+            # JSONL: one JSON object per line
+            import json
+            records = []
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    if line.strip():
+                        records.append(json.loads(line))
+            df = pd.DataFrame(records)
+        elif file_path_str.endswith('.json'):
+            # Regular JSON
+            df = pd.read_json(file_path)
+        else:
+            df = pd.read_csv(file_path)
 
         # Convert to JSON Tables format and clean NaNs
         table_data = JSONTablesEncoder.from_dataframe(df)
@@ -6119,11 +6630,11 @@
     def run_csv_predictions(self, session_id: str, csv_file: str, target_column: str = None,
                             sample_size: int = None, remove_target: bool = True) -> Dict[str, Any]:
         """
-        Run predictions on a CSV file with automatic accuracy calculation.
+        Run predictions on a CSV, Parquet, JSON, or JSONL file with automatic accuracy calculation.
 
         Args:
             session_id: ID of session with trained predictor
-            csv_file: Path to CSV file
+            csv_file: Path to CSV, Parquet, JSON, or JSONL file
             target_column: Name of target column (for accuracy calculation)
             sample_size: Number of records to test (None = all records)
             remove_target: Whether to remove target column from prediction input
@@ -6133,8 +6644,24 @@
         """
         import pandas as pd
 
-        # Load CSV
-        df = pd.read_csv(csv_file)
+        # Load CSV, Parquet, JSON, or JSONL
+        csv_file_lower = csv_file.lower()
+        if csv_file_lower.endswith('.parquet'):
+            df = pd.read_parquet(csv_file)
+        elif csv_file_lower.endswith('.jsonl'):
+            # JSONL: one JSON object per line
+            import json
+            records = []
+            with open(csv_file, 'r', encoding='utf-8') as f:
+                for line in f:
+                    if line.strip():
+                        records.append(json.loads(line))
+            df = pd.DataFrame(records)
+        elif csv_file_lower.endswith('.json'):
+            # Regular JSON
+            df = pd.read_json(csv_file)
+        else:
+            df = pd.read_csv(csv_file)
 
         # Handle target column
         actual_values = None
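The same multi-format loading now applies to batch predictions. A minimal sketch, assuming a configured `client` with a trained predictor; the session ID, path, and column name are hypothetical:

```python
results = client.run_csv_predictions(
    session_id="abc123",
    csv_file="/data/holdout.parquet",  # CSV, Parquet, JSON, or JSONL
    target_column="label",             # enables the automatic accuracy calculation
    sample_size=500,
)
```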
featrixsphere-0.2.1235.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: featrixsphere
-Version: 0.2.1141
+Version: 0.2.1235
 Summary: Transform any CSV into a production-ready ML model in minutes, not months.
 Home-page: https://github.com/Featrix/sphere
 Author: Featrix
featrixsphere-0.2.1235.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+featrixsphere/__init__.py,sha256=Md3n-bIkA4f-Jdk1zoINUXqmnhGnM0osazYM1lk_1hY,1888
+featrixsphere/cli.py,sha256=AW9O3vCvCNJ2UxVGN66eRmeN7XLSiHJlvK6JLZ9UJXc,13358
+featrixsphere/client.py,sha256=8OKx-pUcZjStdR7Fy4yGMMcWTuqaF1DeSPnxXPcoKtQ,384045
+featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
+featrixsphere-0.2.1235.dist-info/METADATA,sha256=xKOgY6aH86jvurdKvPbV-_n1YnZi5ZxGtmxH9aQuGTs,16232
+featrixsphere-0.2.1235.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+featrixsphere-0.2.1235.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
+featrixsphere-0.2.1235.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
+featrixsphere-0.2.1235.dist-info/RECORD,,
featrixsphere-0.2.1141.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
-featrixsphere/__init__.py,sha256=FMxe64cn4iu9Ce5UDkOAtWZQMeWSijwX-tsiTDvblkM,1888
-featrixsphere/cli.py,sha256=AW9O3vCvCNJ2UxVGN66eRmeN7XLSiHJlvK6JLZ9UJXc,13358
-featrixsphere/client.py,sha256=TsiV-nr0VbBS1jJfidk5zrhOx6StolKsSn_txH0wmmg,358958
-featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
-featrixsphere-0.2.1141.dist-info/METADATA,sha256=27KEfgeXQqUNAlO3HIFhYkJU43YN3RdCjTJ_-viNJow,16232
-featrixsphere-0.2.1141.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-featrixsphere-0.2.1141.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
-featrixsphere-0.2.1141.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
-featrixsphere-0.2.1141.dist-info/RECORD,,