featrixsphere 0.2.1002__py3-none-any.whl → 0.2.1314__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registry.
featrixsphere/__init__.py CHANGED
@@ -38,7 +38,7 @@ Example:
     ... labels=['Experiment A', 'Experiment B'])
 """
 
-__version__ = "0.2.1002"
+__version__ = "0.2.1314"
 __author__ = "Featrix"
 __email__ = "support@featrix.com"
 __license__ = "MIT"
featrixsphere/client.py CHANGED
@@ -633,13 +633,97 @@ class FeatrixSphereClient:
             _client=self
         )
 
-    def get_model_card(self, session_id: str, max_retries: int = None) -> Dict[str, Any]:
+    def update_user_metadata(self, session_id: str, metadata: Dict[str, Any], write_mode: str = "merge") -> Dict[str, Any]:
+        """
+        Update user metadata for a session.
+
+        Args:
+            session_id: The session ID to update metadata for
+            metadata: Dictionary of metadata to update (max 32KB total)
+            write_mode: How to update metadata:
+                - "merge" (default): Merge new metadata with existing (existing keys are updated, new keys are added)
+                - "overwrite": Replace all user_metadata with the new dictionary
+
+        Returns:
+            Dictionary containing the updated session information
+
+        Raises:
+            requests.exceptions.HTTPError: If the request fails
+            ValueError: If write_mode is not "merge" or "overwrite"
+
+        Example:
+            >>> # Merge new metadata with existing
+            >>> client.update_user_metadata(
+            ...     session_id="abc123",
+            ...     metadata={"new_key": "value", "existing_key": "updated_value"},
+            ...     write_mode="merge"
+            ... )
+
+            >>> # Replace all metadata
+            >>> client.update_user_metadata(
+            ...     session_id="abc123",
+            ...     metadata={"only_key": "only_value"},
+            ...     write_mode="overwrite"
+            ... )
+        """
+        if write_mode not in ["merge", "overwrite"]:
+            raise ValueError(f"write_mode must be 'merge' or 'overwrite', got '{write_mode}'")
+
+        request_data = {
+            "user_metadata": metadata,
+            "write_mode": write_mode
+        }
+
+        response_data = self._post_json(f"/session/{session_id}/update_user_metadata", request_data)
+        return response_data
+
+    def is_foundation_model_ready(self, session_id: str, max_retries: int = None) -> Tuple[bool, str]:
+        """
+        Check if a foundation model session is ready to use (training completed).
+
+        Args:
+            session_id: The session ID to check
+            max_retries: Maximum number of retries (defaults to client default)
+
+        Returns:
+            Tuple of (is_ready: bool, status_message: str)
+            - is_ready: True if session is done and model card is available
+            - status_message: Human-readable status message
+
+        Example:
+            >>> is_ready, message = client.is_foundation_model_ready("session_123")
+            >>> if not is_ready:
+            ...     print(f"Foundation model not ready: {message}")
+        """
+        try:
+            session_status = self.get_session_status(session_id, max_retries=max_retries)
+
+            if session_status.status in ["done", "DONE"]:
+                # Check if model card exists
+                try:
+                    self.get_model_card(session_id, max_retries=max_retries, check_status_first=False)
+                    return True, "Foundation model is ready"
+                except (requests.exceptions.HTTPError, FileNotFoundError):
+                    return False, "Session is done but model card is not available yet"
+            else:
+                return False, f"Session is still {session_status.status}. Training may still be in progress."
+
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 404:
+                return False, f"Session {session_id} not found"
+            return False, f"Error checking session status: {e}"
+        except Exception as e:
+            return False, f"Error checking foundation model: {e}"
+
+    def get_model_card(self, session_id: str, max_retries: int = None, check_status_first: bool = True) -> Dict[str, Any]:
         """
         Get the model card JSON for a given session.
 
         Args:
             session_id: The session ID to get the model card for
             max_retries: Maximum number of retries (defaults to client default)
+            check_status_first: If True, check session status before fetching model card.
+                Provides better error messages if session is still training.
 
         Returns:
             Dictionary containing the model card JSON data
@@ -647,12 +731,31 @@ class FeatrixSphereClient:
         Raises:
             requests.exceptions.HTTPError: If the request fails
             FileNotFoundError: If the model card doesn't exist (404)
+            ValueError: If session is not ready and check_status_first is True
 
         Example:
            >>> client = FeatrixSphereClient()
            >>> model_card = client.get_model_card("session_123")
            >>> print(model_card["model_details"]["name"])
        """
+        # Check session status first to provide better error messages
+        if check_status_first:
+            try:
+                session_status = self.get_session_status(session_id, max_retries=max_retries)
+                if session_status.status not in ["done", "DONE"]:
+                    raise ValueError(
+                        f"Session {session_id} is not ready (status: {session_status.status}). "
+                        f"Model card is only available after training completes. "
+                        f"Use wait_for_session_completion() to wait for training to finish."
+                    )
+            except requests.exceptions.HTTPError as e:
+                # If we can't get status, continue and let the model_card request fail
+                # This handles cases where the session doesn't exist
+                if e.response.status_code == 404:
+                    raise FileNotFoundError(f"Session {session_id} not found") from e
+                # For other HTTP errors, continue to try model_card request
+                pass
+
         response = self._make_request(
             "GET",
             f"/session/{session_id}/model_card",
@@ -660,6 +763,77 @@ class FeatrixSphereClient:
         )
         return response.json()
 
+    def publish_session(self, session_id: str) -> Dict[str, Any]:
+        """
+        Publish a session by moving it to /sphere/published/<sessionId>.
+        Moves both the session file and output directory.
+
+        Args:
+            session_id: Session ID to publish
+
+        Returns:
+            Response with published_path, output_path, and status
+
+        Example:
+            ```python
+            result = client.publish_session("abc123")
+            print(f"Published to: {result['published_path']}")
+            ```
+        """
+        response_data = self._post_json(f"/compute/session/{session_id}/publish", {})
+        return response_data
+
+    def deprecate_session(self, session_id: str, warning_message: str, expiration_date: str) -> Dict[str, Any]:
+        """
+        Deprecate a published session with a warning message and expiration date.
+        The session remains available until the expiration date.
+
+        Args:
+            session_id: Session ID to deprecate
+            warning_message: Warning message to display about deprecation
+            expiration_date: ISO format date string when session will be removed (e.g., "2025-12-31T23:59:59Z")
+
+        Returns:
+            Response with deprecation status
+
+        Example:
+            ```python
+            from datetime import datetime, timedelta
+
+            expiration = (datetime.now() + timedelta(days=90)).isoformat() + "Z"
+            result = client.deprecate_session(
+                session_id="abc123",
+                warning_message="This session will be removed on 2025-12-31",
+                expiration_date=expiration
+            )
+            ```
+        """
+        data = {
+            "warning_message": warning_message,
+            "expiration_date": expiration_date
+        }
+        response_data = self._post_json(f"/compute/session/{session_id}/deprecate", data)
+        return response_data
+
+    def unpublish_session(self, session_id: str) -> Dict[str, Any]:
+        """
+        Unpublish a session by moving it back from /sphere/published/<sessionId>.
+
+        Args:
+            session_id: Session ID to unpublish
+
+        Returns:
+            Response with unpublish status
+
+        Example:
+            ```python
+            result = client.unpublish_session("abc123")
+            print(f"Status: {result['status']}")
+            ```
+        """
+        response_data = self._post_json(f"/compute/session/{session_id}/unpublish", {})
+        return response_data
+
     def get_sessions_for_org(self, name_prefix: str, max_retries: int = None) -> Dict[str, Any]:
         """
         Get all sessions matching a name prefix across all compute nodes.
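Taken together, the new readiness check, the status-gated model card fetch, and the publish endpoints support a simple check-then-publish flow. A minimal sketch of that flow (the session ID and the polling interval are illustrative, not part of the package):

```python
import time

session_id = "abc123"  # hypothetical session ID

# Poll until the foundation model reports ready (illustrative loop).
while True:
    is_ready, message = client.is_foundation_model_ready(session_id)
    if is_ready:
        break
    print(f"Waiting: {message}")
    time.sleep(30)

# Training is done, so the model card fetch and publish should succeed.
model_card = client.get_model_card(session_id)
result = client.publish_session(session_id)
print(f"Published to: {result['published_path']}")
```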
@@ -703,8 +877,8 @@ class FeatrixSphereClient:
            >>> print(f"Model card recreated: {model_card['model_info']['name']}")
        """
        response = self._make_request(
-            "POST",
-            f"/session/{session_id}/model_card",
+            "GET",
+            f"/compute/session/{session_id}/model_card",
            max_retries=max_retries
        )
        return response.json()
@@ -1424,16 +1598,141 @@ class FeatrixSphereClient:
             job_queue_positions={}
         )
 
+    def fine_tune_embedding_space(
+        self,
+        name: str,
+        parent_session_id: str = None,
+        parent_embedding_space_path: str = None,
+        s3_training_dataset: str = None,
+        s3_validation_dataset: str = None,
+        webhooks: Dict[str, str] = None
+    ) -> SessionInfo:
+        """
+        Fine-tune an existing embedding space on new data.
+
+        This method takes a pre-trained embedding space (the "parent") and fine-tunes it
+        on a new dataset with the same columns. The number of training epochs is automatically
+        calculated based on the dataset size ratio to ensure optimal training.
+
+        **How Epoch Calculation Works:**
+        - The system calculates F = len(new_dataset) / len(old_dataset)
+        - New epochs = original_epochs / F
+        - If new dataset is smaller (F < 1), more epochs are used (to see data enough times)
+        - If new dataset is larger (F > 1), fewer epochs are used (less repetition needed)
+
+        **Example:**
+        - Original: 1000 rows, trained for 100 epochs
+        - New: 500 rows → F = 0.5 → 100/0.5 = 200 epochs
+        - New: 2000 rows → F = 2.0 → 100/2.0 = 50 epochs
+
+        This ensures the model sees the new data an appropriate number of times relative
+        to how much it saw the original data.
+
+        Args:
+            name: Name for the fine-tuned embedding space
+            parent_session_id: Session ID of the parent embedding space (optional)
+            parent_embedding_space_path: Direct path to parent embedding space pickle file (optional)
+            s3_training_dataset: S3 URL for new training dataset (must start with 's3://')
+            s3_validation_dataset: S3 URL for new validation dataset (must start with 's3://')
+            webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
+
+        Returns:
+            SessionInfo for the newly created fine-tuning session
+
+        Raises:
+            ValueError: If S3 URLs are invalid or neither parent identifier is provided
+
+        Example:
+            ```python
+            # Fine-tune an existing embedding space on new data
+            client = FeatrixSphereClient("https://sphere-api.featrix.com")
+
+            # Option 1: Use parent session ID
+            fine_tuned = client.fine_tune_embedding_space(
+                name="customer_behavior_v2",
+                parent_session_id="abc123-20240101-120000",
+                s3_training_dataset="s3://my-bucket/new_training_data.csv",
+                s3_validation_dataset="s3://my-bucket/new_validation_data.csv"
+            )
+
+            # Option 2: Use direct path to parent embedding space
+            fine_tuned = client.fine_tune_embedding_space(
+                name="customer_behavior_v2",
+                parent_embedding_space_path="/path/to/parent/embedded_space.pickle",
+                s3_training_dataset="s3://my-bucket/new_training_data.csv",
+                s3_validation_dataset="s3://my-bucket/new_validation_data.csv"
+            )
+
+            # Wait for fine-tuning to complete
+            client.wait_for_session_completion(fine_tuned.session_id)
+
+            # Use the fine-tuned model for predictions
+            result = client.predict(fine_tuned.session_id, {"feature1": "value1"})
+            ```
+        """
+        # Validate S3 URLs
+        if s3_training_dataset and not s3_training_dataset.startswith('s3://'):
+            raise ValueError("s3_training_dataset must be a valid S3 URL (s3://...)")
+        if s3_validation_dataset and not s3_validation_dataset.startswith('s3://'):
+            raise ValueError("s3_validation_dataset must be a valid S3 URL (s3://...)")
+
+        # Validate that we have either parent_session_id or parent_embedding_space_path
+        if not parent_session_id and not parent_embedding_space_path:
+            raise ValueError("Either parent_session_id or parent_embedding_space_path must be provided")
+
+        print(f"Fine-tuning embedding space '{name}'...")
+        if parent_session_id:
+            print(f"   Parent session: {parent_session_id}")
+        if parent_embedding_space_path:
+            print(f"   Parent embedding space: {parent_embedding_space_path}")
+        print(f"   New training data: {s3_training_dataset}")
+        print(f"   New validation data: {s3_validation_dataset}")
+
+        data = {
+            "name": name,
+            "s3_file_data_set_training": s3_training_dataset,
+            "s3_file_data_set_validation": s3_validation_dataset
+        }
+
+        if parent_session_id:
+            data["parent_session_id"] = parent_session_id
+        if parent_embedding_space_path:
+            data["parent_embedding_space_path"] = parent_embedding_space_path
+
+        if webhooks:
+            data['webhooks'] = webhooks
+
+        response_data = self._post_json("/compute/fine-tune-embedding-space", data)
+
+        session_id = response_data.get('session_id')
+        fine_tune_info = response_data.get('fine_tune_info', {})
+
+        print(f"Fine-tuning session created: {session_id}")
+        if fine_tune_info:
+            print(f"   Original dataset: {fine_tune_info.get('original_train_size', 'N/A')} rows")
+            print(f"   New dataset: {fine_tune_info.get('new_total_size', 'N/A')} rows")
+            print(f"   Dataset ratio (F): {fine_tune_info.get('F', 'N/A'):.4f}")
+            print(f"   Original epochs: {fine_tune_info.get('original_epochs', 'N/A')}")
+            print(f"   Calculated epochs: {fine_tune_info.get('calculated_epochs', 'N/A')}")
+
+        return SessionInfo(
+            session_id=session_id,
+            session_type=response_data.get('session_type', 'embedding_space_finetune'),
+            status=response_data.get('status', 'ready'),
+            jobs={},
+            job_queue_positions={}
+        )
+
     # =========================================================================
     # File Upload
     # =========================================================================
 
     def upload_file_and_create_session(self, file_path: Path, session_name_prefix: str = None, name: str = None, webhooks: Dict[str, str] = None) -> SessionInfo:
         """
-        Upload a CSV file and create a new session.
+        Upload a CSV, Parquet, JSON, or JSONL file and create a new session.
 
         Args:
-            file_path: Path to the CSV file to upload
+            file_path: Path to the CSV, Parquet, JSON, or JSONL file to upload
            session_name_prefix: Optional prefix for the session ID. Session will be named <prefix>-<full-uuid>
            name: Optional name for the embedding space/model (for identification and metadata)
            webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
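The epoch rule described in the fine-tuning docstring above is plain arithmetic. A standalone sketch of the calculation (the function name and the rounding behavior are assumptions; the real computation happens server-side):

```python
# Illustrative reimplementation of the documented rule: F = new/old, epochs = original/F.
def estimate_finetune_epochs(original_rows: int, new_rows: int, original_epochs: int) -> int:
    F = new_rows / original_rows            # dataset size ratio
    return max(1, round(original_epochs / F))

print(estimate_finetune_epochs(1000, 500, 100))   # 200: smaller dataset, more epochs
print(estimate_finetune_epochs(1000, 2000, 100))  # 50: larger dataset, fewer epochs
```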
@@ -1491,12 +1790,13 @@ class FeatrixSphereClient:
                                      string_list_delimiter: str = "|",
                                      important_columns_for_visualization: List[str] = None,
                                      metadata: Dict[str, Any] = None,
+                                     user_metadata: Dict[str, Any] = None,  # User metadata for ES/SP identification (max 32KB)
                                      session_name_prefix: str = None,
                                      name: str = None,
                                      webhooks: Dict[str, str] = None,
                                      epochs: int = None) -> SessionInfo:
         """
-        Upload a pandas DataFrame or CSV file and create a new session.
+        Upload a pandas DataFrame, CSV file, Parquet file, JSON file, or JSONL file and create a new session.
 
         Special Column: __featrix_train_predictor
         ------------------------------------------
@@ -1504,7 +1804,7 @@ class FeatrixSphereClient:
        which rows are used for single predictor training.
 
        How it works:
-        - Add a boolean column "__featrix_train_predictor" to your DataFrame/CSV before upload
+        - Add a boolean column "__featrix_train_predictor" to your DataFrame/CSV/Parquet/JSON/JSONL before upload
        - Set it to True for rows you want to use for predictor training
        - Set it to False (or any other value) for rows to exclude from predictor training
        - Embedding space training uses ALL rows (ignores this column)
@@ -1538,7 +1838,7 @@ class FeatrixSphereClient:
         Args:
             df: pandas DataFrame to upload (optional if file_path is provided)
             filename: Name to give the uploaded file (default: "data.csv")
-            file_path: Path to CSV file to upload (optional if df is provided)
+            file_path: Path to CSV, Parquet, JSON, or JSONL file to upload (optional if df is provided)
             column_overrides: Dict mapping column names to types ("scalar", "set", "free_string", "free_string_list")
             column_types: Alias for column_overrides (for backward compatibility)
             string_list_delimiter: Delimiter for free_string_list columns (default: "|")
@@ -1579,21 +1879,90 @@ class FeatrixSphereClient:
             if not os.path.exists(file_path):
                 raise FileNotFoundError(f"File not found: {file_path}")
 
-            # Check if it's a CSV file
-            if not file_path.lower().endswith(('.csv', '.csv.gz')):
-                raise ValueError("File must be a CSV file (with .csv or .csv.gz extension)")
+            # Check if it's a supported file type
+            file_ext = file_path.lower()
+            if not file_ext.endswith(('.csv', '.csv.gz', '.parquet', '.json', '.jsonl')):
+                raise ValueError("File must be a CSV, Parquet, JSON, or JSONL file (with .csv, .csv.gz, .parquet, .json, or .jsonl extension)")
 
             print(f"Uploading file: {file_path}")
 
             # Read the file content
             if file_path.endswith('.gz'):
-                # Already gzipped
+                # Already gzipped CSV
                 with gzip.open(file_path, 'rb') as f:
                     file_content = f.read()
                 upload_filename = os.path.basename(file_path)
                 content_type = 'application/gzip'
+            elif file_path.lower().endswith(('.json', '.jsonl')):
+                # JSON/JSONL file - read as DataFrame, convert to CSV, then compress
+                print(f"Reading {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} file...")
+                try:
+                    from featrix.neural.input_data_file import featrix_wrap_read_json_file
+                    json_df = featrix_wrap_read_json_file(file_path)
+                    if json_df is None:
+                        raise ValueError(f"Failed to parse {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} file")
+                except ImportError:
+                    # Fallback to pandas if featrix wrapper not available
+                    if file_path.lower().endswith('.jsonl'):
+                        # JSONL: one JSON object per line
+                        import json
+                        records = []
+                        with open(file_path, 'r', encoding='utf-8') as f:
+                            for line in f:
+                                if line.strip():
+                                    records.append(json.loads(line))
+                        json_df = pd.DataFrame(records)
+                    else:
+                        # Regular JSON
+                        json_df = pd.read_json(file_path)
+
+                # Clean NaN values before CSV conversion
+                cleaned_df = json_df.where(pd.notna(json_df), None)
+
+                # Convert to CSV and compress
+                csv_buffer = io.StringIO()
+                cleaned_df.to_csv(csv_buffer, index=False)
+                csv_data = csv_buffer.getvalue().encode('utf-8')
+
+                print(f"Compressing {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} (converted to CSV)...")
+                compressed_buffer = io.BytesIO()
+                with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+                    gz.write(csv_data)
+                file_content = compressed_buffer.getvalue()
+                upload_filename = os.path.basename(file_path).replace('.jsonl', '.csv.gz').replace('.json', '.csv.gz')
+                content_type = 'application/gzip'
+
+                original_size = len(csv_data)
+                compressed_size = len(file_content)
+                compression_ratio = (1 - compressed_size / original_size) * 100
+                print(f"Converted {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
+            elif file_path.lower().endswith('.parquet'):
+                # Parquet file - read as DataFrame, convert to CSV, then compress
+                print("Reading Parquet file...")
+                parquet_df = pd.read_parquet(file_path)
+
+                # Clean NaN values before CSV conversion
+                cleaned_df = parquet_df.where(pd.notna(parquet_df), None)
+
+                # Convert to CSV and compress
+                csv_buffer = io.StringIO()
+                cleaned_df.to_csv(csv_buffer, index=False)
+                csv_data = csv_buffer.getvalue().encode('utf-8')
+
+                print("Compressing Parquet (converted to CSV)...")
+                compressed_buffer = io.BytesIO()
+                with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+                    gz.write(csv_data)
+                file_content = compressed_buffer.getvalue()
+                upload_filename = os.path.basename(file_path).replace('.parquet', '.csv.gz')
+                content_type = 'application/gzip'
+
+                original_size = len(csv_data)
+                compressed_size = len(file_content)
+                compression_ratio = (1 - compressed_size / original_size) * 100
+                print(f"Converted Parquet to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
             else:
-                # Read CSV and compress it
+                # Regular CSV file - read and compress it
                 with open(file_path, 'rb') as f:
                     csv_content = f.read()
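The JSON, JSONL, and Parquet branches above all funnel through the same conversion step: load into a DataFrame, normalize NaNs, and ship as gzip-compressed CSV. A condensed standalone sketch of that step (the helper name is hypothetical; the client inlines this logic per branch):

```python
import gzip
import io

import pandas as pd

def dataframe_to_csv_gz(df: pd.DataFrame) -> bytes:
    """Serialize a DataFrame as gzipped CSV bytes, mirroring the upload path above."""
    cleaned = df.where(pd.notna(df), None)                 # normalize NaN to None
    csv_bytes = cleaned.to_csv(index=False).encode('utf-8')
    buf = io.BytesIO()
    with gzip.GzipFile(fileobj=buf, mode='wb') as gz:
        gz.write(csv_bytes)
    return buf.getvalue()

# Reading Parquet assumes pyarrow or fastparquet is installed.
payload = dataframe_to_csv_gz(pd.read_parquet("data.parquet"))
```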
 
@@ -1663,6 +2032,10 @@ class FeatrixSphereClient:
             import json
             data['metadata'] = json.dumps(metadata)
             print(f"Session metadata: {metadata}")
+        if user_metadata:
+            import json
+            data['user_metadata'] = json.dumps(user_metadata)
+            print(f"User metadata: {user_metadata}")
         if session_name_prefix:
             data['session_name_prefix'] = session_name_prefix
             print(f"Session name prefix: {session_name_prefix}")
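Both update_user_metadata and the user_metadata upload field above document a 32KB cap. A client-side pre-check is easy to add; a sketch (how the server measures the limit, e.g. against the serialized JSON, is an assumption):

```python
import json

MAX_USER_METADATA_BYTES = 32 * 1024  # documented 32KB cap

def check_user_metadata_size(user_metadata: dict) -> None:
    # Assumes the server measures the serialized JSON payload.
    size = len(json.dumps(user_metadata).encode('utf-8'))
    if size > MAX_USER_METADATA_BYTES:
        raise ValueError(f"user_metadata is {size:,} bytes; the documented limit is 32KB")

check_user_metadata_size({"experiment": "A", "owner": "data-team"})
```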
@@ -3239,6 +3612,24 @@ class FeatrixSphereClient:
         response_data = self._delete_json(f"/session/{session_id}/predictor", params=params, max_retries=max_retries)
         return response_data
 
+    def mark_for_deletion(self, session_id: str, max_retries: int = None) -> Dict[str, Any]:
+        """
+        Mark a session for deletion. The session will be deleted by the garbage collection process.
+
+        Args:
+            session_id: Session ID to mark for deletion
+            max_retries: Number of retries for errors (default: uses client default)
+
+        Returns:
+            Dictionary with confirmation that the session was marked for deletion
+
+        Example:
+            result = client.mark_for_deletion("session_123")
+            print(result)  # {"status": "marked", "session_id": "session_123"}
+        """
+        response_data = self._post_json(f"/compute/session/{session_id}/mark_for_deletion", max_retries=max_retries)
+        return response_data
+
 
     def _create_interactive_training_movie(training_metrics, epoch_projections, session_id,
                                            show_embedding_evolution, show_loss_evolution):
@@ -3720,12 +4111,12 @@ class FeatrixSphereClient:
 
     def train_on_foundational_model(self, foundation_model_id: str, target_column: str, target_column_type: str,
                                     input_filename: str = None,
+                                    df = None,
                                     name: str = None,
                                     session_name_prefix: str = None,
-                                    epochs: int = 0, batch_size: int = 0, learning_rate: float = 0.001,
-                                    positive_label: str = None,
+                                    epochs: int = 0,
+                                    rare_label_value: str = None,
                                     class_imbalance: dict = None,
-                                    optimize_for: str = "balanced",
                                     poll_interval: int = 30, max_poll_time: int = 3600,
                                     verbose: bool = True,
                                     webhooks: Dict[str, str] = None) -> SessionInfo:
@@ -3741,14 +4132,13 @@ class FeatrixSphereClient:
             target_column: Name of the target column to predict
             target_column_type: Type of target column ("set" or "scalar")
             input_filename: Optional input data file (uses foundation model's data if not provided)
+            df: Optional pandas DataFrame with training data (uses foundation model's data if not provided).
+                Use input_filename OR df (not both) to train predictor on different data than the foundation model.
             name: Optional name for the new session
             session_name_prefix: Optional prefix for session ID. Session will be named <prefix>-<uuid>
             epochs: Number of training epochs (default: 0; automatic)
-            batch_size: Training batch size (default: 0; automatic)
-            learning_rate: Learning rate for training (default: 0.001)
-            positive_label: For binary classification, which class is "positive" for metrics (default: None)
+            rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
             class_imbalance: Expected class ratios/counts from real world for sampled data (default: None)
-            optimize_for: Optimization target - "balanced" (F1 score), "precision", or "recall" (default: "balanced")
             poll_interval: Seconds between status checks when job is already running (default: 30)
             max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
             verbose: Whether to print status updates during polling (default: True)
@@ -3760,44 +4150,125 @@ class FeatrixSphereClient:
         print(f"Training predictor on foundation model {foundation_model_id}...")
         print(f"   Target: {target_column} ({target_column_type})")
 
-        data = {
-            "foundation_model_id": foundation_model_id,
-            "target_column": target_column,
-            "target_column_type": target_column_type,
-            "epochs": epochs,
-            "batch_size": batch_size,
-            "learning_rate": learning_rate,
-            "optimize_for": optimize_for,
-        }
-
-        if input_filename:
-            # Clean up input_filename: extract just the filename if it's an absolute path
-            # The file should be uploaded first or already exist on the server
-            from pathlib import Path
-            input_path = Path(input_filename)
-            if input_path.is_absolute():
-                # Extract just the filename - client should upload file first
-                cleaned_filename = input_path.name
-                print(f"⚠️ Note: Extracted filename '{cleaned_filename}' from absolute path '{input_filename}'")
-                print(f"   Make sure the file has been uploaded to the server first")
-                data["input_filename"] = cleaned_filename
-            else:
-                data["input_filename"] = input_filename
-        if name:
-            data["name"] = name
-        if session_name_prefix:
-            data["session_name_prefix"] = session_name_prefix
-        if positive_label:
-            data["positive_label"] = positive_label
-        if class_imbalance:
-            data["class_imbalance"] = class_imbalance
-        if webhooks:
-            data["webhooks"] = webhooks
-
-        response_data = self._post_json("/compute/train_on_foundational_model", data)
+        # Get the compute cluster from the foundation model session
+        # This ensures we upload files to the same node where the foundation model lives
+        foundation_session = self.get_session_status(foundation_model_id)
+        foundation_compute_cluster = self.get_last_server_metadata()
+        foundation_compute_cluster = foundation_compute_cluster.get('compute_cluster') if foundation_compute_cluster else None
+
+        # Temporarily set compute cluster for file uploads if we found one
+        original_compute_cluster = self.compute_cluster
+        original_headers = self.session.headers.copy()
+        if foundation_compute_cluster:
+            self.set_compute_cluster(foundation_compute_cluster)
+            if verbose:
+                print(f"   Using compute cluster: {foundation_compute_cluster}")
 
-        new_session_id = response_data.get('session_id')
-        print(f"✅ Predictor training session created: {new_session_id}")
+        try:
+            # Validate that only one data source is provided
+            if input_filename and df is not None:
+                raise ValueError("Provide either input_filename or df, not both")
+
+            # Prepare multipart form data (like train_single_predictor_with_file does)
+            files = None
+            data = {
+                "foundation_model_id": foundation_model_id,
+                "target_column": target_column,
+                "target_column_type": target_column_type,
+                "epochs": str(epochs),
+            }
+
+            # Handle file upload - send file directly in multipart form
+            if df is not None:
+                import pandas as pd
+                import tempfile
+                import os
+
+                if not isinstance(df, pd.DataFrame):
+                    raise ValueError("df must be a pandas DataFrame")
+
+                if verbose:
+                    print(f"📊 Using provided DataFrame ({len(df)} rows, {len(df.columns)} columns)")
+
+                # Create temporary CSV file
+                temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False)
+                temp_file_path = temp_file.name
+                temp_file.close()
+
+                # Save DataFrame to temp file
+                df.to_csv(temp_file_path, index=False)
+
+                if verbose:
+                    print(f"📁 Saved to temporary file: {os.path.basename(temp_file_path)}")
+                    print(f"📤 Uploading file directly with training request...")
+
+                # Send file in multipart form
+                files = {'file': (os.path.basename(temp_file_path), open(temp_file_path, 'rb'), 'text/csv')}
+
+            elif input_filename:
+                # If absolute path provided, send file directly
+                from pathlib import Path
+                input_path = Path(input_filename)
+                if input_path.is_absolute():
+                    if not input_path.exists():
+                        raise FileNotFoundError(f"Input file not found: {input_filename}")
+
+                    if verbose:
+                        print(f"📤 Sending file directly from absolute path: {input_filename}")
+
+                    # Send file in multipart form
+                    files = {'file': (input_path.name, open(input_path, 'rb'), 'text/csv' if input_path.suffix == '.csv' else 'application/gzip')}
+                else:
+                    # Relative filename - assume it's already on the server
+                    data["input_filename"] = input_filename
+
+            if name:
+                data["name"] = name
+            if session_name_prefix:
+                data["session_name_prefix"] = session_name_prefix
+            if rare_label_value:
+                data["rare_label_value"] = rare_label_value
+            if class_imbalance:
+                import json
+                data["class_imbalance"] = json.dumps(class_imbalance)
+            if webhooks:
+                import json
+                data["webhooks"] = json.dumps(webhooks)
+
+            # Send request with file if provided
+            try:
+                if files:
+                    response = self._make_request("POST", "/compute/train_on_foundational_model", files=files, data=data)
+                else:
+                    response = self._make_request("POST", "/compute/train_on_foundational_model", json=data)
+                response_data = response.json()
+            finally:
+                # Close file handles
+                if files and 'file' in files:
+                    files['file'][1].close()
+                # Clean up temp file if we created one
+                if df is not None and temp_file_path:
+                    try:
+                        os.unlink(temp_file_path)
+                    except Exception:
+                        pass
+
+            new_session_id = response_data.get('session_id')
+            print(f"✅ Predictor training session created: {new_session_id}")
+
+            # Restore original compute cluster setting
+            if original_compute_cluster != self.compute_cluster:
+                if original_compute_cluster:
+                    self.set_compute_cluster(original_compute_cluster)
+                else:
+                    self.session.headers = original_headers
+        finally:
+            # Ensure we restore headers even if there's an error
+            if original_compute_cluster != self.compute_cluster:
+                if original_compute_cluster:
+                    self.set_compute_cluster(original_compute_cluster)
+                else:
+                    self.session.headers = original_headers
 
         if verbose:
             print(f"⏳ Waiting for training to complete...")
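A minimal usage sketch for the new df path of train_on_foundational_model (the foundation model ID and column names are placeholders):

```python
import pandas as pd

# Hypothetical rows with the same columns the foundation model was trained on.
new_rows = pd.DataFrame({
    "feature1": ["a", "b", "c"],
    "approved": ["yes", "no", "yes"],
})

session = client.train_on_foundational_model(
    foundation_model_id="fm-session-123",   # placeholder ID
    target_column="approved",
    target_column_type="set",
    df=new_rows,                  # or input_filename=..., but not both
    rare_label_value="no",        # minority class used for precision/recall metrics
)
```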
@@ -3821,9 +4292,10 @@ class FeatrixSphereClient:
                                df = None,
                                epochs: int = 0,
                                validation_ignore_columns: List[str] = None,
-                               positive_label: str = None,
+                               rare_label_value: str = None,
                                class_imbalance: dict = None,
-                               optimize_for: str = "balanced",
+                               cost_false_positive: float = None,
+                               cost_false_negative: float = None,
                                poll_interval: int = 30, max_poll_time: int = 3600,
                                verbose: bool = True,
                                webhooks: Dict[str, str] = None) -> Dict[str, Any]:
@@ -4001,24 +4473,6 @@ class FeatrixSphereClient:
         This happens automatically - no configuration needed. The system invests a few seconds
         in analysis to deliver significantly better models.
 
-        Understanding optimize_for:
-        ---------------------------
-        The optimize_for parameter controls which loss function and training strategy is used,
-        optimizing for different aspects of model performance:
-
-        - "balanced" (default): Optimizes for F1 score (harmonic mean of precision and recall).
-          Uses FocalLoss with class weights. Best for general-purpose classification where you
-          want balanced performance across all classes.
-
-        - "precision": Optimizes for precision (minimizing false positives). Uses FocalLoss with
-          class weights, which focuses training on hard-to-classify examples. Best when false
-          positives are costly (e.g., fraud detection where flagging legitimate transactions
-          as fraud is expensive).
-
-        - "recall": Optimizes for recall (minimizing false negatives). Uses CrossEntropyLoss
-          with class weights that strongly boost the minority class. Best when false negatives
-          are costly (e.g., medical diagnosis where missing a disease is dangerous).
-
         Understanding class_imbalance:
         ------------------------------
         For imbalanced datasets where your training data doesn't reflect real-world class
@@ -4036,14 +4490,14 @@ class FeatrixSphereClient:
 
         If not provided, class weights are computed from your training data distribution.
 
-        Understanding positive_label:
+        Understanding rare_label_value:
        -----------------------------
-        For binary classification, positive_label specifies which class is considered the
-        "positive" class for computing metrics like precision, recall, and ROC-AUC.
+        For binary classification, rare_label_value specifies which class is the rare/minority
+        class for computing metrics like precision, recall, and ROC-AUC.
 
         Example: For a credit risk model predicting "good" vs "bad" loans:
 
-            positive_label="bad"  # We want to detect bad loans
+            rare_label_value="bad"  # "bad" is the rare class we want to detect
 
         This affects how metrics are reported:
         - Precision = True Positives / (True Positives + False Positives)
@@ -4124,7 +4578,7 @@ class FeatrixSphereClient:
                 session_id=session.session_id,
                 target_column='approved',
                 target_column_type='set',
-                positive_label='yes'
+                rare_label_value='yes'
             )
             ```
 
@@ -4139,8 +4593,7 @@ class FeatrixSphereClient:
                 target_column='approved',
                 target_column_type='set',
                 class_imbalance={'approved': 0.97, 'rejected': 0.03},
-                optimize_for='recall',  # Don't miss rejections
-                positive_label='rejected'
+                rare_label_value='rejected'
             )
 
             # System will:
@@ -4159,8 +4612,7 @@ class FeatrixSphereClient:
                 session_id=session.session_id,
                 target_column='is_fraud',
                 target_column_type='set',
-                positive_label='fraud',
-                optimize_for='precision',  # Minimize false alarms
+                rare_label_value='fraud',
                 class_imbalance={'legitimate': 0.999, 'fraud': 0.001}
             )
             ```
@@ -4174,8 +4626,7 @@ class FeatrixSphereClient:
                 session_id=session.session_id,
                 target_column='has_disease',
                 target_column_type='set',
-                positive_label='positive',
-                optimize_for='recall'  # Don't miss any cases
+                rare_label_value='positive'
             )
             ```
 
@@ -4189,7 +4640,7 @@ class FeatrixSphereClient:
                 target_column='churn',
                 target_column_type='set',
                 validation_ignore_columns=['customer_id', 'signup_date'],
-                positive_label='churned'
+                rare_label_value='churned'
             )
             ```
 
@@ -4215,9 +4666,8 @@ class FeatrixSphereClient:
                 embedding space! If neither provided, uses session's original data file.
             epochs: Number of training epochs (default: 0; automatic)
             validation_ignore_columns: List of column names to exclude from validation queries (default: None)
-            positive_label: For binary classification, which class is "positive" for metrics (default: None)
+            rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
             class_imbalance: Expected class ratios/counts from real world for sampled data (default: None)
-            optimize_for: Optimization target - "balanced" (F1 score), "precision", or "recall" (default: "balanced")
             poll_interval: Seconds between status checks when job is already running (default: 30)
             max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
             verbose: Whether to print status updates during polling (default: True)
@@ -4234,6 +4684,17 @@ class FeatrixSphereClient:
         if file_path and df is not None:
             raise ValueError("Provide either file_path or df, not both")
 
+        # Validate cost parameters
+        if cost_false_positive is not None or cost_false_negative is not None:
+            if cost_false_positive is None or cost_false_negative is None:
+                raise ValueError("Both cost_false_positive and cost_false_negative must be specified together")
+            if target_column_type != "set":
+                raise ValueError("cost_false_positive and cost_false_negative are only valid for target_column_type='set' (classification), not 'scalar' (regression)")
+            if cost_false_positive <= 0 or cost_false_negative <= 0:
+                raise ValueError("cost_false_positive and cost_false_negative must be positive numbers")
+            if verbose:
+                print(f"💰 Cost-based optimization enabled: FP cost={cost_false_positive}, FN cost={cost_false_negative}")
+
         # If DataFrame provided, save to temp file and use file_path logic
         temp_file = None
         if df is not None:
@@ -4264,9 +4725,10 @@ class FeatrixSphereClient:
                 target_column=target_column,
                 target_column_type=target_column_type,
                 epochs=epochs,
-                positive_label=positive_label,
+                rare_label_value=rare_label_value,
                 class_imbalance=class_imbalance,
-                optimize_for=optimize_for,
+                cost_false_positive=cost_false_positive,
+                cost_false_negative=cost_false_negative,
                 verbose=verbose,
                 webhooks=webhooks
             )
@@ -4277,10 +4739,12 @@ class FeatrixSphereClient:
             "target_column_type": target_column_type,
             "epochs": epochs,
             "validation_ignore_columns": validation_ignore_columns or [],
-            "positive_label": positive_label,
-            "class_imbalance": class_imbalance,
-            "optimize_for": optimize_for
+            "rare_label_value": rare_label_value,
+            "class_imbalance": class_imbalance
         }
+        if cost_false_positive is not None and cost_false_negative is not None:
+            data["cost_false_positive"] = cost_false_positive
+            data["cost_false_negative"] = cost_false_negative
         if webhooks:
             data['webhooks'] = webhooks
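The cost parameters replace the removed optimize_for presets with explicit error economics: instead of choosing a named loss strategy, the caller states how expensive each error type is. A hedged usage sketch (the surrounding docstring examples do not show the method name, so train_single_predictor is assumed here, and the cost values are illustrative):

```python
# Fraud-style setup: a missed fraud case (false negative) is assumed to cost
# 50x more than a false alarm (false positive). Values are illustrative.
result = client.train_single_predictor(      # assumed method name
    session_id=session.session_id,
    target_column="is_fraud",
    target_column_type="set",                # costs are only valid for classification
    rare_label_value="fraud",
    cost_false_positive=1.0,
    cost_false_negative=50.0,
)
```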
 
@@ -4577,9 +5041,8 @@ class FeatrixSphereClient:
 
     def train_predictor_more(self, session_id: str, epochs: int = 50,
                              predictor_id: str = None, target_column: str = None,
-                             batch_size: int = 0, learning_rate: float = None,
                              poll_interval: int = 30, max_poll_time: int = 3600,
-                             verbose: bool = True) -> Dict[str, Any]:
+                             verbose: bool = True, webhooks: Dict[str, str] = None) -> Dict[str, Any]:
         """
         Continue training an existing single predictor for more epochs.
         Loads the existing predictor and resumes training from where it left off.
@@ -4589,11 +5052,10 @@ class FeatrixSphereClient:
             epochs: Additional epochs to train (required)
             predictor_id: Predictor ID to continue training (optional, highest priority)
             target_column: Target column name to find predictor (optional, alternative to predictor_id)
-            batch_size: Batch size for continuation (0 = use existing from predictor)
-            learning_rate: Learning rate for continuation (None = use existing from predictor)
             poll_interval: Seconds between status checks (default: 30)
             max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
             verbose: Whether to print status updates (default: True)
+            webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
 
         Returns:
             Response with continuation start confirmation or completion status
@@ -4616,15 +5078,14 @@ class FeatrixSphereClient:
 
         data = {
             "epochs": epochs,
-            "batch_size": batch_size,
         }
 
         if predictor_id:
             data["predictor_id"] = predictor_id
         if target_column:
             data["target_column"] = target_column
-        if learning_rate is not None:
-            data["learning_rate"] = learning_rate
+        if webhooks:
+            data["webhooks"] = webhooks
 
         if verbose:
             print(f"🔄 Continuing training for predictor on session {session_id}")
@@ -4714,6 +5175,139 @@ class FeatrixSphereClient:
             print(f"❌ Error starting predictor continuation: {e}")
             raise
 
+    def foundation_model_train_more(self, session_id: str, es_id: str = None, data_passes: int = None,
+                                    epochs: int = None, poll_interval: int = 30, max_poll_time: int = 3600,
+                                    verbose: bool = True, webhooks: Dict[str, str] = None) -> Dict[str, Any]:
+        """
+        Continue training an existing foundation model (embedding space) for more epochs.
+        Loads the existing embedding space and resumes training from where it left off.
+
+        Args:
+            session_id: Session ID containing the trained foundation model
+            es_id: Embedding space ID (optional, uses session's ES if not provided)
+            data_passes: Additional epochs to train (preferred, default: 50)
+            epochs: Additional epochs to train (deprecated, use data_passes instead, for compatibility)
+            poll_interval: Seconds between status checks (default: 30)
+            max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
+            verbose: Whether to print status updates (default: True)
+            webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
+
+        Returns:
+            Response with continuation start confirmation or completion status
+
+        Example:
+            ```python
+            # Continue training for 50 more epochs
+            result = client.foundation_model_train_more(
+                session_id="abc123",
+                data_passes=50
+            )
+            ```
+        """
+        # Support both data_passes and epochs for compatibility
+        if data_passes is None and epochs is None:
+            data_passes = 50  # Default
+        elif data_passes is None:
+            data_passes = epochs  # Use epochs if data_passes not provided
+        # If both provided, data_passes takes precedence
+
+        if data_passes <= 0:
+            raise ValueError("data_passes (or epochs) must be > 0 (specify additional epochs to train)")
+
+        data = {
+            "data_passes": data_passes,
+        }
+
+        if es_id:
+            data["es_id"] = es_id
+        if webhooks:
+            data["webhooks"] = webhooks
+
+        if verbose:
+            print(f"🔄 Continuing training for foundation model on session {session_id}")
+            print(f"   Additional epochs: {data_passes}")
+            if es_id:
+                print(f"   ES ID: {es_id}")
+
+        try:
+            response_data = self._post_json(f"/compute/session/{session_id}/train_foundation_model_more", data)
+
+            if verbose:
+                print(f"✅ Foundation model continuation started: {response_data.get('message')}")
+
+            # Poll for completion if requested
+            if poll_interval > 0 and max_poll_time > 0:
+                import time
+                start_time = time.time()
+                last_status = ""
+
+                while time.time() - start_time < max_poll_time:
+                    try:
+                        session_info = self.get_session_status(session_id)
+                        jobs = session_info.jobs if hasattr(session_info, 'jobs') else {}
+
+                        # Find continuation jobs
+                        es_jobs = {j_id: j for j_id, j in jobs.items()
+                                   if j.get('type') == 'train_es'}
+
+                        if not es_jobs:
+                            if verbose:
+                                print("✅ No continuation jobs found - training may have completed")
+                            break
+
+                        # Check job statuses
+                        running_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'running']
+                        completed_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'done']
+                        failed_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'failed']
+
+                        current_status = f"Running: {len(running_jobs)}, Done: {len(completed_jobs)}, Failed: {len(failed_jobs)}"
+                        if current_status != last_status and verbose:
+                            print(f"📊 Status: {current_status}")
+                        last_status = current_status
+
+                        if not running_jobs and (completed_jobs or failed_jobs):
+                            if completed_jobs:
+                                if verbose:
+                                    print(f"✅ Foundation model continuation completed successfully!")
+                                return {
+                                    "message": "Foundation model continuation completed successfully",
+                                    "session_id": session_id,
+                                    "status": "completed",
+                                    "additional_epochs": data_passes
+                                }
+                            else:
+                                if verbose:
+                                    print(f"❌ Foundation model continuation failed")
+                                return {
+                                    "message": "Foundation model continuation failed",
+                                    "session_id": session_id,
+                                    "status": "failed",
+                                    "failed_jobs": failed_jobs
+                                }
+
+                        time.sleep(poll_interval)
+                    except Exception as poll_error:
+                        if verbose:
+                            print(f"⚠️ Error during polling: {poll_error}")
+                        time.sleep(poll_interval)
+
+                # Timeout
+                if verbose:
+                    print(f"⏱️ Polling timeout reached ({max_poll_time}s)")
+                return {
+                    "message": "Polling timeout",
+                    "session_id": session_id,
+                    "status": "timeout",
+                    "additional_epochs": data_passes
+                }
+
+            return response_data
+
+        except Exception as e:
+            if verbose:
+                print(f"❌ Error starting foundation model continuation: {e}")
+            raise
+
     def _train_single_predictor_with_file(
         self,
         session_id: str,
@@ -4721,10 +5315,11 @@ class FeatrixSphereClient:
         target_column: str,
         target_column_type: str,
         epochs: int,
-        positive_label: str,
+        rare_label_value: str,
         class_imbalance: dict,
-        optimize_for: str,
-        verbose: bool,
+        cost_false_positive: float = None,
+        cost_false_negative: float = None,
+        verbose: bool = True,
         webhooks: Dict[str, str] = None
     ) -> Dict[str, Any]:
         """
@@ -4749,16 +5344,19 @@ class FeatrixSphereClient:
         data = {
             'target_column': target_column,
             'target_column_type': target_column_type,
-            'epochs': str(epochs),
-            'optimize_for': optimize_for,
+            'epochs': str(epochs)
         }
 
-        if positive_label:
-            data['positive_label'] = positive_label
+        if rare_label_value:
+            data['rare_label_value'] = rare_label_value
 
         if class_imbalance:
             data['class_imbalance'] = json.dumps(class_imbalance)
 
+        if cost_false_positive is not None and cost_false_negative is not None:
+            data['cost_false_positive'] = str(cost_false_positive)
+            data['cost_false_negative'] = str(cost_false_negative)
+
 
         if webhooks:
             data['webhooks'] = json.dumps(webhooks)
@@ -5785,7 +6383,24 @@ class FeatrixSphereClient:
         if not file_path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
 
-        df = pd.read_csv(file_path)
+        # Support CSV, Parquet, JSON, and JSONL files
+        file_path_str = str(file_path).lower()
+        if file_path_str.endswith('.parquet'):
+            df = pd.read_parquet(file_path)
+        elif file_path_str.endswith('.jsonl'):
+            # JSONL: one JSON object per line
+            import json
+            records = []
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    if line.strip():
+                        records.append(json.loads(line))
+            df = pd.DataFrame(records)
+        elif file_path_str.endswith('.json'):
+            # Regular JSON
+            df = pd.read_json(file_path)
+        else:
+            df = pd.read_csv(file_path)
 
         # Convert to JSON Tables format and clean NaNs
         table_data = JSONTablesEncoder.from_dataframe(df)
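This extension-based loader now appears in three places in the client (the upload path, this records path, and run_csv_predictions below). A shared helper would capture it once; a sketch (the helper name is hypothetical, not part of the package):

```python
import json

import pandas as pd

def read_tabular(path) -> pd.DataFrame:
    """Load CSV, Parquet, JSON, or JSONL into a DataFrame by file extension."""
    lower = str(path).lower()
    if lower.endswith('.parquet'):
        return pd.read_parquet(path)
    if lower.endswith('.jsonl'):
        # JSONL: one JSON object per line
        with open(path, 'r', encoding='utf-8') as f:
            return pd.DataFrame([json.loads(line) for line in f if line.strip()])
    if lower.endswith('.json'):
        return pd.read_json(path)
    return pd.read_csv(path)
```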
@@ -5939,11 +6554,11 @@ class FeatrixSphereClient:
     def run_csv_predictions(self, session_id: str, csv_file: str, target_column: str = None,
                             sample_size: int = None, remove_target: bool = True) -> Dict[str, Any]:
         """
-        Run predictions on a CSV file with automatic accuracy calculation.
+        Run predictions on a CSV, Parquet, JSON, or JSONL file with automatic accuracy calculation.
 
         Args:
             session_id: ID of session with trained predictor
-            csv_file: Path to CSV file
+            csv_file: Path to CSV, Parquet, JSON, or JSONL file
             target_column: Name of target column (for accuracy calculation)
             sample_size: Number of records to test (None = all records)
             remove_target: Whether to remove target column from prediction input
@@ -5953,8 +6568,24 @@ class FeatrixSphereClient:
         """
         import pandas as pd
 
-        # Load CSV
-        df = pd.read_csv(csv_file)
+        # Load CSV, Parquet, JSON, or JSONL
+        csv_file_lower = csv_file.lower()
+        if csv_file_lower.endswith('.parquet'):
+            df = pd.read_parquet(csv_file)
+        elif csv_file_lower.endswith('.jsonl'):
+            # JSONL: one JSON object per line
+            import json
+            records = []
+            with open(csv_file, 'r', encoding='utf-8') as f:
+                for line in f:
+                    if line.strip():
+                        records.append(json.loads(line))
+            df = pd.DataFrame(records)
+        elif csv_file_lower.endswith('.json'):
+            # Regular JSON
+            df = pd.read_json(csv_file)
+        else:
+            df = pd.read_csv(csv_file)
 
         # Handle target column
         actual_values = None
featrixsphere-0.2.1314.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: featrixsphere
-Version: 0.2.1002
+Version: 0.2.1314
 Summary: Transform any CSV into a production-ready ML model in minutes, not months.
 Home-page: https://github.com/Featrix/sphere
 Author: Featrix
featrixsphere-0.2.1314.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+featrixsphere/__init__.py,sha256=S0XWndX6ycMk8j03X_Dt_GpMuCMG2k0uwzoVJ6EbbnA,1888
+featrixsphere/cli.py,sha256=AW9O3vCvCNJ2UxVGN66eRmeN7XLSiHJlvK6JLZ9UJXc,13358
+featrixsphere/client.py,sha256=c1axFTTB6Hvdu2cWngN0VnkBdU0W0neTDKwzIU-IFXc,380183
+featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
+featrixsphere-0.2.1314.dist-info/METADATA,sha256=gu4_0nXC3gk8GfKB-lDbgrZPhdvTQCRcUEIlIMtWy-I,16232
+featrixsphere-0.2.1314.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+featrixsphere-0.2.1314.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
+featrixsphere-0.2.1314.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
+featrixsphere-0.2.1314.dist-info/RECORD,,
featrixsphere-0.2.1002.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
-featrixsphere/__init__.py,sha256=sevQkmuBmlhyq__FJhZi9d_8ts-PgzPLNDhTX--NoXM,1888
-featrixsphere/cli.py,sha256=AW9O3vCvCNJ2UxVGN66eRmeN7XLSiHJlvK6JLZ9UJXc,13358
-featrixsphere/client.py,sha256=rIq2kawfaffmq20-T1RBsbwibQJg0XRu3z5NVc1vgCw,349355
-featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
-featrixsphere-0.2.1002.dist-info/METADATA,sha256=M1heARJnCtK_p9bVNUZYhHL-RyWRNmYFiUwVMk0PBk8,16232
-featrixsphere-0.2.1002.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-featrixsphere-0.2.1002.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
-featrixsphere-0.2.1002.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
-featrixsphere-0.2.1002.dist-info/RECORD,,