featrixsphere 0.2.1002__tar.gz → 0.2.1206__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. {featrixsphere-0.2.1002/featrixsphere.egg-info → featrixsphere-0.2.1206}/PKG-INFO +1 -1
  2. featrixsphere-0.2.1206/VERSION +1 -0
  3. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/__init__.py +1 -1
  4. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/client.py +527 -38
  5. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206/featrixsphere.egg-info}/PKG-INFO +1 -1
  6. featrixsphere-0.2.1002/VERSION +0 -1
  7. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/MANIFEST.in +0 -0
  8. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/README.md +0 -0
  9. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/cli.py +0 -0
  10. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/test_client.py +0 -0
  11. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/SOURCES.txt +0 -0
  12. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/dependency_links.txt +0 -0
  13. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/entry_points.txt +0 -0
  14. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/not-zip-safe +0 -0
  15. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/requires.txt +0 -0
  16. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/top_level.txt +0 -0
  17. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/requirements.txt +0 -0
  18. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/setup.cfg +0 -0
  19. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/setup.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: featrixsphere
- Version: 0.2.1002
+ Version: 0.2.1206
  Summary: Transform any CSV into a production-ready ML model in minutes, not months.
  Home-page: https://github.com/Featrix/sphere
  Author: Featrix
@@ -0,0 +1 @@
+ 0.2.1206
@@ -38,7 +38,7 @@ Example:
  ... labels=['Experiment A', 'Experiment B'])
  """

- __version__ = "0.2.1002"
+ __version__ = "0.2.1206"
  __author__ = "Featrix"
  __email__ = "support@featrix.com"
  __license__ = "MIT"
@@ -660,6 +660,77 @@ class FeatrixSphereClient:
  )
  return response.json()

+ def publish_session(self, session_id: str) -> Dict[str, Any]:
+ """
+ Publish a session by moving it to /sphere/published/<sessionId>.
+ Moves both the session file and output directory.
+
+ Args:
+ session_id: Session ID to publish
+
+ Returns:
+ Response with published_path, output_path, and status
+
+ Example:
+ ```python
+ result = client.publish_session("abc123")
+ print(f"Published to: {result['published_path']}")
+ ```
+ """
+ response_data = self._post_json(f"/compute/session/{session_id}/publish", {})
+ return response_data
+
+ def deprecate_session(self, session_id: str, warning_message: str, expiration_date: str) -> Dict[str, Any]:
+ """
+ Deprecate a published session with a warning message and expiration date.
+ The session remains available until the expiration date.
+
+ Args:
+ session_id: Session ID to deprecate
+ warning_message: Warning message to display about deprecation
+ expiration_date: ISO format date string when session will be removed (e.g., "2025-12-31T23:59:59Z")
+
+ Returns:
+ Response with deprecation status
+
+ Example:
+ ```python
+ from datetime import datetime, timedelta
+
+ expiration = (datetime.now() + timedelta(days=90)).isoformat() + "Z"
+ result = client.deprecate_session(
+ session_id="abc123",
+ warning_message="This session will be removed on 2025-12-31",
+ expiration_date=expiration
+ )
+ ```
+ """
+ data = {
+ "warning_message": warning_message,
+ "expiration_date": expiration_date
+ }
+ response_data = self._post_json(f"/compute/session/{session_id}/deprecate", data)
+ return response_data
+
+ def unpublish_session(self, session_id: str) -> Dict[str, Any]:
+ """
+ Unpublish a session by moving it back from /sphere/published/<sessionId>.
+
+ Args:
+ session_id: Session ID to unpublish
+
+ Returns:
+ Response with unpublish status
+
+ Example:
+ ```python
+ result = client.unpublish_session("abc123")
+ print(f"Status: {result['status']}")
+ ```
+ """
+ response_data = self._post_json(f"/compute/session/{session_id}/unpublish", {})
+ return response_data
+
  def get_sessions_for_org(self, name_prefix: str, max_retries: int = None) -> Dict[str, Any]:
  """
  Get all sessions matching a name prefix across all compute nodes.
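Note: taken together, the three new endpoints above form a publish lifecycle: publish, optionally deprecate with a grace period, then unpublish. A minimal sketch of driving it end to end, assuming the API endpoint is reachable and that `FeatrixSphereClient` imports from `featrixsphere.client`:

```python
from featrixsphere.client import FeatrixSphereClient

client = FeatrixSphereClient("https://sphere-api.featrix.com")

# Move a finished session into /sphere/published/<sessionId>.
published = client.publish_session("abc123")
print(f"Published to: {published['published_path']}")

# Warn consumers it is going away; it stays live until the expiration date.
client.deprecate_session(
    session_id="abc123",
    warning_message="Superseded by a newer session; removed after 2025-12-31.",
    expiration_date="2025-12-31T23:59:59Z",
)

# Or pull it back out of the published tree entirely.
client.unpublish_session("abc123")
```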
@@ -1424,16 +1495,141 @@ class FeatrixSphereClient:
  job_queue_positions={}
  )

+ def fine_tune_embedding_space(
+ self,
+ name: str,
+ parent_session_id: str = None,
+ parent_embedding_space_path: str = None,
+ s3_training_dataset: str = None,
+ s3_validation_dataset: str = None,
+ webhooks: Dict[str, str] = None
+ ) -> SessionInfo:
+ """
+ Fine-tune an existing embedding space on new data.
+
+ This method takes a pre-trained embedding space (the "parent") and fine-tunes it
+ on a new dataset with the same columns. The number of training epochs is automatically
+ calculated based on the dataset size ratio to ensure optimal training.
+
+ **How Epoch Calculation Works:**
+ - The system calculates F = len(new_dataset) / len(old_dataset)
+ - New epochs = original_epochs / F
+ - If new dataset is smaller (F < 1), more epochs are used (to see data enough times)
+ - If new dataset is larger (F > 1), fewer epochs are used (less repetition needed)
+
+ **Example:**
+ - Original: 1000 rows, trained for 100 epochs
+ - New: 500 rows → F = 0.5 → 100/0.5 = 200 epochs
+ - New: 2000 rows → F = 2.0 → 100/2.0 = 50 epochs
+
+ This ensures the model sees the new data an appropriate number of times relative
+ to how much it saw the original data.
+
+ Args:
+ name: Name for the fine-tuned embedding space
+ parent_session_id: Session ID of the parent embedding space (optional)
+ parent_embedding_space_path: Direct path to parent embedding space pickle file (optional)
+ s3_training_dataset: S3 URL for new training dataset (must start with 's3://')
+ s3_validation_dataset: S3 URL for new validation dataset (must start with 's3://')
+ webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
+
+ Returns:
+ SessionInfo for the newly created fine-tuning session
+
+ Raises:
+ ValueError: If S3 URLs are invalid or neither parent identifier is provided
+
+ Example:
+ ```python
+ # Fine-tune an existing embedding space on new data
+ client = FeatrixSphereClient("https://sphere-api.featrix.com")
+
+ # Option 1: Use parent session ID
+ fine_tuned = client.fine_tune_embedding_space(
+ name="customer_behavior_v2",
+ parent_session_id="abc123-20240101-120000",
+ s3_training_dataset="s3://my-bucket/new_training_data.csv",
+ s3_validation_dataset="s3://my-bucket/new_validation_data.csv"
+ )
+
+ # Option 2: Use direct path to parent embedding space
+ fine_tuned = client.fine_tune_embedding_space(
+ name="customer_behavior_v2",
+ parent_embedding_space_path="/path/to/parent/embedded_space.pickle",
+ s3_training_dataset="s3://my-bucket/new_training_data.csv",
+ s3_validation_dataset="s3://my-bucket/new_validation_data.csv"
+ )
+
+ # Wait for fine-tuning to complete
+ client.wait_for_session_completion(fine_tuned.session_id)
+
+ # Use the fine-tuned model for predictions
+ result = client.predict(fine_tuned.session_id, {"feature1": "value1"})
+ ```
+ """
+ # Validate S3 URLs
+ if s3_training_dataset and not s3_training_dataset.startswith('s3://'):
+ raise ValueError("s3_training_dataset must be a valid S3 URL (s3://...)")
+ if s3_validation_dataset and not s3_validation_dataset.startswith('s3://'):
+ raise ValueError("s3_validation_dataset must be a valid S3 URL (s3://...)")
+
+ # Validate that we have either parent_session_id or parent_embedding_space_path
+ if not parent_session_id and not parent_embedding_space_path:
+ raise ValueError("Either parent_session_id or parent_embedding_space_path must be provided")
+
+ print(f"Fine-tuning embedding space '{name}'...")
+ if parent_session_id:
+ print(f" Parent session: {parent_session_id}")
+ if parent_embedding_space_path:
+ print(f" Parent embedding space: {parent_embedding_space_path}")
+ print(f" New training data: {s3_training_dataset}")
+ print(f" New validation data: {s3_validation_dataset}")
+
+ data = {
+ "name": name,
+ "s3_file_data_set_training": s3_training_dataset,
+ "s3_file_data_set_validation": s3_validation_dataset
+ }
+
+ if parent_session_id:
+ data["parent_session_id"] = parent_session_id
+ if parent_embedding_space_path:
+ data["parent_embedding_space_path"] = parent_embedding_space_path
+
+ if webhooks:
+ data['webhooks'] = webhooks
+
+ response_data = self._post_json("/compute/fine-tune-embedding-space", data)
+
+ session_id = response_data.get('session_id')
+ fine_tune_info = response_data.get('fine_tune_info', {})
+
+ print(f"Fine-tuning session created: {session_id}")
+ if fine_tune_info:
+ print(f" Original dataset: {fine_tune_info.get('original_train_size', 'N/A')} rows")
+ print(f" New dataset: {fine_tune_info.get('new_total_size', 'N/A')} rows")
+ print(f" Dataset ratio (F): {fine_tune_info.get('F', 'N/A'):.4f}")
+ print(f" Original epochs: {fine_tune_info.get('original_epochs', 'N/A')}")
+ print(f" Calculated epochs: {fine_tune_info.get('calculated_epochs', 'N/A')}")
+
+ return SessionInfo(
+ session_id=session_id,
+ session_type=response_data.get('session_type', 'embedding_space_finetune'),
+ status=response_data.get('status', 'ready'),
+ jobs={},
+ job_queue_positions={}
+ )
+
  # =========================================================================
  # File Upload
  # =========================================================================

  def upload_file_and_create_session(self, file_path: Path, session_name_prefix: str = None, name: str = None, webhooks: Dict[str, str] = None) -> SessionInfo:
  """
- Upload a CSV file and create a new session.
+ Upload a CSV, Parquet, JSON, or JSONL file and create a new session.

  Args:
- file_path: Path to the CSV file to upload
+ file_path: Path to the CSV, Parquet, JSON, or JSONL file to upload
  session_name_prefix: Optional prefix for the session ID. Session will be named <prefix>-<full-uuid>
  name: Optional name for the embedding space/model (for identification and metadata)
  webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
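Note: the epoch rule documented in `fine_tune_embedding_space` above is plain arithmetic. A sketch of it, assuming integer rounding and a floor of one epoch (the diff does not show how the server rounds):

```python
def fine_tune_epochs(original_rows: int, original_epochs: int, new_rows: int) -> int:
    # Documented rule: F = len(new_dataset) / len(old_dataset); new epochs = original / F.
    F = new_rows / original_rows
    return max(1, round(original_epochs / F))

# Reproduces the docstring's worked examples:
assert fine_tune_epochs(1000, 100, 500) == 200   # smaller dataset -> more epochs
assert fine_tune_epochs(1000, 100, 2000) == 50   # larger dataset -> fewer epochs
```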
@@ -1491,12 +1687,13 @@ class FeatrixSphereClient:
  string_list_delimiter: str = "|",
  important_columns_for_visualization: List[str] = None,
  metadata: Dict[str, Any] = None,
+ user_metadata: Dict[str, Any] = None, # User metadata for ES/SP identification (max 32KB)
  session_name_prefix: str = None,
  name: str = None,
  webhooks: Dict[str, str] = None,
  epochs: int = None) -> SessionInfo:
  """
- Upload a pandas DataFrame or CSV file and create a new session.
+ Upload a pandas DataFrame, CSV file, Parquet file, JSON file, or JSONL file and create a new session.

  Special Column: __featrix_train_predictor
  ------------------------------------------
@@ -1504,7 +1701,7 @@ class FeatrixSphereClient:
  which rows are used for single predictor training.

  How it works:
- - Add a boolean column "__featrix_train_predictor" to your DataFrame/CSV before upload
+ - Add a boolean column "__featrix_train_predictor" to your DataFrame/CSV/Parquet/JSON/JSONL before upload
  - Set it to True for rows you want to use for predictor training
  - Set it to False (or any other value) for rows to exclude from predictor training
  - Embedding space training uses ALL rows (ignores this column)
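Note: a short sketch of the special column in practice, assuming a pandas DataFrame with a hypothetical `split` column and the `upload_df_and_create_session` method that appears later in this diff:

```python
import pandas as pd

df = pd.read_csv("loans.csv")

# Embedding space training uses every row; only rows flagged True here
# are used for single predictor training.
df["__featrix_train_predictor"] = df["split"] == "train"  # 'split' is illustrative

session = client.upload_df_and_create_session(df=df, name="loans_v1")
```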
@@ -1538,7 +1735,7 @@ class FeatrixSphereClient:
  Args:
  df: pandas DataFrame to upload (optional if file_path is provided)
  filename: Name to give the uploaded file (default: "data.csv")
- file_path: Path to CSV file to upload (optional if df is provided)
+ file_path: Path to CSV, Parquet, JSON, or JSONL file to upload (optional if df is provided)
  column_overrides: Dict mapping column names to types ("scalar", "set", "free_string", "free_string_list")
  column_types: Alias for column_overrides (for backward compatibility)
  string_list_delimiter: Delimiter for free_string_list columns (default: "|")
@@ -1579,21 +1776,90 @@ class FeatrixSphereClient:
  if not os.path.exists(file_path):
  raise FileNotFoundError(f"File not found: {file_path}")

- # Check if it's a CSV file
- if not file_path.lower().endswith(('.csv', '.csv.gz')):
- raise ValueError("File must be a CSV file (with .csv or .csv.gz extension)")
+ # Check if it's a supported file type
+ file_ext = file_path.lower()
+ if not file_ext.endswith(('.csv', '.csv.gz', '.parquet', '.json', '.jsonl')):
+ raise ValueError("File must be a CSV, Parquet, JSON, or JSONL file (with .csv, .csv.gz, .parquet, .json, or .jsonl extension)")

  print(f"Uploading file: {file_path}")

  # Read the file content
  if file_path.endswith('.gz'):
- # Already gzipped
+ # Already gzipped CSV
  with gzip.open(file_path, 'rb') as f:
  file_content = f.read()
  upload_filename = os.path.basename(file_path)
  content_type = 'application/gzip'
+ elif file_path.lower().endswith(('.json', '.jsonl')):
+ # JSON/JSONL file - read as DataFrame, convert to CSV, then compress
+ print(f"Reading {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} file...")
+ try:
+ from featrix.neural.input_data_file import featrix_wrap_read_json_file
+ json_df = featrix_wrap_read_json_file(file_path)
+ if json_df is None:
+ raise ValueError(f"Failed to parse {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} file")
+ except ImportError:
+ # Fallback to pandas if featrix wrapper not available
+ if file_path.lower().endswith('.jsonl'):
+ # JSONL: one JSON object per line
+ import json
+ records = []
+ with open(file_path, 'r', encoding='utf-8') as f:
+ for line in f:
+ if line.strip():
+ records.append(json.loads(line))
+ json_df = pd.DataFrame(records)
+ else:
+ # Regular JSON
+ json_df = pd.read_json(file_path)
+
+ # Clean NaN values before CSV conversion
+ cleaned_df = json_df.where(pd.notna(json_df), None)
+
+ # Convert to CSV and compress
+ csv_buffer = io.StringIO()
+ cleaned_df.to_csv(csv_buffer, index=False)
+ csv_data = csv_buffer.getvalue().encode('utf-8')
+
+ print(f"Compressing {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} (converted to CSV)...")
+ compressed_buffer = io.BytesIO()
+ with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+ gz.write(csv_data)
+ file_content = compressed_buffer.getvalue()
+ upload_filename = os.path.basename(file_path).replace('.jsonl', '.csv.gz').replace('.json', '.csv.gz')
+ content_type = 'application/gzip'
+
+ original_size = len(csv_data)
+ compressed_size = len(file_content)
+ compression_ratio = (1 - compressed_size / original_size) * 100
+ print(f"Converted {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
+ elif file_path.lower().endswith('.parquet'):
+ # Parquet file - read as DataFrame, convert to CSV, then compress
+ print("Reading Parquet file...")
+ parquet_df = pd.read_parquet(file_path)
+
+ # Clean NaN values before CSV conversion
+ cleaned_df = parquet_df.where(pd.notna(parquet_df), None)
+
+ # Convert to CSV and compress
+ csv_buffer = io.StringIO()
+ cleaned_df.to_csv(csv_buffer, index=False)
+ csv_data = csv_buffer.getvalue().encode('utf-8')
+
+ print("Compressing Parquet (converted to CSV)...")
+ compressed_buffer = io.BytesIO()
+ with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+ gz.write(csv_data)
+ file_content = compressed_buffer.getvalue()
+ upload_filename = os.path.basename(file_path).replace('.parquet', '.csv.gz')
+ content_type = 'application/gzip'
+
+ original_size = len(csv_data)
+ compressed_size = len(file_content)
+ compression_ratio = (1 - compressed_size / original_size) * 100
+ print(f"Converted Parquet to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
  else:
- # Read CSV and compress it
+ # Regular CSV file - read and compress it
  with open(file_path, 'rb') as f:
  csv_content = f.read()

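Note: because Parquet, JSON, and JSONL inputs are converted client-side to gzipped CSV before upload, callers pass them exactly like a CSV path. A sketch, assuming a local Parquet file:

```python
# The client reads events.parquet, converts it to events.csv.gz, and uploads that.
session = client.upload_df_and_create_session(
    file_path="events.parquet",
    name="events_v1",
)
```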
@@ -1663,6 +1929,10 @@ class FeatrixSphereClient:
  import json
  data['metadata'] = json.dumps(metadata)
  print(f"Session metadata: {metadata}")
+ if user_metadata:
+ import json
+ data['user_metadata'] = json.dumps(user_metadata)
+ print(f"User metadata: {user_metadata}")
  if session_name_prefix:
  data['session_name_prefix'] = session_name_prefix
  print(f"Session name prefix: {session_name_prefix}")
@@ -3239,6 +3509,24 @@ class FeatrixSphereClient:
  response_data = self._delete_json(f"/session/{session_id}/predictor", params=params, max_retries=max_retries)
  return response_data

+ def mark_for_deletion(self, session_id: str, max_retries: int = None) -> Dict[str, Any]:
+ """
+ Mark a session for deletion. The session will be deleted by the garbage collection process.
+
+ Args:
+ session_id: Session ID to mark for deletion
+ max_retries: Number of retries for errors (default: uses client default)
+
+ Returns:
+ Dictionary with confirmation that the session was marked for deletion
+
+ Example:
+ result = client.mark_for_deletion("session_123")
+ print(result) # {"status": "marked", "session_id": "session_123"}
+ """
+ response_data = self._post_json(f"/compute/session/{session_id}/mark_for_deletion", max_retries=max_retries)
+ return response_data
+

  def _create_interactive_training_movie(self, training_metrics, epoch_projections, session_id,
  show_embedding_evolution, show_loss_evolution):
@@ -3723,7 +4011,7 @@ class FeatrixSphereClient:
  name: str = None,
  session_name_prefix: str = None,
  epochs: int = 0, batch_size: int = 0, learning_rate: float = 0.001,
- positive_label: str = None,
+ rare_label_value: str = None,
  class_imbalance: dict = None,
  optimize_for: str = "balanced",
  poll_interval: int = 30, max_poll_time: int = 3600,
@@ -3746,7 +4034,7 @@ class FeatrixSphereClient:
  epochs: Number of training epochs (default: 0; automatic)
  batch_size: Training batch size (default: 0; automatic)
  learning_rate: Learning rate for training (default: 0.001)
- positive_label: For binary classification, which class is "positive" for metrics (default: None)
+ rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
  class_imbalance: Expected class ratios/counts from real world for sampled data (default: None)
  optimize_for: Optimization target - "balanced" (F1 score), "precision", or "recall" (default: "balanced")
  poll_interval: Seconds between status checks when job is already running (default: 30)
@@ -3787,8 +4075,8 @@ class FeatrixSphereClient:
  data["name"] = name
  if session_name_prefix:
  data["session_name_prefix"] = session_name_prefix
- if positive_label:
- data["positive_label"] = positive_label
+ if rare_label_value:
+ data["rare_label_value"] = rare_label_value
  if class_imbalance:
  data["class_imbalance"] = class_imbalance
  if webhooks:
@@ -3821,9 +4109,11 @@ class FeatrixSphereClient:
  df = None,
  epochs: int = 0,
  validation_ignore_columns: List[str] = None,
- positive_label: str = None,
+ rare_label_value: str = None,
  class_imbalance: dict = None,
  optimize_for: str = "balanced",
+ cost_false_positive: float = None,
+ cost_false_negative: float = None,
  poll_interval: int = 30, max_poll_time: int = 3600,
  verbose: bool = True,
  webhooks: Dict[str, str] = None) -> Dict[str, Any]:
@@ -4036,14 +4326,14 @@ class FeatrixSphereClient:

  If not provided, class weights are computed from your training data distribution.

- Understanding positive_label:
+ Understanding rare_label_value:
  -----------------------------
- For binary classification, positive_label specifies which class is considered the
- "positive" class for computing metrics like precision, recall, and ROC-AUC.
+ For binary classification, rare_label_value specifies which class is the rare/minority
+ class for computing metrics like precision, recall, and ROC-AUC.

  Example: For a credit risk model predicting "good" vs "bad" loans:

- positive_label="bad" # We want to detect bad loans
+ rare_label_value="bad" # "bad" is the rare class we want to detect

  This affects how metrics are reported:
  - Precision = True Positives / (True Positives + False Positives)
@@ -4124,7 +4414,7 @@ class FeatrixSphereClient:
  session_id=session.session_id,
  target_column='approved',
  target_column_type='set',
- positive_label='yes'
+ rare_label_value='yes'
  )
  ```

@@ -4140,7 +4430,7 @@ class FeatrixSphereClient:
  target_column_type='set',
  class_imbalance={'approved': 0.97, 'rejected': 0.03},
  optimize_for='recall', # Don't miss rejections
- positive_label='rejected'
+ rare_label_value='rejected'
  )

  # System will:
@@ -4159,7 +4449,7 @@ class FeatrixSphereClient:
  session_id=session.session_id,
  target_column='is_fraud',
  target_column_type='set',
- positive_label='fraud',
+ rare_label_value='fraud',
  optimize_for='precision', # Minimize false alarms
  class_imbalance={'legitimate': 0.999, 'fraud': 0.001}
  )
@@ -4174,7 +4464,7 @@ class FeatrixSphereClient:
  session_id=session.session_id,
  target_column='has_disease',
  target_column_type='set',
- positive_label='positive',
+ rare_label_value='positive',
  optimize_for='recall' # Don't miss any cases
  )
  ```
@@ -4189,7 +4479,7 @@ class FeatrixSphereClient:
  target_column='churn',
  target_column_type='set',
  validation_ignore_columns=['customer_id', 'signup_date'],
- positive_label='churned'
+ rare_label_value='churned'
  )
  ```

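Note: what `rare_label_value` changes is which class occupies the "positive" slot when precision and recall are computed. A self-contained sketch of that effect (an illustration of the standard definitions, not the server's metric code):

```python
def precision_recall(y_true, y_pred, rare_label_value):
    # Treat rare_label_value as the positive class in the confusion matrix.
    tp = sum(t == rare_label_value and p == rare_label_value for t, p in zip(y_true, y_pred))
    fp = sum(t != rare_label_value and p == rare_label_value for t, p in zip(y_true, y_pred))
    fn = sum(t == rare_label_value and p != rare_label_value for t, p in zip(y_true, y_pred))
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return precision, recall

y_true = ["good", "bad", "bad", "good", "bad"]
y_pred = ["good", "bad", "good", "good", "bad"]
print(precision_recall(y_true, y_pred, "bad"))  # (1.0, 0.666...): 'bad' is positive
```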
@@ -4215,9 +4505,16 @@ class FeatrixSphereClient:
  embedding space! If neither provided, uses session's original data file.
  epochs: Number of training epochs (default: 0; automatic)
  validation_ignore_columns: List of column names to exclude from validation queries (default: None)
- positive_label: For binary classification, which class is "positive" for metrics (default: None)
+ rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
  class_imbalance: Expected class ratios/counts from real world for sampled data (default: None)
- optimize_for: Optimization target - "balanced" (F1 score), "precision", or "recall" (default: "balanced")
+ optimize_for: Optimization target - "balanced" (F1 score), "precision", or "recall" (default: "balanced").
+ Ignored if cost_false_positive and cost_false_negative are provided.
+ cost_false_positive: Cost of a false positive (predicting positive when actually negative).
+ Must be specified together with cost_false_negative. Only valid for target_column_type="set".
+ When provided, overrides optimize_for and uses cost-based optimization.
+ cost_false_negative: Cost of a false negative (predicting negative when actually positive).
+ Must be specified together with cost_false_positive. Only valid for target_column_type="set".
+ When provided, overrides optimize_for and uses cost-based optimization.
  poll_interval: Seconds between status checks when job is already running (default: 30)
  max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
  verbose: Whether to print status updates during polling (default: True)
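Note: a sketch of a call using the new cost parameters. The surrounding method's `def` line falls outside this hunk, so the name `train_single_predictor` is inferred from the `_train_single_predictor_with_file` helper shown later; the dollar figures are illustrative, and only their relative size drives the trade-off:

```python
result = client.train_single_predictor(   # method name inferred; def line not shown in this diff
    session_id=session.session_id,
    target_column="is_fraud",
    target_column_type="set",
    rare_label_value="fraud",
    cost_false_positive=1.0,    # e.g., cost of one analyst review
    cost_false_negative=200.0,  # e.g., cost of one missed fraud
)
```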
@@ -4234,6 +4531,18 @@ class FeatrixSphereClient:
  if file_path and df is not None:
  raise ValueError("Provide either file_path or df, not both")

+ # Validate cost parameters
+ if cost_false_positive is not None or cost_false_negative is not None:
+ if cost_false_positive is None or cost_false_negative is None:
+ raise ValueError("Both cost_false_positive and cost_false_negative must be specified together")
+ if target_column_type != "set":
+ raise ValueError("cost_false_positive and cost_false_negative are only valid for target_column_type='set' (classification), not 'scalar' (regression)")
+ if cost_false_positive <= 0 or cost_false_negative <= 0:
+ raise ValueError("cost_false_positive and cost_false_negative must be positive numbers")
+ if verbose:
+ print(f"💰 Cost-based optimization enabled: FP cost={cost_false_positive}, FN cost={cost_false_negative}")
+ print(f" (optimize_for='{optimize_for}' will be ignored)")
+
  # If DataFrame provided, save to temp file and use file_path logic
  temp_file = None
  if df is not None:
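Note: conceptually, supplying both costs swaps the F1/precision/recall target for an expected-cost objective. A standalone sketch of threshold selection under asymmetric costs, offered to illustrate the idea only; the diff does not show how the server implements it:

```python
def pick_threshold(scores, labels, rare_label_value, cost_fp, cost_fn):
    # Scan cutoffs and keep the one with the lowest total misclassification cost.
    best_t, best_cost = 0.5, float("inf")
    for t in (i / 100 for i in range(1, 100)):
        fp = sum(s >= t and y != rare_label_value for s, y in zip(scores, labels))
        fn = sum(s < t and y == rare_label_value for s, y in zip(scores, labels))
        cost = fp * cost_fp + fn * cost_fn
        if cost < best_cost:
            best_t, best_cost = t, cost
    return best_t

# A high FN cost keeps the cutoff low, trading false alarms for fewer misses.
print(pick_threshold([0.2, 0.4, 0.9], ["ok", "fraud", "fraud"], "fraud", 1.0, 200.0))
```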
@@ -4264,9 +4573,11 @@ class FeatrixSphereClient:
  target_column=target_column,
  target_column_type=target_column_type,
  epochs=epochs,
- positive_label=positive_label,
+ rare_label_value=rare_label_value,
  class_imbalance=class_imbalance,
  optimize_for=optimize_for,
+ cost_false_positive=cost_false_positive,
+ cost_false_negative=cost_false_negative,
  verbose=verbose,
  webhooks=webhooks
  )
@@ -4277,10 +4588,13 @@ class FeatrixSphereClient:
  "target_column_type": target_column_type,
  "epochs": epochs,
  "validation_ignore_columns": validation_ignore_columns or [],
- "positive_label": positive_label,
+ "rare_label_value": rare_label_value,
  "class_imbalance": class_imbalance,
  "optimize_for": optimize_for
  }
+ if cost_false_positive is not None and cost_false_negative is not None:
+ data["cost_false_positive"] = cost_false_positive
+ data["cost_false_negative"] = cost_false_negative
  if webhooks:
  data['webhooks'] = webhooks

@@ -4579,7 +4893,7 @@ class FeatrixSphereClient:
  predictor_id: str = None, target_column: str = None,
  batch_size: int = 0, learning_rate: float = None,
  poll_interval: int = 30, max_poll_time: int = 3600,
- verbose: bool = True) -> Dict[str, Any]:
+ verbose: bool = True, webhooks: Dict[str, str] = None) -> Dict[str, Any]:
  """
  Continue training an existing single predictor for more epochs.
  Loads the existing predictor and resumes training from where it left off.
@@ -4594,6 +4908,7 @@ class FeatrixSphereClient:
  poll_interval: Seconds between status checks (default: 30)
  max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
  verbose: Whether to print status updates (default: True)
+ webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)

  Returns:
  Response with continuation start confirmation or completion status
@@ -4625,6 +4940,8 @@ class FeatrixSphereClient:
  data["target_column"] = target_column
  if learning_rate is not None:
  data["learning_rate"] = learning_rate
+ if webhooks:
+ data["webhooks"] = webhooks

  if verbose:
  print(f"🔄 Continuing training for predictor on session {session_id}")
@@ -4714,6 +5031,139 @@ class FeatrixSphereClient:
  print(f"❌ Error starting predictor continuation: {e}")
  raise

+ def foundation_model_train_more(self, session_id: str, es_id: str = None, data_passes: int = None,
+ epochs: int = None, poll_interval: int = 30, max_poll_time: int = 3600,
+ verbose: bool = True, webhooks: Dict[str, str] = None) -> Dict[str, Any]:
+ """
+ Continue training an existing foundation model (embedding space) for more epochs.
+ Loads the existing embedding space and resumes training from where it left off.
+
+ Args:
+ session_id: Session ID containing the trained foundation model
+ es_id: Embedding space ID (optional, uses session's ES if not provided)
+ data_passes: Additional epochs to train (preferred, default: 50)
+ epochs: Additional epochs to train (deprecated, use data_passes instead, for compatibility)
+ poll_interval: Seconds between status checks (default: 30)
+ max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
+ verbose: Whether to print status updates (default: True)
+ webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
+
+ Returns:
+ Response with continuation start confirmation or completion status
+
+ Example:
+ ```python
+ # Continue training for 50 more epochs
+ result = client.foundation_model_train_more(
+ session_id="abc123",
+ data_passes=50
+ )
+ ```
+ """
+ # Support both data_passes and epochs for compatibility
+ if data_passes is None and epochs is None:
+ data_passes = 50 # Default
+ elif data_passes is None:
+ data_passes = epochs # Use epochs if data_passes not provided
+ # If both provided, data_passes takes precedence
+
+ if data_passes <= 0:
+ raise ValueError("data_passes (or epochs) must be > 0 (specify additional epochs to train)")
+
+ data = {
+ "data_passes": data_passes,
+ }
+
+ if es_id:
+ data["es_id"] = es_id
+ if webhooks:
+ data["webhooks"] = webhooks
+
+ if verbose:
+ print(f"🔄 Continuing training for foundation model on session {session_id}")
+ print(f" Additional epochs: {data_passes}")
+ if es_id:
+ print(f" ES ID: {es_id}")
+
+ try:
+ response_data = self._post_json(f"/compute/session/{session_id}/train_foundation_model_more", data)
+
+ if verbose:
+ print(f"✅ Foundation model continuation started: {response_data.get('message')}")
+
+ # Poll for completion if requested
+ if poll_interval > 0 and max_poll_time > 0:
+ import time
+ start_time = time.time()
+ last_status = ""
+
+ while time.time() - start_time < max_poll_time:
+ try:
+ session_info = self.get_session_status(session_id)
+ jobs = session_info.jobs if hasattr(session_info, 'jobs') else {}
+
+ # Find continuation jobs
+ es_jobs = {j_id: j for j_id, j in jobs.items()
+ if j.get('type') == 'train_es'}
+
+ if not es_jobs:
+ if verbose:
+ print("✅ No continuation jobs found - training may have completed")
+ break
+
+ # Check job statuses
+ running_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'running']
+ completed_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'done']
+ failed_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'failed']
+
+ current_status = f"Running: {len(running_jobs)}, Done: {len(completed_jobs)}, Failed: {len(failed_jobs)}"
+ if current_status != last_status and verbose:
+ print(f"📊 Status: {current_status}")
+ last_status = current_status
+
+ if not running_jobs and (completed_jobs or failed_jobs):
+ if completed_jobs:
+ if verbose:
+ print(f"✅ Foundation model continuation completed successfully!")
+ return {
+ "message": "Foundation model continuation completed successfully",
+ "session_id": session_id,
+ "status": "completed",
+ "additional_epochs": data_passes
+ }
+ else:
+ if verbose:
+ print(f"❌ Foundation model continuation failed")
+ return {
+ "message": "Foundation model continuation failed",
+ "session_id": session_id,
+ "status": "failed",
+ "failed_jobs": failed_jobs
+ }
+
+ time.sleep(poll_interval)
+ except Exception as poll_error:
+ if verbose:
+ print(f"⚠️ Error during polling: {poll_error}")
+ time.sleep(poll_interval)
+
+ # Timeout
+ if verbose:
+ print(f"⏱️ Polling timeout reached ({max_poll_time}s)")
+ return {
+ "message": "Polling timeout",
+ "session_id": session_id,
+ "status": "timeout",
+ "additional_epochs": data_passes
+ }
+
+ return response_data
+
+ except Exception as e:
+ if verbose:
+ print(f"❌ Error starting foundation model continuation: {e}")
+ raise
+
  def _train_single_predictor_with_file(
  self,
  session_id: str,
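Note: since `foundation_model_train_more` only polls while both `poll_interval > 0` and `max_poll_time > 0`, zeroing either turns the call into fire-and-forget. A sketch:

```python
# Kick off 100 more passes and return immediately; observe completion via
# webhooks or a later get_session_status(session_id) call.
resp = client.foundation_model_train_more(
    session_id="abc123",
    data_passes=100,
    poll_interval=0,
)
print(resp.get("message"))
```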
@@ -4721,10 +5171,12 @@ class FeatrixSphereClient:
  target_column: str,
  target_column_type: str,
  epochs: int,
- positive_label: str,
+ rare_label_value: str,
  class_imbalance: dict,
  optimize_for: str,
- verbose: bool,
+ cost_false_positive: float = None,
+ cost_false_negative: float = None,
+ verbose: bool = True,
  webhooks: Dict[str, str] = None
  ) -> Dict[str, Any]:
  """
@@ -4753,12 +5205,16 @@ class FeatrixSphereClient:
  'optimize_for': optimize_for,
  }

- if positive_label:
- data['positive_label'] = positive_label
+ if rare_label_value:
+ data['rare_label_value'] = rare_label_value

  if class_imbalance:
  data['class_imbalance'] = json.dumps(class_imbalance)

+ if cost_false_positive is not None and cost_false_negative is not None:
+ data['cost_false_positive'] = str(cost_false_positive)
+ data['cost_false_negative'] = str(cost_false_negative)
+
  if webhooks:
  data['webhooks'] = json.dumps(webhooks)

@@ -5785,7 +6241,24 @@ class FeatrixSphereClient:
  if not file_path.exists():
  raise FileNotFoundError(f"File not found: {file_path}")

- df = pd.read_csv(file_path)
+ # Support CSV, Parquet, JSON, and JSONL files
+ file_path_str = str(file_path).lower()
+ if file_path_str.endswith('.parquet'):
+ df = pd.read_parquet(file_path)
+ elif file_path_str.endswith('.jsonl'):
+ # JSONL: one JSON object per line
+ import json
+ records = []
+ with open(file_path, 'r', encoding='utf-8') as f:
+ for line in f:
+ if line.strip():
+ records.append(json.loads(line))
+ df = pd.DataFrame(records)
+ elif file_path_str.endswith('.json'):
+ # Regular JSON
+ df = pd.read_json(file_path)
+ else:
+ df = pd.read_csv(file_path)

  # Convert to JSON Tables format and clean NaNs
  table_data = JSONTablesEncoder.from_dataframe(df)
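Note: the same four-way dispatch now appears at several call sites in the client. A standalone sketch of the pandas fallback path for reference (`featrix_wrap_read_json_file`, used elsewhere when importable, is omitted here):

```python
import json
import pandas as pd

def read_any(path: str) -> pd.DataFrame:
    # Mirror the client's extension-based dispatch for CSV/Parquet/JSON/JSONL.
    lower = path.lower()
    if lower.endswith(".parquet"):
        return pd.read_parquet(path)
    if lower.endswith(".jsonl"):
        # JSONL: one JSON object per line, blank lines skipped
        with open(path, "r", encoding="utf-8") as f:
            return pd.DataFrame([json.loads(line) for line in f if line.strip()])
    if lower.endswith(".json"):
        return pd.read_json(path)
    return pd.read_csv(path)
```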
@@ -5939,11 +6412,11 @@ class FeatrixSphereClient:
  def run_csv_predictions(self, session_id: str, csv_file: str, target_column: str = None,
  sample_size: int = None, remove_target: bool = True) -> Dict[str, Any]:
  """
- Run predictions on a CSV file with automatic accuracy calculation.
+ Run predictions on a CSV, Parquet, JSON, or JSONL file with automatic accuracy calculation.

  Args:
  session_id: ID of session with trained predictor
- csv_file: Path to CSV file
+ csv_file: Path to CSV, Parquet, JSON, or JSONL file
  target_column: Name of target column (for accuracy calculation)
  sample_size: Number of records to test (None = all records)
  remove_target: Whether to remove target column from prediction input
@@ -5953,8 +6426,24 @@ class FeatrixSphereClient:
  """
  import pandas as pd

- # Load CSV
- df = pd.read_csv(csv_file)
+ # Load CSV, Parquet, JSON, or JSONL
+ csv_file_lower = csv_file.lower()
+ if csv_file_lower.endswith('.parquet'):
+ df = pd.read_parquet(csv_file)
+ elif csv_file_lower.endswith('.jsonl'):
+ # JSONL: one JSON object per line
+ import json
+ records = []
+ with open(csv_file, 'r', encoding='utf-8') as f:
+ for line in f:
+ if line.strip():
+ records.append(json.loads(line))
+ df = pd.DataFrame(records)
+ elif csv_file_lower.endswith('.json'):
+ # Regular JSON
+ df = pd.read_json(csv_file)
+ else:
+ df = pd.read_csv(csv_file)

  # Handle target column
  actual_values = None
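Note: despite the `csv_file` parameter name, any of the four formats now loads the same way. A sketch with a JSONL holdout set and hypothetical column names:

```python
results = client.run_csv_predictions(
    session_id="abc123",
    csv_file="holdout.jsonl",   # Parquet, JSON, and CSV paths work identically
    target_column="churn",      # hypothetical target; enables accuracy calculation
    sample_size=500,
)
```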
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: featrixsphere
- Version: 0.2.1002
+ Version: 0.2.1206
  Summary: Transform any CSV into a production-ready ML model in minutes, not months.
  Home-page: https://github.com/Featrix/sphere
  Author: Featrix
@@ -1 +0,0 @@
- 0.2.1001