featrixsphere 0.2.1002__tar.gz → 0.2.1206__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. {featrixsphere-0.2.1002/featrixsphere.egg-info → featrixsphere-0.2.1206}/PKG-INFO +1 -1
  2. featrixsphere-0.2.1206/VERSION +1 -0
  3. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/__init__.py +1 -1
  4. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/client.py +527 -38
  5. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206/featrixsphere.egg-info}/PKG-INFO +1 -1
  6. featrixsphere-0.2.1002/VERSION +0 -1
  7. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/MANIFEST.in +0 -0
  8. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/README.md +0 -0
  9. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/cli.py +0 -0
  10. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/test_client.py +0 -0
  11. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/SOURCES.txt +0 -0
  12. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/dependency_links.txt +0 -0
  13. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/entry_points.txt +0 -0
  14. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/not-zip-safe +0 -0
  15. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/requires.txt +0 -0
  16. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/top_level.txt +0 -0
  17. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/requirements.txt +0 -0
  18. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/setup.cfg +0 -0
  19. {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/setup.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: featrixsphere
- Version: 0.2.1002
+ Version: 0.2.1206
  Summary: Transform any CSV into a production-ready ML model in minutes, not months.
  Home-page: https://github.com/Featrix/sphere
  Author: Featrix
@@ -0,0 +1 @@
+ 0.2.1206
@@ -38,7 +38,7 @@ Example:
  ... labels=['Experiment A', 'Experiment B'])
  """

- __version__ = "0.2.1002"
+ __version__ = "0.2.1206"
  __author__ = "Featrix"
  __email__ = "support@featrix.com"
  __license__ = "MIT"
@@ -660,6 +660,77 @@ class FeatrixSphereClient:
  )
  return response.json()

+ def publish_session(self, session_id: str) -> Dict[str, Any]:
+ """
+ Publish a session by moving it to /sphere/published/<sessionId>.
+ Moves both the session file and output directory.
+
+ Args:
+ session_id: Session ID to publish
+
+ Returns:
+ Response with published_path, output_path, and status
+
+ Example:
+ ```python
+ result = client.publish_session("abc123")
+ print(f"Published to: {result['published_path']}")
+ ```
+ """
+ response_data = self._post_json(f"/compute/session/{session_id}/publish", {})
+ return response_data
+
+ def deprecate_session(self, session_id: str, warning_message: str, expiration_date: str) -> Dict[str, Any]:
+ """
+ Deprecate a published session with a warning message and expiration date.
+ The session remains available until the expiration date.
+
+ Args:
+ session_id: Session ID to deprecate
+ warning_message: Warning message to display about deprecation
+ expiration_date: ISO format date string when session will be removed (e.g., "2025-12-31T23:59:59Z")
+
+ Returns:
+ Response with deprecation status
+
+ Example:
+ ```python
+ from datetime import datetime, timedelta
+
+ expiration = (datetime.now() + timedelta(days=90)).isoformat() + "Z"
+ result = client.deprecate_session(
+ session_id="abc123",
+ warning_message="This session will be removed on 2025-12-31",
+ expiration_date=expiration
+ )
+ ```
+ """
+ data = {
+ "warning_message": warning_message,
+ "expiration_date": expiration_date
+ }
+ response_data = self._post_json(f"/compute/session/{session_id}/deprecate", data)
+ return response_data
+
+ def unpublish_session(self, session_id: str) -> Dict[str, Any]:
+ """
+ Unpublish a session by moving it back from /sphere/published/<sessionId>.
+
+ Args:
+ session_id: Session ID to unpublish
+
+ Returns:
+ Response with unpublish status
+
+ Example:
+ ```python
+ result = client.unpublish_session("abc123")
+ print(f"Status: {result['status']}")
+ ```
+ """
+ response_data = self._post_json(f"/compute/session/{session_id}/unpublish", {})
+ return response_data
+
  def get_sessions_for_org(self, name_prefix: str, max_retries: int = None) -> Dict[str, Any]:
  """
  Get all sessions matching a name prefix across all compute nodes.
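Note: taken together, the three new endpoints above form a publish lifecycle: publish, optionally deprecate with a grace period, then unpublish. A minimal sketch of driving it end to end, assuming the API endpoint is reachable and that `FeatrixSphereClient` imports from `featrixsphere.client`:

```python
from featrixsphere.client import FeatrixSphereClient

client = FeatrixSphereClient("https://sphere-api.featrix.com")

# Move a finished session into /sphere/published/<sessionId>.
published = client.publish_session("abc123")
print(f"Published to: {published['published_path']}")

# Warn consumers it is going away; it stays live until the expiration date.
client.deprecate_session(
    session_id="abc123",
    warning_message="Superseded by a newer session; removed after 2025-12-31.",
    expiration_date="2025-12-31T23:59:59Z",
)

# Or pull it back out of the published tree entirely.
client.unpublish_session("abc123")
```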
@@ -1424,16 +1495,141 @@ class FeatrixSphereClient:
  job_queue_positions={}
  )

+ def fine_tune_embedding_space(
+ self,
+ name: str,
+ parent_session_id: str = None,
+ parent_embedding_space_path: str = None,
+ s3_training_dataset: str = None,
+ s3_validation_dataset: str = None,
+ webhooks: Dict[str, str] = None
+ ) -> SessionInfo:
+ """
+ Fine-tune an existing embedding space on new data.
+
+ This method takes a pre-trained embedding space (the "parent") and fine-tunes it
+ on a new dataset with the same columns. The number of training epochs is automatically
+ calculated based on the dataset size ratio to ensure optimal training.
+
+ **How Epoch Calculation Works:**
+ - The system calculates F = len(new_dataset) / len(old_dataset)
+ - New epochs = original_epochs / F
+ - If new dataset is smaller (F < 1), more epochs are used (to see data enough times)
+ - If new dataset is larger (F > 1), fewer epochs are used (less repetition needed)
+
+ **Example:**
+ - Original: 1000 rows, trained for 100 epochs
+ - New: 500 rows → F = 0.5 → 100/0.5 = 200 epochs
+ - New: 2000 rows → F = 2.0 → 100/2.0 = 50 epochs
+
+ This ensures the model sees the new data an appropriate number of times relative
+ to how much it saw the original data.
+
+ Args:
+ name: Name for the fine-tuned embedding space
+ parent_session_id: Session ID of the parent embedding space (optional)
+ parent_embedding_space_path: Direct path to parent embedding space pickle file (optional)
+ s3_training_dataset: S3 URL for new training dataset (must start with 's3://')
+ s3_validation_dataset: S3 URL for new validation dataset (must start with 's3://')
+ webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
+
+ Returns:
+ SessionInfo for the newly created fine-tuning session
+
+ Raises:
+ ValueError: If S3 URLs are invalid or neither parent identifier is provided
+
+ Example:
+ ```python
+ # Fine-tune an existing embedding space on new data
+ client = FeatrixSphereClient("https://sphere-api.featrix.com")
+
+ # Option 1: Use parent session ID
+ fine_tuned = client.fine_tune_embedding_space(
+ name="customer_behavior_v2",
+ parent_session_id="abc123-20240101-120000",
+ s3_training_dataset="s3://my-bucket/new_training_data.csv",
+ s3_validation_dataset="s3://my-bucket/new_validation_data.csv"
+ )
+
+ # Option 2: Use direct path to parent embedding space
+ fine_tuned = client.fine_tune_embedding_space(
+ name="customer_behavior_v2",
+ parent_embedding_space_path="/path/to/parent/embedded_space.pickle",
+ s3_training_dataset="s3://my-bucket/new_training_data.csv",
+ s3_validation_dataset="s3://my-bucket/new_validation_data.csv"
+ )
+
+ # Wait for fine-tuning to complete
+ client.wait_for_session_completion(fine_tuned.session_id)
+
+ # Use the fine-tuned model for predictions
+ result = client.predict(fine_tuned.session_id, {"feature1": "value1"})
+ ```
+ """
+ # Validate S3 URLs
+ if s3_training_dataset and not s3_training_dataset.startswith('s3://'):
+ raise ValueError("s3_training_dataset must be a valid S3 URL (s3://...)")
+ if s3_validation_dataset and not s3_validation_dataset.startswith('s3://'):
+ raise ValueError("s3_validation_dataset must be a valid S3 URL (s3://...)")
+
+ # Validate that we have either parent_session_id or parent_embedding_space_path
+ if not parent_session_id and not parent_embedding_space_path:
+ raise ValueError("Either parent_session_id or parent_embedding_space_path must be provided")
+
+ print(f"Fine-tuning embedding space '{name}'...")
+ if parent_session_id:
+ print(f" Parent session: {parent_session_id}")
+ if parent_embedding_space_path:
+ print(f" Parent embedding space: {parent_embedding_space_path}")
+ print(f" New training data: {s3_training_dataset}")
+ print(f" New validation data: {s3_validation_dataset}")
+
+ data = {
+ "name": name,
+ "s3_file_data_set_training": s3_training_dataset,
+ "s3_file_data_set_validation": s3_validation_dataset
+ }
+
+ if parent_session_id:
+ data["parent_session_id"] = parent_session_id
+ if parent_embedding_space_path:
+ data["parent_embedding_space_path"] = parent_embedding_space_path
+
+ if webhooks:
+ data['webhooks'] = webhooks
+
+ response_data = self._post_json("/compute/fine-tune-embedding-space", data)
+
+ session_id = response_data.get('session_id')
+ fine_tune_info = response_data.get('fine_tune_info', {})
+
+ print(f"Fine-tuning session created: {session_id}")
+ if fine_tune_info:
+ print(f" Original dataset: {fine_tune_info.get('original_train_size', 'N/A')} rows")
+ print(f" New dataset: {fine_tune_info.get('new_total_size', 'N/A')} rows")
+ print(f" Dataset ratio (F): {fine_tune_info.get('F', 'N/A'):.4f}")
+ print(f" Original epochs: {fine_tune_info.get('original_epochs', 'N/A')}")
+ print(f" Calculated epochs: {fine_tune_info.get('calculated_epochs', 'N/A')}")
+
+ return SessionInfo(
+ session_id=session_id,
+ session_type=response_data.get('session_type', 'embedding_space_finetune'),
+ status=response_data.get('status', 'ready'),
+ jobs={},
+ job_queue_positions={}
+ )
+
  # =========================================================================
  # File Upload
  # =========================================================================

  def upload_file_and_create_session(self, file_path: Path, session_name_prefix: str = None, name: str = None, webhooks: Dict[str, str] = None) -> SessionInfo:
  """
- Upload a CSV file and create a new session.
+ Upload a CSV, Parquet, JSON, or JSONL file and create a new session.

  Args:
- file_path: Path to the CSV file to upload
+ file_path: Path to the CSV, Parquet, JSON, or JSONL file to upload
  session_name_prefix: Optional prefix for the session ID. Session will be named <prefix>-<full-uuid>
  name: Optional name for the embedding space/model (for identification and metadata)
  webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
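Note: the epoch rule documented in `fine_tune_embedding_space` above is plain arithmetic. A sketch of it, assuming integer rounding and a floor of one epoch (the diff does not show how the server rounds):

```python
def fine_tune_epochs(original_rows: int, original_epochs: int, new_rows: int) -> int:
    # Documented rule: F = len(new_dataset) / len(old_dataset); new epochs = original / F.
    F = new_rows / original_rows
    return max(1, round(original_epochs / F))

# Reproduces the docstring's worked examples:
assert fine_tune_epochs(1000, 100, 500) == 200   # smaller dataset -> more epochs
assert fine_tune_epochs(1000, 100, 2000) == 50   # larger dataset -> fewer epochs
```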
@@ -1491,12 +1687,13 @@ class FeatrixSphereClient:
  string_list_delimiter: str = "|",
  important_columns_for_visualization: List[str] = None,
  metadata: Dict[str, Any] = None,
+ user_metadata: Dict[str, Any] = None, # User metadata for ES/SP identification (max 32KB)
  session_name_prefix: str = None,
  name: str = None,
  webhooks: Dict[str, str] = None,
  epochs: int = None) -> SessionInfo:
  """
- Upload a pandas DataFrame or CSV file and create a new session.
+ Upload a pandas DataFrame, CSV file, Parquet file, JSON file, or JSONL file and create a new session.

  Special Column: __featrix_train_predictor
  ------------------------------------------
@@ -1504,7 +1701,7 @@ class FeatrixSphereClient:
  which rows are used for single predictor training.

  How it works:
- - Add a boolean column "__featrix_train_predictor" to your DataFrame/CSV before upload
+ - Add a boolean column "__featrix_train_predictor" to your DataFrame/CSV/Parquet/JSON/JSONL before upload
  - Set it to True for rows you want to use for predictor training
  - Set it to False (or any other value) for rows to exclude from predictor training
  - Embedding space training uses ALL rows (ignores this column)
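Note: a short sketch of the special column in practice, assuming a pandas DataFrame with a hypothetical `split` column and the `upload_df_and_create_session` method that appears later in this diff:

```python
import pandas as pd

df = pd.read_csv("loans.csv")

# Embedding space training uses every row; only rows flagged True here
# are used for single predictor training.
df["__featrix_train_predictor"] = df["split"] == "train"  # 'split' is illustrative

session = client.upload_df_and_create_session(df=df, name="loans_v1")
```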
@@ -1538,7 +1735,7 @@ class FeatrixSphereClient:
  Args:
  df: pandas DataFrame to upload (optional if file_path is provided)
  filename: Name to give the uploaded file (default: "data.csv")
- file_path: Path to CSV file to upload (optional if df is provided)
+ file_path: Path to CSV, Parquet, JSON, or JSONL file to upload (optional if df is provided)
  column_overrides: Dict mapping column names to types ("scalar", "set", "free_string", "free_string_list")
  column_types: Alias for column_overrides (for backward compatibility)
  string_list_delimiter: Delimiter for free_string_list columns (default: "|")
@@ -1579,21 +1776,90 @@ class FeatrixSphereClient:
  if not os.path.exists(file_path):
  raise FileNotFoundError(f"File not found: {file_path}")

- # Check if it's a CSV file
- if not file_path.lower().endswith(('.csv', '.csv.gz')):
- raise ValueError("File must be a CSV file (with .csv or .csv.gz extension)")
+ # Check if it's a supported file type
+ file_ext = file_path.lower()
+ if not file_ext.endswith(('.csv', '.csv.gz', '.parquet', '.json', '.jsonl')):
+ raise ValueError("File must be a CSV, Parquet, JSON, or JSONL file (with .csv, .csv.gz, .parquet, .json, or .jsonl extension)")

  print(f"Uploading file: {file_path}")

  # Read the file content
  if file_path.endswith('.gz'):
- # Already gzipped
+ # Already gzipped CSV
  with gzip.open(file_path, 'rb') as f:
  file_content = f.read()
  upload_filename = os.path.basename(file_path)
  content_type = 'application/gzip'
+ elif file_path.lower().endswith(('.json', '.jsonl')):
+ # JSON/JSONL file - read as DataFrame, convert to CSV, then compress
+ print(f"Reading {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} file...")
+ try:
+ from featrix.neural.input_data_file import featrix_wrap_read_json_file
+ json_df = featrix_wrap_read_json_file(file_path)
+ if json_df is None:
+ raise ValueError(f"Failed to parse {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} file")
+ except ImportError:
+ # Fallback to pandas if featrix wrapper not available
+ if file_path.lower().endswith('.jsonl'):
+ # JSONL: one JSON object per line
+ import json
+ records = []
+ with open(file_path, 'r', encoding='utf-8') as f:
+ for line in f:
+ if line.strip():
+ records.append(json.loads(line))
+ json_df = pd.DataFrame(records)
+ else:
+ # Regular JSON
+ json_df = pd.read_json(file_path)
+
+ # Clean NaN values before CSV conversion
+ cleaned_df = json_df.where(pd.notna(json_df), None)
+
+ # Convert to CSV and compress
+ csv_buffer = io.StringIO()
+ cleaned_df.to_csv(csv_buffer, index=False)
+ csv_data = csv_buffer.getvalue().encode('utf-8')
+
+ print(f"Compressing {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} (converted to CSV)...")
+ compressed_buffer = io.BytesIO()
+ with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+ gz.write(csv_data)
+ file_content = compressed_buffer.getvalue()
+ upload_filename = os.path.basename(file_path).replace('.jsonl', '.csv.gz').replace('.json', '.csv.gz')
+ content_type = 'application/gzip'
+
+ original_size = len(csv_data)
+ compressed_size = len(file_content)
+ compression_ratio = (1 - compressed_size / original_size) * 100
+ print(f"Converted {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
+ elif file_path.lower().endswith('.parquet'):
+ # Parquet file - read as DataFrame, convert to CSV, then compress
+ print("Reading Parquet file...")
+ parquet_df = pd.read_parquet(file_path)
+
+ # Clean NaN values before CSV conversion
+ cleaned_df = parquet_df.where(pd.notna(parquet_df), None)
+
+ # Convert to CSV and compress
+ csv_buffer = io.StringIO()
+ cleaned_df.to_csv(csv_buffer, index=False)
+ csv_data = csv_buffer.getvalue().encode('utf-8')
+
+ print("Compressing Parquet (converted to CSV)...")
+ compressed_buffer = io.BytesIO()
+ with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+ gz.write(csv_data)
+ file_content = compressed_buffer.getvalue()
+ upload_filename = os.path.basename(file_path).replace('.parquet', '.csv.gz')
+ content_type = 'application/gzip'
+
+ original_size = len(csv_data)
+ compressed_size = len(file_content)
+ compression_ratio = (1 - compressed_size / original_size) * 100
+ print(f"Converted Parquet to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
  else:
- # Read CSV and compress it
+ # Regular CSV file - read and compress it
  with open(file_path, 'rb') as f:
  csv_content = f.read()

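Note: because Parquet, JSON, and JSONL inputs are converted client-side to gzipped CSV before upload, callers pass them exactly like a CSV path. A sketch, assuming a local Parquet file:

```python
# The client reads events.parquet, converts it to events.csv.gz, and uploads that.
session = client.upload_df_and_create_session(
    file_path="events.parquet",
    name="events_v1",
)
```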
@@ -1663,6 +1929,10 @@ class FeatrixSphereClient:
  import json
  data['metadata'] = json.dumps(metadata)
  print(f"Session metadata: {metadata}")
+ if user_metadata:
+ import json
+ data['user_metadata'] = json.dumps(user_metadata)
+ print(f"User metadata: {user_metadata}")
  if session_name_prefix:
  data['session_name_prefix'] = session_name_prefix
  print(f"Session name prefix: {session_name_prefix}")
@@ -3239,6 +3509,24 @@ class FeatrixSphereClient:
  response_data = self._delete_json(f"/session/{session_id}/predictor", params=params, max_retries=max_retries)
  return response_data

+ def mark_for_deletion(self, session_id: str, max_retries: int = None) -> Dict[str, Any]:
+ """
+ Mark a session for deletion. The session will be deleted by the garbage collection process.
+
+ Args:
+ session_id: Session ID to mark for deletion
+ max_retries: Number of retries for errors (default: uses client default)
+
+ Returns:
+ Dictionary with confirmation that the session was marked for deletion
+
+ Example:
+ result = client.mark_for_deletion("session_123")
+ print(result) # {"status": "marked", "session_id": "session_123"}
+ """
+ response_data = self._post_json(f"/compute/session/{session_id}/mark_for_deletion", max_retries=max_retries)
+ return response_data
+

  def _create_interactive_training_movie(self, training_metrics, epoch_projections, session_id,
  show_embedding_evolution, show_loss_evolution):
@@ -3723,7 +4011,7 @@ class FeatrixSphereClient:
  name: str = None,
  session_name_prefix: str = None,
  epochs: int = 0, batch_size: int = 0, learning_rate: float = 0.001,
- positive_label: str = None,
+ rare_label_value: str = None,
  class_imbalance: dict = None,
  optimize_for: str = "balanced",
  poll_interval: int = 30, max_poll_time: int = 3600,
@@ -3746,7 +4034,7 @@ class FeatrixSphereClient:
  epochs: Number of training epochs (default: 0; automatic)
  batch_size: Training batch size (default: 0; automatic)
  learning_rate: Learning rate for training (default: 0.001)
- positive_label: For binary classification, which class is "positive" for metrics (default: None)
+ rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
  class_imbalance: Expected class ratios/counts from real world for sampled data (default: None)
  optimize_for: Optimization target - "balanced" (F1 score), "precision", or "recall" (default: "balanced")
  poll_interval: Seconds between status checks when job is already running (default: 30)
@@ -3787,8 +4075,8 @@ class FeatrixSphereClient:
  data["name"] = name
  if session_name_prefix:
  data["session_name_prefix"] = session_name_prefix
- if positive_label:
- data["positive_label"] = positive_label
+ if rare_label_value:
+ data["rare_label_value"] = rare_label_value
  if class_imbalance:
  data["class_imbalance"] = class_imbalance
  if webhooks:
@@ -3821,9 +4109,11 @@ class FeatrixSphereClient:
  df = None,
  epochs: int = 0,
  validation_ignore_columns: List[str] = None,
- positive_label: str = None,
+ rare_label_value: str = None,
  class_imbalance: dict = None,
  optimize_for: str = "balanced",
+ cost_false_positive: float = None,
+ cost_false_negative: float = None,
  poll_interval: int = 30, max_poll_time: int = 3600,
  verbose: bool = True,
  webhooks: Dict[str, str] = None) -> Dict[str, Any]:
@@ -4036,14 +4326,14 @@ class FeatrixSphereClient:

  If not provided, class weights are computed from your training data distribution.

- Understanding positive_label:
+ Understanding rare_label_value:
  -----------------------------
- For binary classification, positive_label specifies which class is considered the
- "positive" class for computing metrics like precision, recall, and ROC-AUC.
+ For binary classification, rare_label_value specifies which class is the rare/minority
+ class for computing metrics like precision, recall, and ROC-AUC.

  Example: For a credit risk model predicting "good" vs "bad" loans:

- positive_label="bad" # We want to detect bad loans
+ rare_label_value="bad" # "bad" is the rare class we want to detect

  This affects how metrics are reported:
  - Precision = True Positives / (True Positives + False Positives)
@@ -4124,7 +4414,7 @@ class FeatrixSphereClient:
  session_id=session.session_id,
  target_column='approved',
  target_column_type='set',
- positive_label='yes'
+ rare_label_value='yes'
  )
  ```

@@ -4140,7 +4430,7 @@ class FeatrixSphereClient:
  target_column_type='set',
  class_imbalance={'approved': 0.97, 'rejected': 0.03},
  optimize_for='recall', # Don't miss rejections
- positive_label='rejected'
+ rare_label_value='rejected'
  )

  # System will:
@@ -4159,7 +4449,7 @@ class FeatrixSphereClient:
  session_id=session.session_id,
  target_column='is_fraud',
  target_column_type='set',
- positive_label='fraud',
+ rare_label_value='fraud',
  optimize_for='precision', # Minimize false alarms
  class_imbalance={'legitimate': 0.999, 'fraud': 0.001}
  )
@@ -4174,7 +4464,7 @@ class FeatrixSphereClient:
  session_id=session.session_id,
  target_column='has_disease',
  target_column_type='set',
- positive_label='positive',
+ rare_label_value='positive',
  optimize_for='recall' # Don't miss any cases
  )
  ```
@@ -4189,7 +4479,7 @@ class FeatrixSphereClient:
  target_column='churn',
  target_column_type='set',
  validation_ignore_columns=['customer_id', 'signup_date'],
- positive_label='churned'
+ rare_label_value='churned'
  )
  ```

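Note: what `rare_label_value` changes is which class occupies the "positive" slot when precision and recall are computed. A self-contained sketch of that effect (an illustration of the standard definitions, not the server's metric code):

```python
def precision_recall(y_true, y_pred, rare_label_value):
    # Treat rare_label_value as the positive class in the confusion matrix.
    tp = sum(t == rare_label_value and p == rare_label_value for t, p in zip(y_true, y_pred))
    fp = sum(t != rare_label_value and p == rare_label_value for t, p in zip(y_true, y_pred))
    fn = sum(t == rare_label_value and p != rare_label_value for t, p in zip(y_true, y_pred))
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return precision, recall

y_true = ["good", "bad", "bad", "good", "bad"]
y_pred = ["good", "bad", "good", "good", "bad"]
print(precision_recall(y_true, y_pred, "bad"))  # (1.0, 0.666...): 'bad' is positive
```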
@@ -4215,9 +4505,16 @@ class FeatrixSphereClient:
  embedding space! If neither provided, uses session's original data file.
  epochs: Number of training epochs (default: 0; automatic)
  validation_ignore_columns: List of column names to exclude from validation queries (default: None)
- positive_label: For binary classification, which class is "positive" for metrics (default: None)
+ rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
  class_imbalance: Expected class ratios/counts from real world for sampled data (default: None)
- optimize_for: Optimization target - "balanced" (F1 score), "precision", or "recall" (default: "balanced")
+ optimize_for: Optimization target - "balanced" (F1 score), "precision", or "recall" (default: "balanced").
+ Ignored if cost_false_positive and cost_false_negative are provided.
+ cost_false_positive: Cost of a false positive (predicting positive when actually negative).
+ Must be specified together with cost_false_negative. Only valid for target_column_type="set".
+ When provided, overrides optimize_for and uses cost-based optimization.
+ cost_false_negative: Cost of a false negative (predicting negative when actually positive).
+ Must be specified together with cost_false_positive. Only valid for target_column_type="set".
+ When provided, overrides optimize_for and uses cost-based optimization.
  poll_interval: Seconds between status checks when job is already running (default: 30)
  max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
  verbose: Whether to print status updates during polling (default: True)
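Note: a sketch of a call using the new cost parameters. The surrounding method's `def` line falls outside this hunk, so the name `train_single_predictor` is inferred from the `_train_single_predictor_with_file` helper shown later; the dollar figures are illustrative, and only their relative size drives the trade-off:

```python
result = client.train_single_predictor(   # method name inferred; def line not shown in this diff
    session_id=session.session_id,
    target_column="is_fraud",
    target_column_type="set",
    rare_label_value="fraud",
    cost_false_positive=1.0,    # e.g., cost of one analyst review
    cost_false_negative=200.0,  # e.g., cost of one missed fraud
)
```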
@@ -4234,6 +4531,18 @@ class FeatrixSphereClient:
  if file_path and df is not None:
  raise ValueError("Provide either file_path or df, not both")

+ # Validate cost parameters
+ if cost_false_positive is not None or cost_false_negative is not None:
+ if cost_false_positive is None or cost_false_negative is None:
+ raise ValueError("Both cost_false_positive and cost_false_negative must be specified together")
+ if target_column_type != "set":
+ raise ValueError("cost_false_positive and cost_false_negative are only valid for target_column_type='set' (classification), not 'scalar' (regression)")
+ if cost_false_positive <= 0 or cost_false_negative <= 0:
+ raise ValueError("cost_false_positive and cost_false_negative must be positive numbers")
+ if verbose:
+ print(f"💰 Cost-based optimization enabled: FP cost={cost_false_positive}, FN cost={cost_false_negative}")
+ print(f" (optimize_for='{optimize_for}' will be ignored)")
+
  # If DataFrame provided, save to temp file and use file_path logic
  temp_file = None
  if df is not None:
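Note: conceptually, supplying both costs swaps the F1/precision/recall target for an expected-cost objective. A standalone sketch of threshold selection under asymmetric costs, offered to illustrate the idea only; the diff does not show how the server implements it:

```python
def pick_threshold(scores, labels, rare_label_value, cost_fp, cost_fn):
    # Scan cutoffs and keep the one with the lowest total misclassification cost.
    best_t, best_cost = 0.5, float("inf")
    for t in (i / 100 for i in range(1, 100)):
        fp = sum(s >= t and y != rare_label_value for s, y in zip(scores, labels))
        fn = sum(s < t and y == rare_label_value for s, y in zip(scores, labels))
        cost = fp * cost_fp + fn * cost_fn
        if cost < best_cost:
            best_t, best_cost = t, cost
    return best_t

# A high FN cost keeps the cutoff low, trading false alarms for fewer misses.
print(pick_threshold([0.2, 0.4, 0.9], ["ok", "fraud", "fraud"], "fraud", 1.0, 200.0))
```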
@@ -4264,9 +4573,11 @@ class FeatrixSphereClient:
  target_column=target_column,
  target_column_type=target_column_type,
  epochs=epochs,
- positive_label=positive_label,
+ rare_label_value=rare_label_value,
  class_imbalance=class_imbalance,
  optimize_for=optimize_for,
+ cost_false_positive=cost_false_positive,
+ cost_false_negative=cost_false_negative,
  verbose=verbose,
  webhooks=webhooks
  )
@@ -4277,10 +4588,13 @@ class FeatrixSphereClient:
  "target_column_type": target_column_type,
  "epochs": epochs,
  "validation_ignore_columns": validation_ignore_columns or [],
- "positive_label": positive_label,
+ "rare_label_value": rare_label_value,
  "class_imbalance": class_imbalance,
  "optimize_for": optimize_for
  }
+ if cost_false_positive is not None and cost_false_negative is not None:
+ data["cost_false_positive"] = cost_false_positive
+ data["cost_false_negative"] = cost_false_negative
  if webhooks:
  data['webhooks'] = webhooks

@@ -4579,7 +4893,7 @@ class FeatrixSphereClient:
  predictor_id: str = None, target_column: str = None,
  batch_size: int = 0, learning_rate: float = None,
  poll_interval: int = 30, max_poll_time: int = 3600,
- verbose: bool = True) -> Dict[str, Any]:
+ verbose: bool = True, webhooks: Dict[str, str] = None) -> Dict[str, Any]:
  """
  Continue training an existing single predictor for more epochs.
  Loads the existing predictor and resumes training from where it left off.
@@ -4594,6 +4908,7 @@ class FeatrixSphereClient:
  poll_interval: Seconds between status checks (default: 30)
  max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
  verbose: Whether to print status updates (default: True)
+ webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)

  Returns:
  Response with continuation start confirmation or completion status
@@ -4625,6 +4940,8 @@ class FeatrixSphereClient:
  data["target_column"] = target_column
  if learning_rate is not None:
  data["learning_rate"] = learning_rate
+ if webhooks:
+ data["webhooks"] = webhooks

  if verbose:
  print(f"🔄 Continuing training for predictor on session {session_id}")
@@ -4714,6 +5031,139 @@ class FeatrixSphereClient:
  print(f"❌ Error starting predictor continuation: {e}")
  raise

+ def foundation_model_train_more(self, session_id: str, es_id: str = None, data_passes: int = None,
+ epochs: int = None, poll_interval: int = 30, max_poll_time: int = 3600,
+ verbose: bool = True, webhooks: Dict[str, str] = None) -> Dict[str, Any]:
+ """
+ Continue training an existing foundation model (embedding space) for more epochs.
+ Loads the existing embedding space and resumes training from where it left off.
+
+ Args:
+ session_id: Session ID containing the trained foundation model
+ es_id: Embedding space ID (optional, uses session's ES if not provided)
+ data_passes: Additional epochs to train (preferred, default: 50)
+ epochs: Additional epochs to train (deprecated, use data_passes instead, for compatibility)
+ poll_interval: Seconds between status checks (default: 30)
+ max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
+ verbose: Whether to print status updates (default: True)
+ webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
+
+ Returns:
+ Response with continuation start confirmation or completion status
+
+ Example:
+ ```python
+ # Continue training for 50 more epochs
+ result = client.foundation_model_train_more(
+ session_id="abc123",
+ data_passes=50
+ )
+ ```
+ """
+ # Support both data_passes and epochs for compatibility
+ if data_passes is None and epochs is None:
+ data_passes = 50 # Default
+ elif data_passes is None:
+ data_passes = epochs # Use epochs if data_passes not provided
+ # If both provided, data_passes takes precedence
+
+ if data_passes <= 0:
+ raise ValueError("data_passes (or epochs) must be > 0 (specify additional epochs to train)")
+
+ data = {
+ "data_passes": data_passes,
+ }
+
+ if es_id:
+ data["es_id"] = es_id
+ if webhooks:
+ data["webhooks"] = webhooks
+
+ if verbose:
+ print(f"🔄 Continuing training for foundation model on session {session_id}")
+ print(f" Additional epochs: {data_passes}")
+ if es_id:
+ print(f" ES ID: {es_id}")
+
+ try:
+ response_data = self._post_json(f"/compute/session/{session_id}/train_foundation_model_more", data)
+
+ if verbose:
+ print(f"✅ Foundation model continuation started: {response_data.get('message')}")
+
+ # Poll for completion if requested
+ if poll_interval > 0 and max_poll_time > 0:
+ import time
+ start_time = time.time()
+ last_status = ""
+
+ while time.time() - start_time < max_poll_time:
+ try:
+ session_info = self.get_session_status(session_id)
+ jobs = session_info.jobs if hasattr(session_info, 'jobs') else {}
+
+ # Find continuation jobs
+ es_jobs = {j_id: j for j_id, j in jobs.items()
+ if j.get('type') == 'train_es'}
+
+ if not es_jobs:
+ if verbose:
+ print("✅ No continuation jobs found - training may have completed")
+ break
+
+ # Check job statuses
+ running_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'running']
+ completed_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'done']
+ failed_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'failed']
+
+ current_status = f"Running: {len(running_jobs)}, Done: {len(completed_jobs)}, Failed: {len(failed_jobs)}"
+ if current_status != last_status and verbose:
+ print(f"📊 Status: {current_status}")
+ last_status = current_status
+
+ if not running_jobs and (completed_jobs or failed_jobs):
+ if completed_jobs:
+ if verbose:
+ print(f"✅ Foundation model continuation completed successfully!")
+ return {
+ "message": "Foundation model continuation completed successfully",
+ "session_id": session_id,
+ "status": "completed",
+ "additional_epochs": data_passes
+ }
+ else:
+ if verbose:
+ print(f"❌ Foundation model continuation failed")
+ return {
+ "message": "Foundation model continuation failed",
+ "session_id": session_id,
+ "status": "failed",
+ "failed_jobs": failed_jobs
+ }
+
+ time.sleep(poll_interval)
+ except Exception as poll_error:
+ if verbose:
+ print(f"⚠️ Error during polling: {poll_error}")
+ time.sleep(poll_interval)
+
+ # Timeout
+ if verbose:
+ print(f"⏱️ Polling timeout reached ({max_poll_time}s)")
+ return {
+ "message": "Polling timeout",
+ "session_id": session_id,
+ "status": "timeout",
+ "additional_epochs": data_passes
+ }
+
+ return response_data
+
+ except Exception as e:
+ if verbose:
+ print(f"❌ Error starting foundation model continuation: {e}")
+ raise
+
  def _train_single_predictor_with_file(
  self,
  session_id: str,
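Note: since `foundation_model_train_more` only polls while both `poll_interval > 0` and `max_poll_time > 0`, zeroing either turns the call into fire-and-forget. A sketch:

```python
# Kick off 100 more passes and return immediately; observe completion via
# webhooks or a later get_session_status(session_id) call.
resp = client.foundation_model_train_more(
    session_id="abc123",
    data_passes=100,
    poll_interval=0,
)
print(resp.get("message"))
```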
@@ -4721,10 +5171,12 @@ class FeatrixSphereClient:
  target_column: str,
  target_column_type: str,
  epochs: int,
- positive_label: str,
+ rare_label_value: str,
  class_imbalance: dict,
  optimize_for: str,
- verbose: bool,
+ cost_false_positive: float = None,
+ cost_false_negative: float = None,
+ verbose: bool = True,
  webhooks: Dict[str, str] = None
  ) -> Dict[str, Any]:
  """
@@ -4753,12 +5205,16 @@ class FeatrixSphereClient:
  'optimize_for': optimize_for,
  }

- if positive_label:
- data['positive_label'] = positive_label
+ if rare_label_value:
+ data['rare_label_value'] = rare_label_value

  if class_imbalance:
  data['class_imbalance'] = json.dumps(class_imbalance)

+ if cost_false_positive is not None and cost_false_negative is not None:
+ data['cost_false_positive'] = str(cost_false_positive)
+ data['cost_false_negative'] = str(cost_false_negative)
+
  if webhooks:
  data['webhooks'] = json.dumps(webhooks)

@@ -5785,7 +6241,24 @@ class FeatrixSphereClient:
  if not file_path.exists():
  raise FileNotFoundError(f"File not found: {file_path}")

- df = pd.read_csv(file_path)
+ # Support CSV, Parquet, JSON, and JSONL files
+ file_path_str = str(file_path).lower()
+ if file_path_str.endswith('.parquet'):
+ df = pd.read_parquet(file_path)
+ elif file_path_str.endswith('.jsonl'):
+ # JSONL: one JSON object per line
+ import json
+ records = []
+ with open(file_path, 'r', encoding='utf-8') as f:
+ for line in f:
+ if line.strip():
+ records.append(json.loads(line))
+ df = pd.DataFrame(records)
+ elif file_path_str.endswith('.json'):
+ # Regular JSON
+ df = pd.read_json(file_path)
+ else:
+ df = pd.read_csv(file_path)

  # Convert to JSON Tables format and clean NaNs
  table_data = JSONTablesEncoder.from_dataframe(df)
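Note: the same four-way dispatch now appears at several call sites in the client. A standalone sketch of the pandas fallback path for reference (`featrix_wrap_read_json_file`, used elsewhere when importable, is omitted here):

```python
import json
import pandas as pd

def read_any(path: str) -> pd.DataFrame:
    # Mirror the client's extension-based dispatch for CSV/Parquet/JSON/JSONL.
    lower = path.lower()
    if lower.endswith(".parquet"):
        return pd.read_parquet(path)
    if lower.endswith(".jsonl"):
        # JSONL: one JSON object per line, blank lines skipped
        with open(path, "r", encoding="utf-8") as f:
            return pd.DataFrame([json.loads(line) for line in f if line.strip()])
    if lower.endswith(".json"):
        return pd.read_json(path)
    return pd.read_csv(path)
```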
@@ -5939,11 +6412,11 @@ class FeatrixSphereClient:
  def run_csv_predictions(self, session_id: str, csv_file: str, target_column: str = None,
  sample_size: int = None, remove_target: bool = True) -> Dict[str, Any]:
  """
- Run predictions on a CSV file with automatic accuracy calculation.
+ Run predictions on a CSV, Parquet, JSON, or JSONL file with automatic accuracy calculation.

  Args:
  session_id: ID of session with trained predictor
- csv_file: Path to CSV file
+ csv_file: Path to CSV, Parquet, JSON, or JSONL file
  target_column: Name of target column (for accuracy calculation)
  sample_size: Number of records to test (None = all records)
  remove_target: Whether to remove target column from prediction input
@@ -5953,8 +6426,24 @@ class FeatrixSphereClient:
  """
  import pandas as pd

- # Load CSV
- df = pd.read_csv(csv_file)
+ # Load CSV, Parquet, JSON, or JSONL
+ csv_file_lower = csv_file.lower()
+ if csv_file_lower.endswith('.parquet'):
+ df = pd.read_parquet(csv_file)
+ elif csv_file_lower.endswith('.jsonl'):
+ # JSONL: one JSON object per line
+ import json
+ records = []
+ with open(csv_file, 'r', encoding='utf-8') as f:
+ for line in f:
+ if line.strip():
+ records.append(json.loads(line))
+ df = pd.DataFrame(records)
+ elif csv_file_lower.endswith('.json'):
+ # Regular JSON
+ df = pd.read_json(csv_file)
+ else:
+ df = pd.read_csv(csv_file)

  # Handle target column
  actual_values = None
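Note: despite the `csv_file` parameter name, any of the four formats now loads the same way. A sketch with a JSONL holdout set and hypothetical column names:

```python
results = client.run_csv_predictions(
    session_id="abc123",
    csv_file="holdout.jsonl",   # Parquet, JSON, and CSV paths work identically
    target_column="churn",      # hypothetical target; enables accuracy calculation
    sample_size=500,
)
```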
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: featrixsphere
- Version: 0.2.1002
+ Version: 0.2.1206
  Summary: Transform any CSV into a production-ready ML model in minutes, not months.
  Home-page: https://github.com/Featrix/sphere
  Author: Featrix
@@ -1 +0,0 @@
- 0.2.1001