featrixsphere 0.2.1830__py3-none-any.whl → 0.2.2279__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featrixsphere/__init__.py +1 -1
- featrixsphere/client.py +218 -65
- {featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/METADATA +1 -1
- featrixsphere-0.2.2279.dist-info/RECORD +8 -0
- featrixsphere-0.2.1830.dist-info/RECORD +0 -8
- {featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/WHEEL +0 -0
- {featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/entry_points.txt +0 -0
- {featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/top_level.txt +0 -0
featrixsphere/__init__.py CHANGED
featrixsphere/client.py CHANGED
@@ -565,7 +565,7 @@ class FeatrixSphereClient:
         """Make a DELETE request and return JSON response."""
         response = self._make_request("DELETE", endpoint, max_retries=max_retries, **kwargs)
         return self._unwrap_response(response.json())
-
+
     # =========================================================================
     # Session Management
     # =========================================================================
@@ -893,6 +893,113 @@ class FeatrixSphereClient:
         """
        response_data = self._post_json(f"/compute/session/{session_id}/unpublish", {})
         return response_data
+
+    def publish_partial_foundation(
+        self,
+        source_session_id: str,
+        name: str,
+        checkpoint_epoch: int = None,
+        session_name_prefix: str = None,
+        publish: bool = True,
+        verbose: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Publish a checkpoint from in-progress training as a standalone foundation model.
+
+        Takes a checkpoint from ongoing ES training and creates a NEW foundation model
+        session with full provenance tracking. Perfect for snapshotting good intermediate
+        models while training continues.
+
+        The new foundation model can be used with:
+        - train_on_foundational_model() - Train predictors on it
+        - Any standard foundation model operations
+        - Available across all compute nodes via backplane
+
+        Args:
+            source_session_id: Session with ES training (in-progress or completed)
+            name: Name for the new foundation model (REQUIRED)
+            checkpoint_epoch: Which epoch checkpoint to use (None = best/latest)
+            session_name_prefix: Optional prefix for new session ID
+            publish: Move to /sphere/published/ directory (default: True)
+            verbose: Print status updates
+
+        Returns:
+            dict with:
+            - foundation_session_id: New foundation session ID
+            - checkpoint_epoch: Epoch used
+            - provenance: Full metadata about source and training progress
+            - published_path: Path if published
+
+        Example:
+            ```python
+            # Snapshot epoch 50 as foundation v0.5 while training continues
+            result = client.publish_partial_foundation(
+                source_session_id="abc-123",
+                name="My Foundation v0.5",
+                checkpoint_epoch=50,
+                session_name_prefix="foundation-v0.5",
+                publish=True
+            )
+
+            foundation_id = result['foundation_session_id']
+            print(f"Published foundation: {foundation_id}")
+            print(f"Source was {result['provenance']['training_progress_percent']}% trained")
+
+            # Use immediately like any foundation model
+            client.train_on_foundational_model(
+                foundation_model_id=foundation_id,
+                target_column="price",
+                target_column_type="scalar"
+            )
+            # Available on all compute nodes automatically via backplane
+            ```
+        """
+        if verbose:
+            print(f"📦 Publishing partial foundation from {source_session_id}")
+            print(f"   Name: {name}")
+            if checkpoint_epoch is not None:
+                print(f"   Checkpoint epoch: {checkpoint_epoch}")
+            else:
+                print(f"   Checkpoint epoch: best/latest available")
+            print(f"   Publish to /sphere/published/: {publish}")
+
+        data = {
+            'name': name,
+            'publish': publish
+        }
+
+        if checkpoint_epoch is not None:
+            data['checkpoint_epoch'] = checkpoint_epoch
+        if session_name_prefix:
+            data['session_name_prefix'] = session_name_prefix
+
+        try:
+            response_data = self._post_json(
+                f"/compute/session/{source_session_id}/publish_partial_foundation",
+                data
+            )
+
+            foundation_id = response_data.get('foundation_session_id')
+            checkpoint_used = response_data.get('checkpoint_epoch')
+            provenance = response_data.get('provenance', {})
+
+            if verbose:
+                print(f"✅ {response_data.get('message')}")
+                print(f"   Foundation session ID: {foundation_id}")
+                print(f"   Checkpoint epoch: {checkpoint_used}")
+                if provenance.get('training_progress_percent'):
+                    print(f"   Source training progress: {provenance['training_progress_percent']}%")
+                if provenance.get('validation_loss_at_checkpoint'):
+                    print(f"   Val loss at checkpoint: {provenance['validation_loss_at_checkpoint']:.4f}")
+                if response_data.get('published_path'):
+                    print(f"   Published to: {response_data['published_path']}")
+
+            return response_data
+
+        except Exception as e:
+            if verbose:
+                print(f"❌ Error publishing partial foundation: {e}")
+            raise
 
     def get_sessions_for_org(self, name_prefix: str, max_retries: int = None) -> Dict[str, Any]:
         """
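The docstring's example takes a single snapshot; the same call also composes into a periodic snapshot loop while training runs. A minimal sketch, assuming an already-constructed `client` and that `get_session_status()` returns a dict reporting a status string and current epoch under the field names used below (those names are assumptions, not confirmed by this diff):

```python
import time

SNAPSHOT_EVERY = 25        # snapshot cadence in epochs (illustrative)
last_snapshot_epoch = 0

while True:
    status = client.get_session_status("abc-123")   # source ES session
    epoch = status.get('current_epoch', 0)          # assumed field name
    if status.get('status') == 'done':              # assumed field name
        break
    if epoch - last_snapshot_epoch >= SNAPSHOT_EVERY:
        result = client.publish_partial_foundation(
            source_session_id="abc-123",
            name=f"My Foundation @ epoch {epoch}",
            checkpoint_epoch=epoch,
            publish=True,
            verbose=False,
        )
        print(f"Snapshot: {result['foundation_session_id']} at epoch {result['checkpoint_epoch']}")
        last_snapshot_epoch = epoch
    time.sleep(60)
```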
@@ -1974,9 +2081,30 @@ class FeatrixSphereClient:
         - Category split: Use full data for ES, specific categories for predictor
         - Label completeness: Include unlabeled rows in ES, exclude from predictor
         - Test/holdout: Keep test data in ES context but exclude from predictor training
+
+        Special Input: Dictionary of Datasets
+        --------------------------------------
+        You can pass a dictionary of datasets instead of a single DataFrame. Each key is a dataset name,
+        and each value is a list of DataFrames/tables to include in that dataset.
+
+        When using this format:
+        - A __featrix_dataset_name column is automatically added to track which dataset each row came from
+        - All tables from all datasets are concatenated into a single DataFrame before upload
+        - The concatenated DataFrame is uploaded as normal
+
+        Example - Upload multiple datasets with labels:
+
+            datasets = {
+                'training_data': [df1, df2, df3],
+                'validation_data': [df4, df5],
+                'test_data': [df6]
+            }
+
+            session = client.upload_df_and_create_session(df=datasets)
+            # Uploads a single DataFrame with __featrix_dataset_name column indicating source
 
         Args:
-            df: pandas DataFrame to upload (optional if file_path is provided)
+            df: pandas DataFrame OR dict of {dataset_name: [DataFrames]} to upload (optional if file_path is provided)
             filename: Name to give the uploaded file (default: "data.csv")
             file_path: Path to CSV, Parquet, JSON, or JSONL file to upload (optional if df is provided)
             column_overrides: Dict mapping column names to types ("scalar", "set", "free_string", "free_string_list")
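Because the loader in the next hunk also accepts file paths as list entries, a dataset's list can mix in-memory DataFrames with on-disk files. A sketch under that assumption (the file names are placeholders):

```python
import pandas as pd

# One dataset mixes a DataFrame with a Parquet file; another is file-only.
# The client loads each file (.csv, .csv.gz, .parquet, .json, .jsonl), tags
# every row with __featrix_dataset_name, and concatenates before upload.
train_df = pd.DataFrame({'price': [1.0, 2.0], 'sku': ['a', 'b']})

datasets = {
    'training_data': [train_df, 'extra_train.parquet'],  # DataFrame + path
    'holdout_data': ['holdout.csv'],                     # path only
}

session = client.upload_df_and_create_session(df=datasets)
```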
@@ -2005,6 +2133,80 @@ class FeatrixSphereClient:
         if column_types is not None:
             column_overrides = column_types
 
+        # Handle dictionary of datasets input
+        if df is not None and isinstance(df, dict):
+            print("Detected dictionary of datasets - concatenating with __featrix_dataset_name labels")
+            all_dataframes = []
+            total_rows = 0
+
+            for dataset_name, tables in df.items():
+                if not isinstance(tables, list):
+                    raise ValueError(f"Value for dataset '{dataset_name}' must be a list of DataFrames/file paths, got {type(tables)}")
+
+                for i, table in enumerate(tables):
+                    # Handle file path (string)
+                    if isinstance(table, str):
+                        file_path_to_load = str(table)
+
+                        if not os.path.exists(file_path_to_load):
+                            raise FileNotFoundError(f"File not found in dataset '{dataset_name}': {file_path_to_load}")
+
+                        # Determine file type and load
+                        file_ext = file_path_to_load.lower()
+                        print(f"  - {dataset_name} loading file: {os.path.basename(file_path_to_load)}")
+
+                        if file_ext.endswith('.parquet'):
+                            loaded_df = pd.read_parquet(file_path_to_load)
+                        elif file_ext.endswith(('.json', '.jsonl')):
+                            try:
+                                from featrix.neural.input_data_file import featrix_wrap_read_json_file
+                                loaded_df = featrix_wrap_read_json_file(file_path_to_load)
+                                if loaded_df is None:
+                                    raise ValueError(f"Failed to parse {'JSONL' if file_ext.endswith('.jsonl') else 'JSON'} file")
+                            except ImportError:
+                                # Fallback to pandas
+                                if file_ext.endswith('.jsonl'):
+                                    import json
+                                    records = []
+                                    with open(file_path_to_load, 'r', encoding='utf-8') as f:
+                                        for line in f:
+                                            if line.strip():
+                                                records.append(json.loads(line))
+                                    loaded_df = pd.DataFrame(records)
+                                else:
+                                    loaded_df = pd.read_json(file_path_to_load)
+                        elif file_ext.endswith(('.csv', '.csv.gz')):
+                            loaded_df = pd.read_csv(file_path_to_load)
+                        else:
+                            raise ValueError(f"Unsupported file type in dataset '{dataset_name}': {file_path_to_load}. "
+                                             f"Supported: .csv, .csv.gz, .parquet, .json, .jsonl")
+
+                        labeled_table = loaded_df
+                        print(f"    Loaded {len(loaded_df)} rows, {len(loaded_df.columns)} columns")
+
+                    # Handle DataFrame
+                    elif isinstance(table, pd.DataFrame):
+                        # Create a copy to avoid modifying the original
+                        labeled_table = table.copy()
+                        print(f"  - {dataset_name} DataFrame {i+1}: {len(labeled_table)} rows, {len(labeled_table.columns)} columns")
+
+                    else:
+                        raise ValueError(f"Table {i} in dataset '{dataset_name}' must be a pandas DataFrame or file path (str), got {type(table)}")
+
+                    # Add the dataset name label column
+                    labeled_table['__featrix_dataset_name'] = dataset_name
+
+                    all_dataframes.append(labeled_table)
+                    total_rows += len(labeled_table)
+
+            if not all_dataframes:
+                raise ValueError("No DataFrames found in the provided dictionary")
+
+            # Concatenate all dataframes
+            print(f"Concatenating {len(all_dataframes)} tables from {len(df)} datasets ({total_rows} total rows)")
+            df = pd.concat(all_dataframes, ignore_index=True)
+            print(f"Combined DataFrame: {len(df)} rows, {len(df.columns)} columns (includes __featrix_dataset_name)")
+
         # Validate inputs
         if df is None and file_path is None:
             raise ValueError("Either df or file_path must be provided")
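For intuition about what the block above does to the input, the label-and-concatenate step reduces to a few lines of plain pandas. This standalone sketch mirrors the client-side transformation:

```python
import pandas as pd

df_a = pd.DataFrame({'x': [1, 2]})
df_b = pd.DataFrame({'x': [3]})

labeled = []
for dataset_name, tables in {'train': [df_a], 'test': [df_b]}.items():
    for table in tables:
        t = table.copy()                       # avoid mutating the caller's frame
        t['__featrix_dataset_name'] = dataset_name
        labeled.append(t)

combined = pd.concat(labeled, ignore_index=True)
print(combined)
#    x __featrix_dataset_name
# 0  1                  train
# 1  2                  train
# 2  3                   test
```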
@@ -4236,25 +4438,21 @@ class FeatrixSphereClient:
     def clone_in_progress_embedding_space(self, session_id: str, from_compute: str, to_compute: str,
                                           es_id: str = None, new_session_name: str = None) -> Dict[str, Any]:
         """
-        Clone
+        INTERNAL: Clone embedding space between compute nodes.
 
-
-
-
+        Note: With the backplane system, users generally don't need to manually clone.
+        Sessions are automatically available across all compute nodes.
+        This method is kept for backward compatibility and special cases.
 
         Args:
-            session_id: Source session ID
-            from_compute: Source
-            to_compute: Destination
-            es_id: Optional ES ID
-            new_session_name: Optional name for
+            session_id: Source session ID
+            from_compute: Source node name
+            to_compute: Destination node name
+            es_id: Optional ES ID (if session has multiple)
+            new_session_name: Optional name for cloned session
 
         Returns:
-            Dict with new_session_id
-
-        Raises:
-            ValueError: If multiple ES found in session and es_id not provided
-            HTTPException: If cloning fails
+            Dict with new_session_id
         """
         # Prepare request data
         request_data = {
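Given the INTERNAL note, direct calls should be rare; for the backward-compatibility cases the docstring reserves, usage follows straight from the signature. A sketch with placeholder node names:

```python
# Rarely needed now that the backplane shares sessions across nodes.
# "node-a"/"node-b" are placeholders for your deployment's compute nodes.
result = client.clone_in_progress_embedding_space(
    session_id="abc-123",
    from_compute="node-a",
    to_compute="node-b",
    new_session_name="abc-123-clone",
)
print(result['new_session_id'])
```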
@@ -4322,40 +4520,6 @@ class FeatrixSphereClient:
         print(f"Training predictor on foundation model {foundation_model_id}...")
         print(f"   Target: {target_column} ({target_column_type})")
 
-        # Get the compute cluster from the foundation model session
-        # This ensures we upload files to the same node where the foundation model lives
-        # If the foundation session doesn't exist (404), we'll proceed with current compute cluster
-        foundation_compute_cluster = None
-        try:
-            foundation_session = self.get_session_status(foundation_model_id)
-            foundation_compute_cluster = self.get_last_server_metadata()
-            foundation_compute_cluster = foundation_compute_cluster.get('compute_cluster') if foundation_compute_cluster else None
-        except Exception as e:
-            # Foundation session might not exist or be accessible - that's okay
-            # The server will validate it when we submit the training request
-            if verbose:
-                # Check if it's a 404 HTTP error
-                is_404 = False
-                if isinstance(e, requests.exceptions.HTTPError):
-                    if hasattr(e, 'response') and e.response.status_code == 404:
-                        is_404 = True
-
-                if is_404:
-                    print(f"   ⚠️ Foundation session not found (404) - will use current compute cluster")
-                    print(f"   Server will validate foundation model when training starts")
-                else:
-                    print(f"   ⚠️ Could not fetch foundation session: {e}")
-                    print(f"   Will proceed with current compute cluster")
-
-        # Temporarily set compute cluster for file uploads if we found one
-        original_compute_cluster = self.compute_cluster
-        original_headers = self.session.headers.copy()
-        if foundation_compute_cluster:
-            self.set_compute_cluster(foundation_compute_cluster)
-            if verbose:
-                print(f"   Using compute cluster: {foundation_compute_cluster}")
-        elif verbose and self.compute_cluster:
-            print(f"   Using current compute cluster: {self.compute_cluster}")
 
         try:
             # Validate that only one data source is provided
@@ -4453,20 +4617,9 @@ class FeatrixSphereClient:
 
             new_session_id = response_data.get('session_id')
             print(f"✅ Predictor training session created: {new_session_id}")
-
-
-
-            if original_compute_cluster:
-                self.set_compute_cluster(original_compute_cluster)
-            else:
-                self.session.headers = original_headers
-        finally:
-            # Ensure we restore headers even if there's an error
-            if original_compute_cluster != self.compute_cluster:
-                if original_compute_cluster:
-                    self.set_compute_cluster(original_compute_cluster)
-                else:
-                    self.session.headers = original_headers
+
+        except Exception as e:
+            raise
 
         if verbose:
             print(f"⏳ Waiting for training to complete...")
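The net effect of the last two hunks: `train_on_foundational_model()` no longer pins uploads to the foundation model's node or restores headers afterward, since the backplane makes the foundation session visible everywhere and the server validates `foundation_model_id` when training starts. Calls stay as simple as the earlier docstring example (the ID below is a placeholder):

```python
# No compute-cluster setup or teardown around the call anymore.
client.train_on_foundational_model(
    foundation_model_id="my-foundation-id",
    target_column="price",
    target_column_type="scalar",
)
```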
@@ -5079,7 +5232,7 @@ class FeatrixSphereClient:
         Extend embedding space training with new data.
 
         This function:
-        1.
+        1. Creates a new session with the existing embedding space
         2. Uploads/processes the new data
         3. Continues training from where the previous training left off
         4. Trains for the specified number of additional epochs (data_passes)
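The four steps suggest a single call that bundles them. The method's real name and signature are not visible in this hunk, so the sketch below is purely illustrative, with a hypothetical name and parameters:

```python
# Hypothetical name and parameters - only data_passes is attested by the
# docstring above. Illustrates the documented four-step flow.
new_session = client.extend_embedding_space_training(
    session_id="abc-123",     # session holding the existing embedding space
    df=new_data_df,           # new data to upload/process
    data_passes=10,           # additional epochs to train
)
```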
featrixsphere-0.2.2279.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+featrixsphere/__init__.py,sha256=CdXrzec_MfZTUpRyajYawHBu5iEWAe-flVrlXjiLLN8,1888
+featrixsphere/client.py,sha256=ez8ML3OMHMMO0ewzOGNgF1tg3KYygLG2NbEw33sf3AA,408591
+featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
+featrixsphere-0.2.2279.dist-info/METADATA,sha256=2hEITDtsSYmMLgl0RfqvcBzac4EBAH3DN4TSVf_1rPY,16232
+featrixsphere-0.2.2279.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+featrixsphere-0.2.2279.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
+featrixsphere-0.2.2279.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
+featrixsphere-0.2.2279.dist-info/RECORD,,
featrixsphere-0.2.1830.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
-featrixsphere/__init__.py,sha256=I27lMJL_tBPzKyo_79loiIS83AAC-vuoz1kA3ZY2fhc,1888
-featrixsphere/client.py,sha256=L97tRb-6pCvP7lKYOsK4iYHfsFt3V0URu4RO1mQzFoQ,401468
-featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
-featrixsphere-0.2.1830.dist-info/METADATA,sha256=IFITUpYkfYT2s7WXDX0-5Xl-iiUBt6bb69-Mr9_w6O8,16232
-featrixsphere-0.2.1830.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-featrixsphere-0.2.1830.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
-featrixsphere-0.2.1830.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
-featrixsphere-0.2.1830.dist-info/RECORD,,
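For reference, each RECORD line is `path,sha256=<urlsafe-base64 SHA-256 digest, padding stripped>,size-in-bytes` per the wheel spec (PEP 427). A small sketch that recomputes an entry for a file from an unpacked wheel:

```python
import base64
import hashlib

def record_entry(path: str) -> str:
    """Build a wheel RECORD-style line: path,sha256=<digest>,<size>."""
    with open(path, 'rb') as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=')
    return f"{path},sha256={digest.decode('ascii')},{len(data)}"

# After unpacking the 0.2.2279 wheel, this should reproduce the line above:
# featrixsphere/client.py,sha256=ez8ML3OMHMMO0ewzOGNgF1tg3KYygLG2NbEw33sf3AA,408591
print(record_entry("featrixsphere/client.py"))
```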
{featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/WHEEL
File without changes
{featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/entry_points.txt
File without changes
{featrixsphere-0.2.1830.dist-info → featrixsphere-0.2.2279.dist-info}/top_level.txt
File without changes