featrixsphere 0.2.1830__tar.gz → 0.2.2280__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/PKG-INFO +1 -1
- featrixsphere-0.2.2280/VERSION +1 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrix-update.py +75 -14
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrixsphere/__init__.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrixsphere/client.py +331 -68
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrixsphere.egg-info/PKG-INFO +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrixsphere.egg-info/SOURCES.txt +21 -3
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/requirements.txt +0 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/api.py +614 -94
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/celery_app.py +1036 -256
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/config.py +9 -5
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/event_log.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/featrix_watchdog.py +11 -6
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/celery_job_recovery.py +133 -6
- featrixsphere-0.2.2280/src/lib/distribution_shift_detector.py +693 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/epoch_projections.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/es_projections.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/es_training.py +53 -18
- featrixsphere-0.2.2280/src/lib/es_training_wrapper.py +131 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/__init__.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/__init__.py +2 -2
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/calibration_utils.py +8 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/classification_metrics.py +13 -12
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/config.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/data_frame_data_set.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/dataloader_utils.py +44 -22
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/detect.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/domain_codec.py +3 -3
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/embedded_space.py +651 -250
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/embedding_lr_scheduler.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/embedding_space_utils.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/embedding_utils.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/encoders.py +69 -19
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/enrich.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/es_projection.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/exceptions.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/featrix_csv.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/featrix_json.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/featrix_module_dict.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/featrix_token.py +7 -5
- featrixsphere-0.2.2280/src/lib/featrix/neural/feature_engineer.py +306 -0
- featrixsphere-0.2.2280/src/lib/featrix/neural/feature_suggestion_tracker.py +372 -0
- featrixsphere-0.2.2280/src/lib/featrix/neural/gpu_utils.py +724 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/guardrails.py +1 -1
- featrixsphere-0.2.2280/src/lib/featrix/neural/hybrid_column_detector.py +466 -0
- featrixsphere-0.2.2280/src/lib/featrix/neural/hybrid_encoders.py +344 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/input_data_file.py +15 -8
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/input_data_set.py +419 -44
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/integrity.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/io_utils.py +497 -127
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/json_cache.py +2 -2
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/json_codec.py +2 -2
- featrixsphere-0.2.2280/src/lib/featrix/neural/llm/__init__.py +6 -0
- featrixsphere-0.2.2280/src/lib/featrix/neural/llm/schema_analyzer.py +143 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/logging_config.py +23 -6
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/mask_tracker.py +5 -14
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/model_config.py +52 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/model_hash.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/movie_frame_task.py +3 -6
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/network_viz.py +1 -1
- featrixsphere-0.2.2280/src/lib/featrix/neural/platform_utils.py +84 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/prng_control.py +6 -6
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_adaptive_training.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_checkpoint_dict_reconstruction.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_confusion_matrix_metadata.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_embedding_quality.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_embedding_space.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_extend_embedding_space.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_focal_comparison.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_focal_comparison_enhanced.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_focal_loss_single_predictor.py +1 -1
- featrixsphere-0.2.2280/src/lib/featrix/neural/qa/test_hybrid_columns.py +481 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_label_smoothing.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_monitor_integration.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_timeline_quick.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_training_monitor.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/scalar_codec.py +5 -4
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/set_codec.py +30 -15
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/setlist_codec.py +10 -4
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/simple_mlp.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/single_predictor.py +1082 -393
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/sqlite_utils.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/stopwatch.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/string_analysis.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/string_cache.py +135 -32
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/string_codec.py +279 -35
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/timestamp_codec.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/training_context_manager.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/training_history_db.py +1 -1
- featrixsphere-0.2.2280/src/lib/featrix/neural/training_logger.py +974 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/transformer_encoder.py +50 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/url_codec.py +3 -3
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/url_parser.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/utils.py +29 -14
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/vector_codec.py +4 -4
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/world_data.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/job_manager.py +14 -9
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/json_encoder_cache.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/knn_training.py +7 -4
- featrixsphere-0.2.2280/src/lib/meta_learning_client.py +339 -0
- featrixsphere-0.2.2280/src/lib/quick_architecture_search.py +292 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/session_chains.py +175 -19
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/session_manager.py +415 -81
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/single_predictor_cv.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/single_predictor_training.py +451 -180
- featrixsphere-0.2.2280/src/lib/sp_training_wrapper.py +99 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/structureddata.py +2 -4
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/training_monitor.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/utils.py +1 -1
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/node-install.sh +214 -120
- featrixsphere-0.2.2280/src/query_schema_worker.py +255 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/render_sphere.py +1 -1
- featrixsphere-0.2.2280/src/repair_checkpoint.py +246 -0
- featrixsphere-0.2.2280/src/start_celery_gpu_worker.sh +142 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/version.py +24 -9
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/system_monitor.py +26 -8
- featrixsphere-0.2.2280/tests/test_client_data.py +145 -0
- featrixsphere-0.2.2280/tests/test_client_predictions.py +266 -0
- featrixsphere-0.2.2280/tests/test_client_sessions.py +268 -0
- featrixsphere-0.2.2280/tests/test_client_training.py +149 -0
- featrixsphere-0.2.2280/tests/test_local_integration.py +270 -0
- featrixsphere-0.2.1830/VERSION +0 -1
- featrixsphere-0.2.1830/src/lib/distribution_shift_detector.py +0 -481
- featrixsphere-0.2.1830/src/lib/es_training_wrapper.py +0 -60
- featrixsphere-0.2.1830/src/lib/featrix/neural/MetaDataCache.py +0 -203
- featrixsphere-0.2.1830/src/lib/featrix/neural/device.py +0 -40
- featrixsphere-0.2.1830/src/start_celery_gpu_worker.sh +0 -38
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/MANIFEST.in +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/README.md +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrixsphere/test_client.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrixsphere.egg-info/dependency_links.txt +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrixsphere.egg-info/entry_points.txt +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrixsphere.egg-info/not-zip-safe +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrixsphere.egg-info/requires.txt +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrixsphere.egg-info/top_level.txt +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/nv-install.sh +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/setup.cfg +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/setup.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/auto_upgrade_monitor.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/build_version.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/demo_existing_model.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/demo_label_updates.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/deploy.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/deploy_cache_debug.sh +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/ensure_watchdog_running.sh +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/error_tracker.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/example_api_usage.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/example_prediction_feedback.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/example_train_predictor.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/gc_cleanup.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/api_event_retry.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/convergence_monitor.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/crash_tracker.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/dropout_scheduler.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/hubspot_free_domains_list_may_2025.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/demo_advisor_decisions.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/example_complete_workflow.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/generate_focal_report.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/model_advisor.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/show_results.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_piecewise_epochs.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_predict_during_training.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/test_warning_tracking.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/qa/visualize_training_timeline.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/single_predictor_mlp.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/sphere_config.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/string_list_codec.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/training_event.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix/neural/training_exceptions.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/featrix_debug.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/queue_manager.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/sphere_config.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/system_health_monitor.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/vector_db.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/webhook_helpers.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/lib/weightwatcher_tracking.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/llm_client.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/manage_churro.sh +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/migrate_string_cache_naming.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/neural.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/prediction_drift_monitor.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/prediction_persistence_worker.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/quick_test_deployment.sh +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/recreate_session.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/redis_job_progress.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/redis_prediction_cli.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/redis_prediction_store.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/regenerate_training_movie.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/restart_celery_worker.sh +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/run_api_server.sh +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/send_email.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/slack.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/standalone_prediction.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/start_celery_cpu_worker.sh +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/start_celery_worker.sh +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/start_churro_server.sh +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/tail-watch.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/test_api_client.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/test_complete_workflow.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/test_json_tables_prediction.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/test_redis_predictions.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/test_server_connection.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/test_session_models.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/test_single_predictor_api.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/test_upload_endpoint.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/tree.py +0 -0
- {featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/src/utils.py +0 -0
featrixsphere-0.2.2280/VERSION
@@ -0,0 +1 @@
+0.2.2278
{featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrix-update.py
@@ -208,9 +208,13 @@ def find_newest_version(index: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         print("⚠️ No files with version information found")
         return None

-    # Sort by version (newest first)
+    # Sort by version (newest first), then by date_modified (newest first) as tiebreaker
+    # This ensures when multiple builds have the same version, we get the most recent one
     versioned_files.sort(
-        key=lambda x:
+        key=lambda x: (
+            tuple(int(p) for p in x['version'].split('.')[:3]),
+            x.get('date_modified', '')
+        ),
         reverse=True
     )
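The new sort key orders primarily by the (major, minor, patch) tuple and secondarily by `date_modified`, so ISO-8601 timestamps break ties between builds that share a version number. A minimal standalone sketch of that ordering (the file entries here are hypothetical, not real registry data):

```python
# Two hypothetical builds share version 0.2.2280; date_modified decides the winner.
versioned_files = [
    {"version": "0.2.2280", "date_modified": "2025-01-10T08:00:00Z"},
    {"version": "0.2.2280", "date_modified": "2025-01-12T09:30:00Z"},
    {"version": "0.2.1830", "date_modified": "2025-01-01T00:00:00Z"},
]

versioned_files.sort(
    key=lambda x: (
        tuple(int(p) for p in x["version"].split(".")[:3]),
        x.get("date_modified", ""),  # ISO-8601 strings sort chronologically
    ),
    reverse=True,
)

print(versioned_files[0]["date_modified"])  # 2025-01-12T09:30:00Z (newest build wins)
```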
@@ -267,16 +271,62 @@ def install_package(package_file: Path, force: bool = False) -> bool:
     print(f"   Package version: {package_version}")
     print(f"   Package hash: {package_hash}")

-    # Check if already deployed
-
-
-
-
-
-
+    # Check if already deployed
+    deployed_hash = None
+    deployed_version = None
+    if Path("/sphere/app/VERSION_HASH").exists():
+        deployed_hash = Path("/sphere/app/VERSION_HASH").read_text().strip()
+    if Path("/sphere/app/VERSION").exists():
+        deployed_version = Path("/sphere/app/VERSION").read_text().strip()
+
+    same_hash = deployed_hash and package_hash != "unknown" and package_hash == deployed_hash
+    same_version = deployed_version and package_version != "unknown" and package_version == deployed_version
+
+    if same_hash and same_version:
+        if not force:
             print(f"\n⏭️ This package is already deployed (hash: {package_hash})")
             print(f"   Skipping installation. Use --force to reinstall anyway.")
             return True
+        else:
+            # FORCE REINSTALL OF SAME BUILD - MAKE IT SUPER OBVIOUS
+            import time
+            print()
+            print()
+            print("\033[1;33m" + "╔" + "=" * 78 + "╗" + "\033[0m")
+            print("\033[1;33m" + "║" + " " * 78 + "║" + "\033[0m")
+            print("\033[1;33m" + "║" + "  ⚠️ ⚠️ ⚠️  REINSTALLING THE EXACT SAME BUILD  ⚠️ ⚠️ ⚠️".ljust(78) + "║" + "\033[0m")
+            print("\033[1;33m" + "║" + " " * 78 + "║" + "\033[0m")
+            print("\033[1;33m" + "╚" + "=" * 78 + "╝" + "\033[0m")
+            print()
+            print("\033[1;33m" + "⚠️  WARNING: You are using --force to reinstall the SAME build!" + "\033[0m")
+            print()
+            print(f"   Currently installed:")
+            print(f"     Version: {deployed_version}")
+            print(f"     Hash:    {deployed_hash}")
+            print()
+            print(f"   Package to install:")
+            print(f"     Version: {package_version}")
+            print(f"     Hash:    {package_hash}")
+            print()
+            print("\033[1;33m" + "   👉 THIS IS THE EXACT SAME BUILD (version AND hash match)" + "\033[0m")
+            print()
+            print("   This will:")
+            print("     • Kill and restart all services")
+            print("     • Copy the exact same files over existing files")
+            print("     • Take 2-3 minutes to complete")
+            print()
+            print("   Common reasons to do this:")
+            print("     • Testing deployment process")
+            print("     • Services are broken and need clean restart")
+            print("     • Files were manually modified and need to be restored")
+            print()
+            for i in range(10, 0, -1):
+                print(f"\r   ⏳ Starting reinstall in {i} seconds... (Ctrl+C to abort)", end='', flush=True)
+                time.sleep(1)
+            print()
+            print()
+            print("\033[1;32m" + "▶️  Proceeding with reinstall..." + "\033[0m")
+            print()

     # Find node-install.sh
     install_script = None
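The skip-versus-reinstall decision above reduces to two booleans. A condensed, hypothetical sketch of just that logic (the values are illustrative; in featrix-update.py they come from the package metadata and from /sphere/app/VERSION and /sphere/app/VERSION_HASH):

```python
# Hypothetical deployed/package values for illustration only.
deployed_version, deployed_hash = "0.2.2280", "abc12345"
package_version, package_hash = "0.2.2280", "abc12345"
force = False

same_hash = bool(deployed_hash) and package_hash != "unknown" and package_hash == deployed_hash
same_version = bool(deployed_version) and package_version != "unknown" and package_version == deployed_version

if same_hash and same_version and not force:
    print("Already deployed - skipping")         # the common fast path
elif same_hash and same_version and force:
    print("Forced reinstall of the same build")  # triggers the 10-second countdown
else:
    print("Proceeding with install")             # new version or new hash
```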
@@ -306,10 +356,14 @@ def install_package(package_file: Path, force: bool = False) -> bool:
     # 1. We've already done version checking in featrix-update.py
     # 2. The package is a specific version we want to install
     # 3. node-install.sh checks git state from /home/mitch/sphere which may not match the package
+
+    # node-install.sh REQUIRES root (checked at line 326)
+    # It calls 'sbit fix-permissions' which needs sbit to have setuid bit
+    # The install script itself must run as root
     cmd = ["sudo", str(install_script), "--force"]
     print(f"   Using --force flag (installing from package)")

-    # Run the install script
+    # Run the install script
     result = subprocess.run(
         cmd,
         check=True,
@@ -440,12 +494,19 @@ def main():
         print(f"\n✅ Update available: {current_version} → {newest_version}")
         should_update = True
     elif comparison == 0:
-
-
-
+        # Same version - check if hash is different (newer build of same version)
+        current_hash = get_current_version_hash()
+        if current_hash and newest_hash and current_hash != newest_hash:
+            print(f"\n✅ Newer build available: {current_version} ({current_hash} → {newest_hash[:8]})")
+            print(f"   Same version number but different hash (newer build)")
             should_update = True
         else:
-
+            print(f"\n✅ Already on latest version: {current_version}")
+            if args.force:
+                print("   --force flag set, will reinstall anyway")
+                should_update = True
+            else:
+                should_update = False
     else:
         print(f"\n⚠️ Current version ({current_version}) is newer than available ({newest_version})")
         if args.force:
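Condensed, the main() branches above form a small decision tree. A hypothetical sketch (the helper name `decide_update` and the sign convention of `comparison` are illustrative assumptions, not the package's API; only the branch structure mirrors the diff):

```python
def decide_update(comparison: int, current_hash: str, newest_hash: str, force: bool) -> bool:
    """Sketch of the update decision; assumes comparison > 0 means a newer version exists."""
    if comparison > 0:
        return True                      # newer version number available
    if comparison == 0:
        if current_hash and newest_hash and current_hash != newest_hash:
            return True                  # same version, different hash: newer build
        return force                     # already latest; reinstall only with --force
    return force                         # current is newer than available
```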
{featrixsphere-0.2.1830 → featrixsphere-0.2.2280}/featrixsphere/client.py
@@ -565,7 +565,7 @@ class FeatrixSphereClient:
         """Make a DELETE request and return JSON response."""
         response = self._make_request("DELETE", endpoint, max_retries=max_retries, **kwargs)
         return self._unwrap_response(response.json())
-
+
     # =========================================================================
     # Session Management
     # =========================================================================
@@ -893,6 +893,113 @@ class FeatrixSphereClient:
         """
         response_data = self._post_json(f"/compute/session/{session_id}/unpublish", {})
         return response_data
+
+    def publish_partial_foundation(
+        self,
+        source_session_id: str,
+        name: str,
+        checkpoint_epoch: int = None,
+        session_name_prefix: str = None,
+        publish: bool = True,
+        verbose: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Publish a checkpoint from in-progress training as a standalone foundation model.
+
+        Takes a checkpoint from ongoing ES training and creates a NEW foundation model
+        session with full provenance tracking. Perfect for snapshotting good intermediate
+        models while training continues.
+
+        The new foundation model can be used with:
+        - train_on_foundational_model() - Train predictors on it
+        - Any standard foundation model operations
+        - Available across all compute nodes via backplane
+
+        Args:
+            source_session_id: Session with ES training (in-progress or completed)
+            name: Name for the new foundation model (REQUIRED)
+            checkpoint_epoch: Which epoch checkpoint to use (None = best/latest)
+            session_name_prefix: Optional prefix for new session ID
+            publish: Move to /sphere/published/ directory (default: True)
+            verbose: Print status updates
+
+        Returns:
+            dict with:
+            - foundation_session_id: New foundation session ID
+            - checkpoint_epoch: Epoch used
+            - provenance: Full metadata about source and training progress
+            - published_path: Path if published
+
+        Example:
+            ```python
+            # Snapshot epoch 50 as foundation v0.5 while training continues
+            result = client.publish_partial_foundation(
+                source_session_id="abc-123",
+                name="My Foundation v0.5",
+                checkpoint_epoch=50,
+                session_name_prefix="foundation-v0.5",
+                publish=True
+            )
+
+            foundation_id = result['foundation_session_id']
+            print(f"Published foundation: {foundation_id}")
+            print(f"Source was {result['provenance']['training_progress_percent']}% trained")
+
+            # Use immediately like any foundation model
+            client.train_on_foundational_model(
+                foundation_model_id=foundation_id,
+                target_column="price",
+                target_column_type="scalar"
+            )
+            # Available on all compute nodes automatically via backplane
+            ```
+        """
+        if verbose:
+            print(f"📦 Publishing partial foundation from {source_session_id}")
+            print(f"   Name: {name}")
+            if checkpoint_epoch is not None:
+                print(f"   Checkpoint epoch: {checkpoint_epoch}")
+            else:
+                print(f"   Checkpoint epoch: best/latest available")
+            print(f"   Publish to /sphere/published/: {publish}")
+
+        data = {
+            'name': name,
+            'publish': publish
+        }
+
+        if checkpoint_epoch is not None:
+            data['checkpoint_epoch'] = checkpoint_epoch
+        if session_name_prefix:
+            data['session_name_prefix'] = session_name_prefix
+
+        try:
+            response_data = self._post_json(
+                f"/compute/session/{source_session_id}/publish_partial_foundation",
+                data
+            )
+
+            foundation_id = response_data.get('foundation_session_id')
+            checkpoint_used = response_data.get('checkpoint_epoch')
+            provenance = response_data.get('provenance', {})
+
+            if verbose:
+                print(f"✅ {response_data.get('message')}")
+                print(f"   Foundation session ID: {foundation_id}")
+                print(f"   Checkpoint epoch: {checkpoint_used}")
+                if provenance.get('training_progress_percent'):
+                    print(f"   Source training progress: {provenance['training_progress_percent']}%")
+                if provenance.get('validation_loss_at_checkpoint'):
+                    print(f"   Val loss at checkpoint: {provenance['validation_loss_at_checkpoint']:.4f}")
+                if response_data.get('published_path'):
+                    print(f"   Published to: {response_data['published_path']}")
+
+            return response_data
+
+        except Exception as e:
+            if verbose:
+                print(f"❌ Error publishing partial foundation: {e}")
+            raise

     def get_sessions_for_org(self, name_prefix: str, max_retries: int = None) -> Dict[str, Any]:
         """
@@ -1974,9 +2081,30 @@ class FeatrixSphereClient:
         - Category split: Use full data for ES, specific categories for predictor
         - Label completeness: Include unlabeled rows in ES, exclude from predictor
         - Test/holdout: Keep test data in ES context but exclude from predictor training
+
+        Special Input: Dictionary of Datasets
+        --------------------------------------
+        You can pass a dictionary of datasets instead of a single DataFrame. Each key is a dataset name,
+        and each value is a list of DataFrames/tables to include in that dataset.
+
+        When using this format:
+        - A __featrix_dataset_name column is automatically added to track which dataset each row came from
+        - All tables from all datasets are concatenated into a single DataFrame before upload
+        - The concatenated DataFrame is uploaded as normal
+
+        Example - Upload multiple datasets with labels:
+
+            datasets = {
+                'training_data': [df1, df2, df3],
+                'validation_data': [df4, df5],
+                'test_data': [df6]
+            }
+
+            session = client.upload_df_and_create_session(df=datasets)
+            # Uploads a single DataFrame with __featrix_dataset_name column indicating source

         Args:
-            df: pandas DataFrame to upload (optional if file_path is provided)
+            df: pandas DataFrame OR dict of {dataset_name: [DataFrames]} to upload (optional if file_path is provided)
             filename: Name to give the uploaded file (default: "data.csv")
             file_path: Path to CSV, Parquet, JSON, or JSONL file to upload (optional if df is provided)
             column_overrides: Dict mapping column names to types ("scalar", "set", "free_string", "free_string_list")
@@ -2005,6 +2133,80 @@ class FeatrixSphereClient:
         if column_types is not None:
             column_overrides = column_types

+        # Handle dictionary of datasets input
+        if df is not None and isinstance(df, dict):
+            print("Detected dictionary of datasets - concatenating with __featrix_dataset_name labels")
+            all_dataframes = []
+            total_rows = 0
+
+            for dataset_name, tables in df.items():
+                if not isinstance(tables, list):
+                    raise ValueError(f"Value for dataset '{dataset_name}' must be a list of DataFrames/file paths, got {type(tables)}")
+
+                for i, table in enumerate(tables):
+                    # Handle file path (string)
+                    if isinstance(table, str):
+                        file_path_to_load = str(table)
+
+                        if not os.path.exists(file_path_to_load):
+                            raise FileNotFoundError(f"File not found in dataset '{dataset_name}': {file_path_to_load}")
+
+                        # Determine file type and load
+                        file_ext = file_path_to_load.lower()
+                        print(f"   - {dataset_name} loading file: {os.path.basename(file_path_to_load)}")
+
+                        if file_ext.endswith('.parquet'):
+                            loaded_df = pd.read_parquet(file_path_to_load)
+                        elif file_ext.endswith(('.json', '.jsonl')):
+                            try:
+                                from featrix.neural.input_data_file import featrix_wrap_read_json_file
+                                loaded_df = featrix_wrap_read_json_file(file_path_to_load)
+                                if loaded_df is None:
+                                    raise ValueError(f"Failed to parse {'JSONL' if file_ext.endswith('.jsonl') else 'JSON'} file")
+                            except ImportError:
+                                # Fallback to pandas
+                                if file_ext.endswith('.jsonl'):
+                                    import json
+                                    records = []
+                                    with open(file_path_to_load, 'r', encoding='utf-8') as f:
+                                        for line in f:
+                                            if line.strip():
+                                                records.append(json.loads(line))
+                                    loaded_df = pd.DataFrame(records)
+                                else:
+                                    loaded_df = pd.read_json(file_path_to_load)
+                        elif file_ext.endswith(('.csv', '.csv.gz')):
+                            loaded_df = pd.read_csv(file_path_to_load)
+                        else:
+                            raise ValueError(f"Unsupported file type in dataset '{dataset_name}': {file_path_to_load}. "
+                                             f"Supported: .csv, .csv.gz, .parquet, .json, .jsonl")
+
+                        labeled_table = loaded_df
+                        print(f"     Loaded {len(loaded_df)} rows, {len(loaded_df.columns)} columns")
+
+                    # Handle DataFrame
+                    elif isinstance(table, pd.DataFrame):
+                        # Create a copy to avoid modifying the original
+                        labeled_table = table.copy()
+                        print(f"   - {dataset_name} DataFrame {i+1}: {len(labeled_table)} rows, {len(labeled_table.columns)} columns")
+
+                    else:
+                        raise ValueError(f"Table {i} in dataset '{dataset_name}' must be a pandas DataFrame or file path (str), got {type(table)}")
+
+                    # Add the dataset name label column
+                    labeled_table['__featrix_dataset_name'] = dataset_name
+
+                    all_dataframes.append(labeled_table)
+                    total_rows += len(labeled_table)
+
+            if not all_dataframes:
+                raise ValueError("No DataFrames found in the provided dictionary")
+
+            # Concatenate all dataframes
+            print(f"Concatenating {len(all_dataframes)} tables from {len(df)} datasets ({total_rows} total rows)")
+            df = pd.concat(all_dataframes, ignore_index=True)
+            print(f"Combined DataFrame: {len(df)} rows, {len(df.columns)} columns (includes __featrix_dataset_name)")
+
         # Validate inputs
         if df is None and file_path is None:
             raise ValueError("Either df or file_path must be provided")
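The core of the multi-dataset path is the labeling-and-concat step. A standalone sketch of that behavior with tiny in-memory frames (hypothetical data, not from the package):

```python
import pandas as pd

datasets = {
    "training_data": [pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": [3]})],
    "test_data": [pd.DataFrame({"x": [4]})],
}

labeled = []
for dataset_name, tables in datasets.items():
    for table in tables:
        t = table.copy()                            # avoid mutating caller frames
        t["__featrix_dataset_name"] = dataset_name  # provenance label per row
        labeled.append(t)

combined = pd.concat(labeled, ignore_index=True)
print(combined)
# output (approximately):
#    x __featrix_dataset_name
# 0  1          training_data
# 1  2          training_data
# 2  3          training_data
# 3  4              test_data
```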
@@ -4236,25 +4438,21 @@ class FeatrixSphereClient:
     def clone_in_progress_embedding_space(self, session_id: str, from_compute: str, to_compute: str,
                                           es_id: str = None, new_session_name: str = None) -> Dict[str, Any]:
         """
-        Clone
+        INTERNAL: Clone embedding space between compute nodes.

-
-
-
+        Note: With the backplane system, users generally don't need to manually clone.
+        Sessions are automatically available across all compute nodes.
+        This method is kept for backward compatibility and special cases.

         Args:
-            session_id: Source session ID
-            from_compute: Source
-            to_compute: Destination
-            es_id: Optional ES ID
-            new_session_name: Optional name for
+            session_id: Source session ID
+            from_compute: Source node name
+            to_compute: Destination node name
+            es_id: Optional ES ID (if session has multiple)
+            new_session_name: Optional name for cloned session

         Returns:
-            Dict with new_session_id
-
-        Raises:
-            ValueError: If multiple ES found in session and es_id not provided
-            HTTPException: If cloning fails
+            Dict with new_session_id
         """
         # Prepare request data
         request_data = {
@@ -4322,40 +4520,6 @@ class FeatrixSphereClient:
         print(f"Training predictor on foundation model {foundation_model_id}...")
         print(f"   Target: {target_column} ({target_column_type})")

-        # Get the compute cluster from the foundation model session
-        # This ensures we upload files to the same node where the foundation model lives
-        # If the foundation session doesn't exist (404), we'll proceed with current compute cluster
-        foundation_compute_cluster = None
-        try:
-            foundation_session = self.get_session_status(foundation_model_id)
-            foundation_compute_cluster = self.get_last_server_metadata()
-            foundation_compute_cluster = foundation_compute_cluster.get('compute_cluster') if foundation_compute_cluster else None
-        except Exception as e:
-            # Foundation session might not exist or be accessible - that's okay
-            # The server will validate it when we submit the training request
-            if verbose:
-                # Check if it's a 404 HTTP error
-                is_404 = False
-                if isinstance(e, requests.exceptions.HTTPError):
-                    if hasattr(e, 'response') and e.response.status_code == 404:
-                        is_404 = True
-
-                if is_404:
-                    print(f"   ⚠️ Foundation session not found (404) - will use current compute cluster")
-                    print(f"   Server will validate foundation model when training starts")
-                else:
-                    print(f"   ⚠️ Could not fetch foundation session: {e}")
-                    print(f"   Will proceed with current compute cluster")
-
-        # Temporarily set compute cluster for file uploads if we found one
-        original_compute_cluster = self.compute_cluster
-        original_headers = self.session.headers.copy()
-        if foundation_compute_cluster:
-            self.set_compute_cluster(foundation_compute_cluster)
-            if verbose:
-                print(f"   Using compute cluster: {foundation_compute_cluster}")
-        elif verbose and self.compute_cluster:
-            print(f"   Using current compute cluster: {self.compute_cluster}")

         try:
             # Validate that only one data source is provided
@@ -4453,20 +4617,9 @@ class FeatrixSphereClient:

             new_session_id = response_data.get('session_id')
             print(f"✅ Predictor training session created: {new_session_id}")
-
-
-
-            if original_compute_cluster:
-                self.set_compute_cluster(original_compute_cluster)
-            else:
-                self.session.headers = original_headers
-        finally:
-            # Ensure we restore headers even if there's an error
-            if original_compute_cluster != self.compute_cluster:
-                if original_compute_cluster:
-                    self.set_compute_cluster(original_compute_cluster)
-                else:
-                    self.session.headers = original_headers
+
+        except Exception as e:
+            raise

         if verbose:
             print(f"⏳ Waiting for training to complete...")
@@ -4854,14 +5007,44 @@ class FeatrixSphereClient:
         The system handles the hard decisions so you can focus on your problem, not
         hyperparameter tuning.

+        MULTI-DATASET INPUT (NEW):
+        ---------------------------
+        You can now pass a dictionary of datasets for the `df` parameter, just like in
+        upload_df_and_create_session(). This is useful when combining multiple sources
+        for predictor training:
+
+        ```python
+        # Train predictor on multiple datasets with labels
+        training_data = {
+            'extra_rows_from_matt': ['matt_supplement.csv', 'matt_additions.parquet'],
+            'main_training': [df1, df2, 'training.csv'],
+            'validation_samples': ['validation.csv']
+        }
+
+        result = client.train_single_predictor(
+            session_id=session.session_id,
+            df=training_data,  # Dictionary of datasets
+            target_column='outcome',
+            target_column_type='set'
+        )
+        ```
+
+        When using dictionary format:
+        - Each key is a dataset name (e.g., 'extra_rows_from_matt')
+        - Each value is a list of DataFrames and/or file paths
+        - A __featrix_dataset_name column is automatically added
+        - All tables are concatenated before training
+        - Works with all file types: CSV, Parquet, JSON, JSONL
+
         Args:
             session_id: ID of session with trained embedding space
             target_column: Name of the target column to predict
             target_column_type: Type of target column ("set" or "scalar")
             file_path: Path to DIFFERENT training file (CSV or .csv.gz) to use for predictor training.
-            df: pandas DataFrame
-                Use file_path OR df (not both) to train
-                embedding space! If neither provided, uses
+            df: pandas DataFrame OR dict of {dataset_name: [DataFrames/file paths]} with DIFFERENT
+                training data to use for predictor training. Use file_path OR df (not both) to train
+                predictor on different data than your embedding space! If neither provided, uses
+                session's original data file.
             epochs: Number of training epochs (default: 0; automatic)
             validation_ignore_columns: List of column names to exclude from validation queries (default: None)
             rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
@@ -4882,6 +5065,86 @@ class FeatrixSphereClient:
         if file_path and df is not None:
             raise ValueError("Provide either file_path or df, not both")

+        # Handle dictionary of datasets input (same as upload_df_and_create_session)
+        if df is not None and isinstance(df, dict):
+            if verbose:
+                print("Detected dictionary of datasets - concatenating with __featrix_dataset_name labels")
+            all_dataframes = []
+            total_rows = 0
+
+            for dataset_name, tables in df.items():
+                if not isinstance(tables, list):
+                    raise ValueError(f"Value for dataset '{dataset_name}' must be a list of DataFrames/file paths, got {type(tables)}")
+
+                for i, table in enumerate(tables):
+                    # Handle file path (string)
+                    if isinstance(table, str):
+                        file_path_to_load = str(table)
+
+                        if not os.path.exists(file_path_to_load):
+                            raise FileNotFoundError(f"File not found in dataset '{dataset_name}': {file_path_to_load}")
+
+                        # Determine file type and load
+                        file_ext = file_path_to_load.lower()
+                        if verbose:
+                            print(f"   - {dataset_name} loading file: {os.path.basename(file_path_to_load)}")
+
+                        if file_ext.endswith('.parquet'):
+                            loaded_df = pd.read_parquet(file_path_to_load)
+                        elif file_ext.endswith(('.json', '.jsonl')):
+                            try:
+                                from featrix.neural.input_data_file import featrix_wrap_read_json_file
+                                loaded_df = featrix_wrap_read_json_file(file_path_to_load)
+                                if loaded_df is None:
+                                    raise ValueError(f"Failed to parse {'JSONL' if file_ext.endswith('.jsonl') else 'JSON'} file")
+                            except ImportError:
+                                # Fallback to pandas
+                                if file_ext.endswith('.jsonl'):
+                                    import json
+                                    records = []
+                                    with open(file_path_to_load, 'r', encoding='utf-8') as f:
+                                        for line in f:
+                                            if line.strip():
+                                                records.append(json.loads(line))
+                                    loaded_df = pd.DataFrame(records)
+                                else:
+                                    loaded_df = pd.read_json(file_path_to_load)
+                        elif file_ext.endswith(('.csv', '.csv.gz')):
+                            loaded_df = pd.read_csv(file_path_to_load)
+                        else:
+                            raise ValueError(f"Unsupported file type in dataset '{dataset_name}': {file_path_to_load}. "
+                                             f"Supported: .csv, .csv.gz, .parquet, .json, .jsonl")
+
+                        labeled_table = loaded_df
+                        if verbose:
+                            print(f"     Loaded {len(loaded_df)} rows, {len(loaded_df.columns)} columns")
+
+                    # Handle DataFrame
+                    elif isinstance(table, pd.DataFrame):
+                        # Create a copy to avoid modifying the original
+                        labeled_table = table.copy()
+                        if verbose:
+                            print(f"   - {dataset_name} DataFrame {i+1}: {len(labeled_table)} rows, {len(labeled_table.columns)} columns")
+
+                    else:
+                        raise ValueError(f"Table {i} in dataset '{dataset_name}' must be a pandas DataFrame or file path (str), got {type(table)}")
+
+                    # Add the dataset name label column
+                    labeled_table['__featrix_dataset_name'] = dataset_name
+
+                    all_dataframes.append(labeled_table)
+                    total_rows += len(labeled_table)
+
+            if not all_dataframes:
+                raise ValueError("No DataFrames found in the provided dictionary")
+
+            # Concatenate all dataframes
+            if verbose:
+                print(f"Concatenating {len(all_dataframes)} tables from {len(df)} datasets ({total_rows} total rows)")
+            df = pd.concat(all_dataframes, ignore_index=True)
+            if verbose:
+                print(f"Combined DataFrame: {len(df)} rows, {len(df.columns)} columns (includes __featrix_dataset_name)")
+
         # Validate cost parameters
         if cost_false_positive is not None or cost_false_negative is not None:
             if cost_false_positive is None or cost_false_negative is None:
@@ -5079,7 +5342,7 @@ class FeatrixSphereClient:
         Extend embedding space training with new data.

        This function:
-        1.
+        1. Creates a new session with the existing embedding space
         2. Uploads/processes the new data
         3. Continues training from where the previous training left off
         4. Trains for the specified number of additional epochs (data_passes)