featrixsphere 0.2.2279__py3-none-any.whl → 0.2.2280__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
featrixsphere/__init__.py CHANGED
@@ -38,7 +38,7 @@ Example:
38
38
  ... labels=['Experiment A', 'Experiment B'])
39
39
  """
40
40
 
41
- __version__ = "0.2.2279"
41
+ __version__ = "0.2.2280"
42
42
  __author__ = "Featrix"
43
43
  __email__ = "support@featrix.com"
44
44
  __license__ = "MIT"
featrixsphere/client.py CHANGED
@@ -5007,14 +5007,44 @@ class FeatrixSphereClient:
5007
5007
  The system handles the hard decisions so you can focus on your problem, not
5008
5008
  hyperparameter tuning.
5009
5009
 
5010
+ MULTI-DATASET INPUT (NEW):
5011
+ ---------------------------
5012
+ You can now pass a dictionary of datasets as the `df` parameter, just like in
5013
+ upload_df_and_create_session(). This is useful when combining multiple sources
5014
+ for predictor training:
5015
+
5016
+ ```python
5017
+ # Train predictor on multiple datasets with labels
5018
+ training_data = {
5019
+ 'extra_rows_from_matt': ['matt_supplement.csv', 'matt_additions.parquet'],
5020
+ 'main_training': [df1, df2, 'training.csv'],
5021
+ 'validation_samples': ['validation.csv']
5022
+ }
5023
+
5024
+ result = client.train_single_predictor(
5025
+ session_id=session.session_id,
5026
+ df=training_data, # Dictionary of datasets
5027
+ target_column='outcome',
5028
+ target_column_type='set'
5029
+ )
5030
+ ```
5031
+
5032
+ When using the dictionary format:
5033
+ - Each key is a dataset name (e.g., 'extra_rows_from_matt')
5034
+ - Each value is a list of DataFrames and/or file paths
5035
+ - A __featrix_dataset_name column is automatically added to record each row's source dataset
5036
+ - All tables are concatenated before training
5037
+ - Works with all file types: CSV, Parquet, JSON, JSONL
5038
+
5010
5039
  Args:
5011
5040
  session_id: ID of session with trained embedding space
5012
5041
  target_column: Name of the target column to predict
5013
5042
  target_column_type: Type of target column ("set" or "scalar")
5014
5043
  file_path: Path to DIFFERENT training file (CSV or .csv.gz) to use for predictor training.
5015
- df: pandas DataFrame with DIFFERENT training data to use for predictor training.
5016
- Use file_path OR df (not both) to train predictor on different data than your
5017
- embedding space! If neither provided, uses session's original data file.
5044
+ df: pandas DataFrame OR dict of {dataset_name: [DataFrames/file paths]} with DIFFERENT
5045
+ training data to use for predictor training. Use file_path OR df (not both) to train
5046
+ predictor on different data than your embedding space! If neither provided, uses
5047
+ session's original data file.
5018
5048
  epochs: Number of training epochs (default: 0; automatic)
5019
5049
  validation_ignore_columns: List of column names to exclude from validation queries (default: None)
5020
5050
  rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
@@ -5035,6 +5065,86 @@ class FeatrixSphereClient:
5035
5065
  if file_path and df is not None:
5036
5066
  raise ValueError("Provide either file_path or df, not both")
5037
5067
 
5068
+ # Handle dictionary of datasets input (same as upload_df_and_create_session)
5069
+ if df is not None and isinstance(df, dict):
5070
+ if verbose:
5071
+ print("Detected dictionary of datasets - concatenating with __featrix_dataset_name labels")
5072
+ all_dataframes = []
5073
+ total_rows = 0
5074
+
5075
+ for dataset_name, tables in df.items():
5076
+ if not isinstance(tables, list):
5077
+ raise ValueError(f"Value for dataset '{dataset_name}' must be a list of DataFrames/file paths, got {type(tables)}")
5078
+
5079
+ for i, table in enumerate(tables):
5080
+ # Handle file path (string)
5081
+ if isinstance(table, str):
5082
+ file_path_to_load = str(table)
5083
+
5084
+ if not os.path.exists(file_path_to_load):
5085
+ raise FileNotFoundError(f"File not found in dataset '{dataset_name}': {file_path_to_load}")
5086
+
5087
+ # Determine file type and load
5088
+ file_ext = file_path_to_load.lower()
5089
+ if verbose:
5090
+ print(f" - {dataset_name} loading file: {os.path.basename(file_path_to_load)}")
5091
+
5092
+ if file_ext.endswith('.parquet'):
5093
+ loaded_df = pd.read_parquet(file_path_to_load)
5094
+ elif file_ext.endswith(('.json', '.jsonl')):
5095
+ try:
5096
+ from featrix.neural.input_data_file import featrix_wrap_read_json_file
5097
+ loaded_df = featrix_wrap_read_json_file(file_path_to_load)
5098
+ if loaded_df is None:
5099
+ raise ValueError(f"Failed to parse {'JSONL' if file_ext.endswith('.jsonl') else 'JSON'} file")
5100
+ except ImportError:
5101
+ # Fallback to pandas
5102
+ if file_ext.endswith('.jsonl'):
5103
+ import json
5104
+ records = []
5105
+ with open(file_path_to_load, 'r', encoding='utf-8') as f:
5106
+ for line in f:
5107
+ if line.strip():
5108
+ records.append(json.loads(line))
5109
+ loaded_df = pd.DataFrame(records)
5110
+ else:
5111
+ loaded_df = pd.read_json(file_path_to_load)
5112
+ elif file_ext.endswith(('.csv', '.csv.gz')):
5113
+ loaded_df = pd.read_csv(file_path_to_load)
5114
+ else:
5115
+ raise ValueError(f"Unsupported file type in dataset '{dataset_name}': {file_path_to_load}. "
5116
+ f"Supported: .csv, .csv.gz, .parquet, .json, .jsonl")
5117
+
5118
+ labeled_table = loaded_df
5119
+ if verbose:
5120
+ print(f" Loaded {len(loaded_df)} rows, {len(loaded_df.columns)} columns")
5121
+
5122
+ # Handle DataFrame
5123
+ elif isinstance(table, pd.DataFrame):
5124
+ # Create a copy to avoid modifying the original
5125
+ labeled_table = table.copy()
5126
+ if verbose:
5127
+ print(f" - {dataset_name} DataFrame {i+1}: {len(labeled_table)} rows, {len(labeled_table.columns)} columns")
5128
+
5129
+ else:
5130
+ raise ValueError(f"Table {i} in dataset '{dataset_name}' must be a pandas DataFrame or file path (str), got {type(table)}")
5131
+
5132
+ # Add the dataset name label column
5133
+ labeled_table['__featrix_dataset_name'] = dataset_name
5134
+
5135
+ all_dataframes.append(labeled_table)
5136
+ total_rows += len(labeled_table)
5137
+
5138
+ if not all_dataframes:
5139
+ raise ValueError("No DataFrames found in the provided dictionary")
5140
+
5141
+ # Concatenate all dataframes
5142
+ if verbose:
5143
+ print(f"Concatenating {len(all_dataframes)} tables from {len(df)} datasets ({total_rows} total rows)")
5144
+ df = pd.concat(all_dataframes, ignore_index=True)
5145
+ if verbose:
5146
+ print(f"Combined DataFrame: {len(df)} rows, {len(df.columns)} columns (includes __featrix_dataset_name)")
5147
+
5038
5148
  # Validate cost parameters
5039
5149
  if cost_false_positive is not None or cost_false_negative is not None:
5040
5150
  if cost_false_positive is None or cost_false_negative is None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: featrixsphere
3
- Version: 0.2.2279
3
+ Version: 0.2.2280
4
4
  Summary: Transform any CSV into a production-ready ML model in minutes, not months.
5
5
  Home-page: https://github.com/Featrix/sphere
6
6
  Author: Featrix
@@ -0,0 +1,8 @@
1
+ featrixsphere/__init__.py,sha256=1f94O-3V4hG42-5IfPSRDSLuySPvxLXTaLfGnuibzto,1888
2
+ featrixsphere/client.py,sha256=jCf6_zv7DhG6OWv8dyCo8dMs0qm9uFjHlH4uPTfUqDU,414594
3
+ featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
4
+ featrixsphere-0.2.2280.dist-info/METADATA,sha256=6KTfLXB6DPxNmGoGkzVe6FcecIk4sfIaIhWpcLTAptM,16232
5
+ featrixsphere-0.2.2280.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
+ featrixsphere-0.2.2280.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
7
+ featrixsphere-0.2.2280.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
8
+ featrixsphere-0.2.2280.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- featrixsphere/__init__.py,sha256=CdXrzec_MfZTUpRyajYawHBu5iEWAe-flVrlXjiLLN8,1888
2
- featrixsphere/client.py,sha256=ez8ML3OMHMMO0ewzOGNgF1tg3KYygLG2NbEw33sf3AA,408591
3
- featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
4
- featrixsphere-0.2.2279.dist-info/METADATA,sha256=2hEITDtsSYmMLgl0RfqvcBzac4EBAH3DN4TSVf_1rPY,16232
5
- featrixsphere-0.2.2279.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
- featrixsphere-0.2.2279.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
7
- featrixsphere-0.2.2279.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
8
- featrixsphere-0.2.2279.dist-info/RECORD,,