featrixsphere 0.2.2279__py3-none-any.whl → 0.2.2280__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featrixsphere/__init__.py +1 -1
- featrixsphere/client.py +113 -3
- {featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.2280.dist-info}/METADATA +1 -1
- featrixsphere-0.2.2280.dist-info/RECORD +8 -0
- featrixsphere-0.2.2279.dist-info/RECORD +0 -8
- {featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.2280.dist-info}/WHEEL +0 -0
- {featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.2280.dist-info}/entry_points.txt +0 -0
- {featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.2280.dist-info}/top_level.txt +0 -0
featrixsphere/__init__.py
CHANGED
featrixsphere/client.py
CHANGED
@@ -5007,14 +5007,44 @@ class FeatrixSphereClient:
         The system handles the hard decisions so you can focus on your problem, not
         hyperparameter tuning.

+        MULTI-DATASET INPUT (NEW):
+        ---------------------------
+        You can now pass a dictionary of datasets for the `df` parameter, just like in
+        upload_df_and_create_session(). This is useful when combining multiple sources
+        for predictor training:
+
+        ```python
+        # Train predictor on multiple datasets with labels
+        training_data = {
+            'extra_rows_from_matt': ['matt_supplement.csv', 'matt_additions.parquet'],
+            'main_training': [df1, df2, 'training.csv'],
+            'validation_samples': ['validation.csv']
+        }
+
+        result = client.train_single_predictor(
+            session_id=session.session_id,
+            df=training_data,  # Dictionary of datasets
+            target_column='outcome',
+            target_column_type='set'
+        )
+        ```
+
+        When using dictionary format:
+        - Each key is a dataset name (e.g., 'extra_rows_from_matt')
+        - Each value is a list of DataFrames and/or file paths
+        - A __featrix_dataset_name column is automatically added
+        - All tables are concatenated before training
+        - Works with all file types: CSV, Parquet, JSON, JSONL
+
         Args:
             session_id: ID of session with trained embedding space
             target_column: Name of the target column to predict
             target_column_type: Type of target column ("set" or "scalar")
             file_path: Path to DIFFERENT training file (CSV or .csv.gz) to use for predictor training.
-            df: pandas DataFrame
-                Use file_path OR df (not both) to train
-                embedding space! If neither provided, uses
+            df: pandas DataFrame OR dict of {dataset_name: [DataFrames/file paths]} with DIFFERENT
+                training data to use for predictor training. Use file_path OR df (not both) to train
+                predictor on different data than your embedding space! If neither provided, uses
+                session's original data file.
             epochs: Number of training epochs (default: 0; automatic)
             validation_ignore_columns: List of column names to exclude from validation queries (default: None)
             rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
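The docstring hunk above describes the dict input in terms of labeling and concatenation. As a minimal sketch of what the combined training frame ends up looking like, using plain pandas and hypothetical toy data rather than code from the package:

```python
import pandas as pd

# Hypothetical toy tables; the column names are illustrative only.
main = pd.DataFrame({'feature': [1, 2], 'outcome': ['a', 'b']})
extra = pd.DataFrame({'feature': [3], 'outcome': ['a']})

# Label each table with its dataset name, then concatenate, mirroring
# the behavior the docstring describes for dict-valued `df`.
labeled = []
for name, frames in {'main_training': [main], 'extra_rows': [extra]}.items():
    for frame in frames:
        frame = frame.copy()
        frame['__featrix_dataset_name'] = name
        labeled.append(frame)

combined = pd.concat(labeled, ignore_index=True)
# combined now has 3 rows and a __featrix_dataset_name column whose values
# are 'main_training', 'main_training', 'extra_rows'.
```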
@@ -5035,6 +5065,86 @@ class FeatrixSphereClient:
         if file_path and df is not None:
             raise ValueError("Provide either file_path or df, not both")

+        # Handle dictionary of datasets input (same as upload_df_and_create_session)
+        if df is not None and isinstance(df, dict):
+            if verbose:
+                print("Detected dictionary of datasets - concatenating with __featrix_dataset_name labels")
+            all_dataframes = []
+            total_rows = 0
+
+            for dataset_name, tables in df.items():
+                if not isinstance(tables, list):
+                    raise ValueError(f"Value for dataset '{dataset_name}' must be a list of DataFrames/file paths, got {type(tables)}")
+
+                for i, table in enumerate(tables):
+                    # Handle file path (string)
+                    if isinstance(table, str):
+                        file_path_to_load = str(table)
+
+                        if not os.path.exists(file_path_to_load):
+                            raise FileNotFoundError(f"File not found in dataset '{dataset_name}': {file_path_to_load}")
+
+                        # Determine file type and load
+                        file_ext = file_path_to_load.lower()
+                        if verbose:
+                            print(f"  - {dataset_name} loading file: {os.path.basename(file_path_to_load)}")
+
+                        if file_ext.endswith('.parquet'):
+                            loaded_df = pd.read_parquet(file_path_to_load)
+                        elif file_ext.endswith(('.json', '.jsonl')):
+                            try:
+                                from featrix.neural.input_data_file import featrix_wrap_read_json_file
+                                loaded_df = featrix_wrap_read_json_file(file_path_to_load)
+                                if loaded_df is None:
+                                    raise ValueError(f"Failed to parse {'JSONL' if file_ext.endswith('.jsonl') else 'JSON'} file")
+                            except ImportError:
+                                # Fallback to pandas
+                                if file_ext.endswith('.jsonl'):
+                                    import json
+                                    records = []
+                                    with open(file_path_to_load, 'r', encoding='utf-8') as f:
+                                        for line in f:
+                                            if line.strip():
+                                                records.append(json.loads(line))
+                                    loaded_df = pd.DataFrame(records)
+                                else:
+                                    loaded_df = pd.read_json(file_path_to_load)
+                        elif file_ext.endswith(('.csv', '.csv.gz')):
+                            loaded_df = pd.read_csv(file_path_to_load)
+                        else:
+                            raise ValueError(f"Unsupported file type in dataset '{dataset_name}': {file_path_to_load}. "
+                                             f"Supported: .csv, .csv.gz, .parquet, .json, .jsonl")
+
+                        labeled_table = loaded_df
+                        if verbose:
+                            print(f"    Loaded {len(loaded_df)} rows, {len(loaded_df.columns)} columns")
+
+                    # Handle DataFrame
+                    elif isinstance(table, pd.DataFrame):
+                        # Create a copy to avoid modifying the original
+                        labeled_table = table.copy()
+                        if verbose:
+                            print(f"  - {dataset_name} DataFrame {i+1}: {len(labeled_table)} rows, {len(labeled_table.columns)} columns")
+
+                    else:
+                        raise ValueError(f"Table {i} in dataset '{dataset_name}' must be a pandas DataFrame or file path (str), got {type(table)}")
+
+                    # Add the dataset name label column
+                    labeled_table['__featrix_dataset_name'] = dataset_name
+
+                    all_dataframes.append(labeled_table)
+                    total_rows += len(labeled_table)
+
+            if not all_dataframes:
+                raise ValueError("No DataFrames found in the provided dictionary")
+
+            # Concatenate all dataframes
+            if verbose:
+                print(f"Concatenating {len(all_dataframes)} tables from {len(df)} datasets ({total_rows} total rows)")
+            df = pd.concat(all_dataframes, ignore_index=True)
+            if verbose:
+                print(f"Combined DataFrame: {len(df)} rows, {len(df.columns)} columns (includes __featrix_dataset_name)")
+
         # Validate cost parameters
         if cost_false_positive is not None or cost_false_negative is not None:
             if cost_false_positive is None or cost_false_negative is None:
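One detail worth noting in the hunk above: when the internal featrix_wrap_read_json_file helper is unavailable, JSONL files are parsed line by line with json.loads. A minimal standalone sketch of that fallback, plus the single-call pandas equivalent (the file name is hypothetical):

```python
import json
import pandas as pd

def load_jsonl(path: str) -> pd.DataFrame:
    """Line-by-line JSONL reader, equivalent to the fallback loop above."""
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():  # skip blank lines, as the client code does
                records.append(json.loads(line))
    return pd.DataFrame(records)

# pandas can also read JSON Lines in one call:
# df = pd.read_json('events.jsonl', lines=True)
```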
featrixsphere-0.2.2280.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+featrixsphere/__init__.py,sha256=1f94O-3V4hG42-5IfPSRDSLuySPvxLXTaLfGnuibzto,1888
+featrixsphere/client.py,sha256=jCf6_zv7DhG6OWv8dyCo8dMs0qm9uFjHlH4uPTfUqDU,414594
+featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
+featrixsphere-0.2.2280.dist-info/METADATA,sha256=6KTfLXB6DPxNmGoGkzVe6FcecIk4sfIaIhWpcLTAptM,16232
+featrixsphere-0.2.2280.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+featrixsphere-0.2.2280.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
+featrixsphere-0.2.2280.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
+featrixsphere-0.2.2280.dist-info/RECORD,,
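Each RECORD entry has the form path,sha256=&lt;digest&gt;,&lt;size&gt;, where the digest is the urlsafe-base64-encoded SHA-256 of the file with '=' padding stripped (the standard wheel RECORD format). A minimal sketch of checking one entry against an unpacked wheel; the local path is an assumption:

```python
import base64
import hashlib
import os

def record_digest(path: str) -> str:
    """RECORD-style digest: urlsafe base64 of the SHA-256, '=' padding stripped."""
    with open(path, 'rb') as f:
        raw = hashlib.sha256(f.read()).digest()
    return base64.urlsafe_b64encode(raw).rstrip(b'=').decode('ascii')

# Check the client.py entry from the RECORD above; assumes the 0.2.2280
# wheel has been unpacked into the current directory.
assert record_digest('featrixsphere/client.py') == 'jCf6_zv7DhG6OWv8dyCo8dMs0qm9uFjHlH4uPTfUqDU'
assert os.path.getsize('featrixsphere/client.py') == 414594
```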
featrixsphere-0.2.2279.dist-info/RECORD
REMOVED
@@ -1,8 +0,0 @@
-featrixsphere/__init__.py,sha256=CdXrzec_MfZTUpRyajYawHBu5iEWAe-flVrlXjiLLN8,1888
-featrixsphere/client.py,sha256=ez8ML3OMHMMO0ewzOGNgF1tg3KYygLG2NbEw33sf3AA,408591
-featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
-featrixsphere-0.2.2279.dist-info/METADATA,sha256=2hEITDtsSYmMLgl0RfqvcBzac4EBAH3DN4TSVf_1rPY,16232
-featrixsphere-0.2.2279.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-featrixsphere-0.2.2279.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
-featrixsphere-0.2.2279.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
-featrixsphere-0.2.2279.dist-info/RECORD,,
{featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.2280.dist-info}/WHEEL
File without changes
{featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.2280.dist-info}/entry_points.txt
File without changes
{featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.2280.dist-info}/top_level.txt
File without changes