featrixsphere 0.2.2279__py3-none-any.whl → 0.2.3611__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featrixsphere/__init__.py +1 -1
- featrixsphere/client.py +149 -21
- {featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.3611.dist-info}/METADATA +1 -1
- featrixsphere-0.2.3611.dist-info/RECORD +8 -0
- featrixsphere-0.2.2279.dist-info/RECORD +0 -8
- {featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.3611.dist-info}/WHEEL +0 -0
- {featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.3611.dist-info}/entry_points.txt +0 -0
- {featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.3611.dist-info}/top_level.txt +0 -0
featrixsphere/__init__.py CHANGED
featrixsphere/client.py CHANGED
@@ -4547,20 +4547,20 @@ class FeatrixSphereClient:
             if verbose:
                 print(f"📊 Using provided DataFrame ({len(df)} rows, {len(df.columns)} columns)")
 
-            # Create temporary
-            temp_file = tempfile.NamedTemporaryFile(mode='
+            # Create temporary parquet file (faster and more efficient than CSV)
+            temp_file = tempfile.NamedTemporaryFile(mode='wb', suffix='.parquet', delete=False)
             temp_file_path = temp_file.name
             temp_file.close()
 
-            # Save DataFrame to temp file
-            df.
+            # Save DataFrame to temp parquet file
+            df.to_parquet(temp_file_path, index=False, engine='pyarrow')
 
             if verbose:
-                print(f"📁 Saved to temporary file: {os.path.basename(temp_file_path)}")
+                print(f"📁 Saved to temporary parquet file: {os.path.basename(temp_file_path)}")
                 print(f"📤 Uploading file directly with training request...")
 
             # Send file in multipart form
-            files = {'file': (os.path.basename(temp_file_path), open(temp_file_path, 'rb'), '
+            files = {'file': (os.path.basename(temp_file_path), open(temp_file_path, 'rb'), 'application/octet-stream')}
 
         elif input_filename:
             # If absolute path provided, send file directly
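The new code path writes the DataFrame to a temporary Parquet file before uploading. A minimal standalone sketch of that pattern, assuming pandas with pyarrow is installed (the upload step itself is elided):

```python
import os
import tempfile

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})

# Same pattern as the diff: create a named temp file, close the handle,
# then let pandas write Parquet to the path. delete=False keeps the file
# on disk for the upload step that follows.
temp_file = tempfile.NamedTemporaryFile(mode='wb', suffix='.parquet', delete=False)
temp_file_path = temp_file.name
temp_file.close()

df.to_parquet(temp_file_path, index=False, engine='pyarrow')
print(os.path.getsize(temp_file_path), "bytes written")
```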
@@ -4573,8 +4573,17 @@ class FeatrixSphereClient:
             if verbose:
                 print(f"📤 Sending file directly from absolute path: {input_filename}")
 
+            # Determine MIME type based on file extension
+            mime_type = 'application/octet-stream'  # Default fallback
+            if input_path.suffix == '.csv':
+                mime_type = 'text/csv'
+            elif input_path.suffix == '.parquet':
+                mime_type = 'application/octet-stream'
+            elif input_path.suffix == '.gz':
+                mime_type = 'application/gzip'
+
             # Send file in multipart form
-            files = {'file': (input_path.name, open(input_path, 'rb'),
+            files = {'file': (input_path.name, open(input_path, 'rb'), mime_type)}
         else:
             # Relative filename - assume it's already on the server
             data["input_filename"] = input_filename
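The suffix checks above amount to a small lookup table. An equivalent table-driven sketch (illustrative only; `guess_mime` is not part of the client API):

```python
from pathlib import Path

# Suffix -> MIME type mapping mirroring the diff; anything unknown falls
# back to application/octet-stream. Note that Path('data.csv.gz').suffix
# is '.gz', which is why gzipped CSVs hit the '.gz' branch.
_MIME_BY_SUFFIX = {
    '.csv': 'text/csv',
    '.parquet': 'application/octet-stream',  # no universally supported Parquet MIME type
    '.gz': 'application/gzip',
}

def guess_mime(path: Path) -> str:
    return _MIME_BY_SUFFIX.get(path.suffix, 'application/octet-stream')

assert guess_mime(Path('data.csv')) == 'text/csv'
assert guess_mime(Path('data.csv.gz')) == 'application/gzip'
assert guess_mime(Path('data.json')) == 'application/octet-stream'
```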
@@ -5007,14 +5016,44 @@ class FeatrixSphereClient:
         The system handles the hard decisions so you can focus on your problem, not
         hyperparameter tuning.
 
+        MULTI-DATASET INPUT (NEW):
+        ---------------------------
+        You can now pass a dictionary of datasets for the `df` parameter, just like in
+        upload_df_and_create_session(). This is useful when combining multiple sources
+        for predictor training:
+
+        ```python
+        # Train predictor on multiple datasets with labels
+        training_data = {
+            'extra_rows_from_matt': ['matt_supplement.csv', 'matt_additions.parquet'],
+            'main_training': [df1, df2, 'training.csv'],
+            'validation_samples': ['validation.csv']
+        }
+
+        result = client.train_single_predictor(
+            session_id=session.session_id,
+            df=training_data,  # Dictionary of datasets
+            target_column='outcome',
+            target_column_type='set'
+        )
+        ```
+
+        When using dictionary format:
+        - Each key is a dataset name (e.g., 'extra_rows_from_matt')
+        - Each value is a list of DataFrames and/or file paths
+        - A __featrix_dataset_name column is automatically added
+        - All tables are concatenated before training
+        - Works with all file types: CSV, Parquet, JSON, JSONL
+
         Args:
             session_id: ID of session with trained embedding space
             target_column: Name of the target column to predict
             target_column_type: Type of target column ("set" or "scalar")
             file_path: Path to DIFFERENT training file (CSV or .csv.gz) to use for predictor training.
-            df: pandas DataFrame
-                Use file_path OR df (not both) to train
-                embedding space! If neither provided, uses
+            df: pandas DataFrame OR dict of {dataset_name: [DataFrames/file paths]} with DIFFERENT
+                training data to use for predictor training. Use file_path OR df (not both) to train
+                predictor on different data than your embedding space! If neither provided, uses
+                session's original data file.
             epochs: Number of training epochs (default: 0; automatic)
             validation_ignore_columns: List of column names to exclude from validation queries (default: None)
             rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
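The docstring's bullet points reduce to a labeling-plus-concat step. A small sketch of what the combined frame ends up looking like (hypothetical data):

```python
import pandas as pd

df1 = pd.DataFrame({"x": [1, 2], "outcome": ["a", "b"]})
df2 = pd.DataFrame({"x": [3], "outcome": ["a"]})

# Each table is tagged with its dataset name, then everything is concatenated.
df1["__featrix_dataset_name"] = "main_training"
df2["__featrix_dataset_name"] = "validation_samples"
combined = pd.concat([df1, df2], ignore_index=True)
print(combined)
#    x outcome __featrix_dataset_name
# 0  1       a          main_training
# 1  2       b          main_training
# 2  3       a     validation_samples
```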
@@ -5035,6 +5074,86 @@ class FeatrixSphereClient:
         if file_path and df is not None:
             raise ValueError("Provide either file_path or df, not both")
 
+        # Handle dictionary of datasets input (same as upload_df_and_create_session)
+        if df is not None and isinstance(df, dict):
+            if verbose:
+                print("Detected dictionary of datasets - concatenating with __featrix_dataset_name labels")
+            all_dataframes = []
+            total_rows = 0
+
+            for dataset_name, tables in df.items():
+                if not isinstance(tables, list):
+                    raise ValueError(f"Value for dataset '{dataset_name}' must be a list of DataFrames/file paths, got {type(tables)}")
+
+                for i, table in enumerate(tables):
+                    # Handle file path (string)
+                    if isinstance(table, str):
+                        file_path_to_load = str(table)
+
+                        if not os.path.exists(file_path_to_load):
+                            raise FileNotFoundError(f"File not found in dataset '{dataset_name}': {file_path_to_load}")
+
+                        # Determine file type and load
+                        file_ext = file_path_to_load.lower()
+                        if verbose:
+                            print(f"  - {dataset_name} loading file: {os.path.basename(file_path_to_load)}")
+
+                        if file_ext.endswith('.parquet'):
+                            loaded_df = pd.read_parquet(file_path_to_load)
+                        elif file_ext.endswith(('.json', '.jsonl')):
+                            try:
+                                from featrix.neural.input_data_file import featrix_wrap_read_json_file
+                                loaded_df = featrix_wrap_read_json_file(file_path_to_load)
+                                if loaded_df is None:
+                                    raise ValueError(f"Failed to parse {'JSONL' if file_ext.endswith('.jsonl') else 'JSON'} file")
+                            except ImportError:
+                                # Fallback to pandas
+                                if file_ext.endswith('.jsonl'):
+                                    import json
+                                    records = []
+                                    with open(file_path_to_load, 'r', encoding='utf-8') as f:
+                                        for line in f:
+                                            if line.strip():
+                                                records.append(json.loads(line))
+                                    loaded_df = pd.DataFrame(records)
+                                else:
+                                    loaded_df = pd.read_json(file_path_to_load)
+                        elif file_ext.endswith(('.csv', '.csv.gz')):
+                            loaded_df = pd.read_csv(file_path_to_load)
+                        else:
+                            raise ValueError(f"Unsupported file type in dataset '{dataset_name}': {file_path_to_load}. "
+                                             f"Supported: .csv, .csv.gz, .parquet, .json, .jsonl")
+
+                        labeled_table = loaded_df
+                        if verbose:
+                            print(f"    Loaded {len(loaded_df)} rows, {len(loaded_df.columns)} columns")
+
+                    # Handle DataFrame
+                    elif isinstance(table, pd.DataFrame):
+                        # Create a copy to avoid modifying the original
+                        labeled_table = table.copy()
+                        if verbose:
+                            print(f"  - {dataset_name} DataFrame {i+1}: {len(labeled_table)} rows, {len(labeled_table.columns)} columns")
+
+                    else:
+                        raise ValueError(f"Table {i} in dataset '{dataset_name}' must be a pandas DataFrame or file path (str), got {type(table)}")
+
+                    # Add the dataset name label column
+                    labeled_table['__featrix_dataset_name'] = dataset_name
+
+                    all_dataframes.append(labeled_table)
+                    total_rows += len(labeled_table)
+
+            if not all_dataframes:
+                raise ValueError("No DataFrames found in the provided dictionary")
+
+            # Concatenate all dataframes
+            if verbose:
+                print(f"Concatenating {len(all_dataframes)} tables from {len(df)} datasets ({total_rows} total rows)")
+            df = pd.concat(all_dataframes, ignore_index=True)
+            if verbose:
+                print(f"Combined DataFrame: {len(df)} rows, {len(df.columns)} columns (includes __featrix_dataset_name)")
+
         # Validate cost parameters
         if cost_false_positive is not None or cost_false_negative is not None:
             if cost_false_positive is None or cost_false_negative is None:
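Stripped of logging and the JSON/JSONL fallbacks, the dictionary handling above boils down to a few lines. A condensed sketch covering CSV and Parquet inputs only (`concat_labeled` is illustrative, not a client method):

```python
import os

import pandas as pd

def concat_labeled(datasets: dict) -> pd.DataFrame:
    """Label every table with its dataset name, then concatenate."""
    frames = []
    for name, tables in datasets.items():
        if not isinstance(tables, list):
            raise ValueError(f"Value for dataset '{name}' must be a list")
        for table in tables:
            if isinstance(table, str):
                if not os.path.exists(table):
                    raise FileNotFoundError(table)
                frame = (pd.read_parquet(table) if table.lower().endswith('.parquet')
                         else pd.read_csv(table))
            else:
                frame = table.copy()  # don't mutate the caller's DataFrame
            frame['__featrix_dataset_name'] = name
            frames.append(frame)
    if not frames:
        raise ValueError("No DataFrames found in the provided dictionary")
    return pd.concat(frames, ignore_index=True)
```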
@@ -5055,17 +5174,17 @@ class FeatrixSphereClient:
             if verbose:
                 print(f"📊 Using provided DataFrame ({len(df)} rows, {len(df.columns)} columns)")
 
-            # Create temporary
-            temp_file = tempfile.NamedTemporaryFile(mode='
+            # Create temporary parquet file (faster and more efficient than CSV)
+            temp_file = tempfile.NamedTemporaryFile(mode='wb', suffix='.parquet', delete=False)
             temp_file_path = temp_file.name
             temp_file.close()
 
-            # Save DataFrame to temp file
-            df.
+            # Save DataFrame to temp parquet file
+            df.to_parquet(temp_file_path, index=False, engine='pyarrow')
             file_path = temp_file_path
 
             if verbose:
-                print(f"📁 Saved to temporary file: {os.path.basename(temp_file_path)}")
+                print(f"📁 Saved to temporary parquet file: {os.path.basename(temp_file_path)}")
 
         try:
             # If a custom training file is provided (or we created one from df), use the file upload endpoint
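One thing the `delete=False` pattern implies: somebody has to delete the file afterwards. A caller-side sketch with explicit cleanup (assumption: the client's own cleanup happens in a `finally` block elsewhere in this method, outside the hunk shown):

```python
import os
import tempfile

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})

temp_file = tempfile.NamedTemporaryFile(mode='wb', suffix='.parquet', delete=False)
temp_file_path = temp_file.name
temp_file.close()  # close the handle; the file itself persists (delete=False)
try:
    df.to_parquet(temp_file_path, index=False, engine='pyarrow')
    # ... upload temp_file_path here ...
finally:
    os.unlink(temp_file_path)  # remove the temporary file when done
```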
@@ -5296,17 +5415,17 @@ class FeatrixSphereClient:
             if verbose:
                 print(f"📊 Using provided DataFrame ({len(new_data_df)} rows, {len(new_data_df.columns)} columns)")
 
-            # Create temporary
-            temp_file = tempfile.NamedTemporaryFile(mode='
+            # Create temporary parquet file (faster and more efficient than CSV)
+            temp_file = tempfile.NamedTemporaryFile(mode='wb', suffix='.parquet', delete=False)
             temp_file_path = temp_file.name
             temp_file.close()
 
-            # Save DataFrame to temp file
-            new_data_df.
+            # Save DataFrame to temp parquet file
+            new_data_df.to_parquet(temp_file_path, index=False, engine='pyarrow')
             file_to_upload = temp_file_path
 
             if verbose:
-                print(f"📁 Saved to temporary file: {os.path.basename(temp_file_path)}")
+                print(f"📁 Saved to temporary parquet file: {os.path.basename(temp_file_path)}")
 
         try:
             # Prepare file upload
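Beyond the speed claim in the comment, Parquet also round-trips dtypes that CSV loses, which matters when the server re-reads the upload. A quick demonstration (illustrative file paths):

```python
import pandas as pd

df = pd.DataFrame({"when": pd.to_datetime(["2024-01-01"]), "n": [1.5]})

# CSV: the datetime column comes back as plain strings (object dtype).
df.to_csv("/tmp/x.csv", index=False)
print(pd.read_csv("/tmp/x.csv").dtypes["when"])          # object

# Parquet: the datetime dtype survives the round trip (typically datetime64[ns]).
df.to_parquet("/tmp/x.parquet", index=False, engine="pyarrow")
print(pd.read_parquet("/tmp/x.parquet").dtypes["when"])  # datetime64[ns]
```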
@@ -5901,8 +6020,17 @@ class FeatrixSphereClient:
             print(f"📤 Uploading custom training file: {file_path.name}")
 
         # Prepare the multipart form data
+        # Determine MIME type based on file extension
+        mime_type = 'application/octet-stream'  # Default fallback
+        if file_path.suffix == '.csv':
+            mime_type = 'text/csv'
+        elif file_path.suffix == '.parquet':
+            mime_type = 'application/octet-stream'  # Parquet MIME type
+        elif file_path.suffix == '.gz':
+            mime_type = 'application/gzip'
+
         files = {
-            'file': (file_path.name, open(file_path, 'rb'),
+            'file': (file_path.name, open(file_path, 'rb'), mime_type)
         }
 
         data = {
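The bare `open(file_path, 'rb')` inside the files tuple leaves the handle for the HTTP layer to consume and for garbage collection to close. A tidier sketch with explicit handle management, assuming requests is the HTTP layer here (the URL and form fields are placeholders, not the client's real endpoint):

```python
from pathlib import Path

import requests  # assumption: the client uses requests for multipart uploads

file_path = Path("training.parquet")        # placeholder path
mime_type = "application/octet-stream"

with open(file_path, "rb") as fh:
    files = {"file": (file_path.name, fh, mime_type)}
    resp = requests.post("https://sphere.example.com/upload", files=files)  # placeholder URL
resp.raise_for_status()
```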
featrixsphere-0.2.3611.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+featrixsphere/__init__.py,sha256=0Aq33fh4GcL9nPH57I7gam8zpatQ3Ouooy6qBE98FdE,1888
+featrixsphere/client.py,sha256=TiEZIT1Qmc983CbPljc8__e0jJRnpQ3Lf6SabwrvLlo,415649
+featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
+featrixsphere-0.2.3611.dist-info/METADATA,sha256=Ix0dJltPnTz5HCdFzy9fE8vLPGSttGQ56G6GhBXUwwM,16232
+featrixsphere-0.2.3611.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+featrixsphere-0.2.3611.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
+featrixsphere-0.2.3611.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
+featrixsphere-0.2.3611.dist-info/RECORD,,
featrixsphere-0.2.2279.dist-info/RECORD REMOVED
@@ -1,8 +0,0 @@
-featrixsphere/__init__.py,sha256=CdXrzec_MfZTUpRyajYawHBu5iEWAe-flVrlXjiLLN8,1888
-featrixsphere/client.py,sha256=ez8ML3OMHMMO0ewzOGNgF1tg3KYygLG2NbEw33sf3AA,408591
-featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
-featrixsphere-0.2.2279.dist-info/METADATA,sha256=2hEITDtsSYmMLgl0RfqvcBzac4EBAH3DN4TSVf_1rPY,16232
-featrixsphere-0.2.2279.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-featrixsphere-0.2.2279.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
-featrixsphere-0.2.2279.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
-featrixsphere-0.2.2279.dist-info/RECORD,,
{featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.3611.dist-info}/WHEEL
File without changes
{featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.3611.dist-info}/entry_points.txt
File without changes
{featrixsphere-0.2.2279.dist-info → featrixsphere-0.2.3611.dist-info}/top_level.txt
File without changes