featrixsphere 0.2.1002__tar.gz → 0.2.1206__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {featrixsphere-0.2.1002/featrixsphere.egg-info → featrixsphere-0.2.1206}/PKG-INFO +1 -1
- featrixsphere-0.2.1206/VERSION +1 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/__init__.py +1 -1
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/client.py +527 -38
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206/featrixsphere.egg-info}/PKG-INFO +1 -1
- featrixsphere-0.2.1002/VERSION +0 -1
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/MANIFEST.in +0 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/README.md +0 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/cli.py +0 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/test_client.py +0 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/SOURCES.txt +0 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/dependency_links.txt +0 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/entry_points.txt +0 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/not-zip-safe +0 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/requires.txt +0 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/top_level.txt +0 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/requirements.txt +0 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/setup.cfg +0 -0
- {featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/setup.py +0 -0
featrixsphere-0.2.1206/VERSION ADDED
@@ -0,0 +1 @@
+0.2.1206
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/client.py
@@ -660,6 +660,77 @@ class FeatrixSphereClient:
         )
         return response.json()
 
+    def publish_session(self, session_id: str) -> Dict[str, Any]:
+        """
+        Publish a session by moving it to /sphere/published/<sessionId>.
+        Moves both the session file and output directory.
+
+        Args:
+            session_id: Session ID to publish
+
+        Returns:
+            Response with published_path, output_path, and status
+
+        Example:
+            ```python
+            result = client.publish_session("abc123")
+            print(f"Published to: {result['published_path']}")
+            ```
+        """
+        response_data = self._post_json(f"/compute/session/{session_id}/publish", {})
+        return response_data
+
+    def deprecate_session(self, session_id: str, warning_message: str, expiration_date: str) -> Dict[str, Any]:
+        """
+        Deprecate a published session with a warning message and expiration date.
+        The session remains available until the expiration date.
+
+        Args:
+            session_id: Session ID to deprecate
+            warning_message: Warning message to display about deprecation
+            expiration_date: ISO format date string when session will be removed (e.g., "2025-12-31T23:59:59Z")
+
+        Returns:
+            Response with deprecation status
+
+        Example:
+            ```python
+            from datetime import datetime, timedelta
+
+            expiration = (datetime.now() + timedelta(days=90)).isoformat() + "Z"
+            result = client.deprecate_session(
+                session_id="abc123",
+                warning_message="This session will be removed on 2025-12-31",
+                expiration_date=expiration
+            )
+            ```
+        """
+        data = {
+            "warning_message": warning_message,
+            "expiration_date": expiration_date
+        }
+        response_data = self._post_json(f"/compute/session/{session_id}/deprecate", data)
+        return response_data
+
+    def unpublish_session(self, session_id: str) -> Dict[str, Any]:
+        """
+        Unpublish a session by moving it back from /sphere/published/<sessionId>.
+
+        Args:
+            session_id: Session ID to unpublish
+
+        Returns:
+            Response with unpublish status
+
+        Example:
+            ```python
+            result = client.unpublish_session("abc123")
+            print(f"Status: {result['status']}")
+            ```
+        """
+        response_data = self._post_json(f"/compute/session/{session_id}/unpublish", {})
+        return response_data
+
     def get_sessions_for_org(self, name_prefix: str, max_retries: int = None) -> Dict[str, Any]:
         """
         Get all sessions matching a name prefix across all compute nodes.
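Note: the three new lifecycle methods above are thin wrappers over POST endpoints under /compute/session/<session_id>/. A minimal end-to-end sketch of the publish → deprecate → unpublish flow, assuming the import path featrixsphere.client and using placeholder endpoint and session values:

```python
from datetime import datetime, timedelta

from featrixsphere.client import FeatrixSphereClient  # import path assumed from the package layout

client = FeatrixSphereClient("https://sphere-api.featrix.com")  # placeholder endpoint
session_id = "abc123"                                           # placeholder session ID

# Publish: moves the session file and output directory to /sphere/published/<sessionId>.
published = client.publish_session(session_id)
print(f"Published to: {published['published_path']}")

# Deprecate: keep the session available, but attach a warning and an expiration date.
expiration = (datetime.now() + timedelta(days=90)).isoformat() + "Z"
client.deprecate_session(
    session_id,
    warning_message="This session will be removed in 90 days",
    expiration_date=expiration,
)

# Unpublish: move the session back out of /sphere/published/<sessionId>.
status = client.unpublish_session(session_id)
print(f"Status: {status['status']}")
```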
@@ -1424,16 +1495,141 @@ class FeatrixSphereClient:
             job_queue_positions={}
         )
 
+    def fine_tune_embedding_space(
+        self,
+        name: str,
+        parent_session_id: str = None,
+        parent_embedding_space_path: str = None,
+        s3_training_dataset: str = None,
+        s3_validation_dataset: str = None,
+        webhooks: Dict[str, str] = None
+    ) -> SessionInfo:
+        """
+        Fine-tune an existing embedding space on new data.
+
+        This method takes a pre-trained embedding space (the "parent") and fine-tunes it
+        on a new dataset with the same columns. The number of training epochs is automatically
+        calculated based on the dataset size ratio to ensure optimal training.
+
+        **How Epoch Calculation Works:**
+        - The system calculates F = len(new_dataset) / len(old_dataset)
+        - New epochs = original_epochs / F
+        - If new dataset is smaller (F < 1), more epochs are used (to see data enough times)
+        - If new dataset is larger (F > 1), fewer epochs are used (less repetition needed)
+
+        **Example:**
+        - Original: 1000 rows, trained for 100 epochs
+        - New: 500 rows → F = 0.5 → 100/0.5 = 200 epochs
+        - New: 2000 rows → F = 2.0 → 100/2.0 = 50 epochs
+
+        This ensures the model sees the new data an appropriate number of times relative
+        to how much it saw the original data.
+
+        Args:
+            name: Name for the fine-tuned embedding space
+            parent_session_id: Session ID of the parent embedding space (optional)
+            parent_embedding_space_path: Direct path to parent embedding space pickle file (optional)
+            s3_training_dataset: S3 URL for new training dataset (must start with 's3://')
+            s3_validation_dataset: S3 URL for new validation dataset (must start with 's3://')
+            webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
+
+        Returns:
+            SessionInfo for the newly created fine-tuning session
+
+        Raises:
+            ValueError: If S3 URLs are invalid or neither parent identifier is provided
+
+        Example:
+            ```python
+            # Fine-tune an existing embedding space on new data
+            client = FeatrixSphereClient("https://sphere-api.featrix.com")
+
+            # Option 1: Use parent session ID
+            fine_tuned = client.fine_tune_embedding_space(
+                name="customer_behavior_v2",
+                parent_session_id="abc123-20240101-120000",
+                s3_training_dataset="s3://my-bucket/new_training_data.csv",
+                s3_validation_dataset="s3://my-bucket/new_validation_data.csv"
+            )
+
+            # Option 2: Use direct path to parent embedding space
+            fine_tuned = client.fine_tune_embedding_space(
+                name="customer_behavior_v2",
+                parent_embedding_space_path="/path/to/parent/embedded_space.pickle",
+                s3_training_dataset="s3://my-bucket/new_training_data.csv",
+                s3_validation_dataset="s3://my-bucket/new_validation_data.csv"
+            )
+
+            # Wait for fine-tuning to complete
+            client.wait_for_session_completion(fine_tuned.session_id)
+
+            # Use the fine-tuned model for predictions
+            result = client.predict(fine_tuned.session_id, {"feature1": "value1"})
+            ```
+        """
+        # Validate S3 URLs
+        if s3_training_dataset and not s3_training_dataset.startswith('s3://'):
+            raise ValueError("s3_training_dataset must be a valid S3 URL (s3://...)")
+        if s3_validation_dataset and not s3_validation_dataset.startswith('s3://'):
+            raise ValueError("s3_validation_dataset must be a valid S3 URL (s3://...)")
+
+        # Validate that we have either parent_session_id or parent_embedding_space_path
+        if not parent_session_id and not parent_embedding_space_path:
+            raise ValueError("Either parent_session_id or parent_embedding_space_path must be provided")
+
+        print(f"Fine-tuning embedding space '{name}'...")
+        if parent_session_id:
+            print(f" Parent session: {parent_session_id}")
+        if parent_embedding_space_path:
+            print(f" Parent embedding space: {parent_embedding_space_path}")
+        print(f" New training data: {s3_training_dataset}")
+        print(f" New validation data: {s3_validation_dataset}")
+
+        data = {
+            "name": name,
+            "s3_file_data_set_training": s3_training_dataset,
+            "s3_file_data_set_validation": s3_validation_dataset
+        }
+
+        if parent_session_id:
+            data["parent_session_id"] = parent_session_id
+        if parent_embedding_space_path:
+            data["parent_embedding_space_path"] = parent_embedding_space_path
+
+        if webhooks:
+            data['webhooks'] = webhooks
+
+        response_data = self._post_json("/compute/fine-tune-embedding-space", data)
+
+        session_id = response_data.get('session_id')
+        fine_tune_info = response_data.get('fine_tune_info', {})
+
+        print(f"Fine-tuning session created: {session_id}")
+        if fine_tune_info:
+            print(f" Original dataset: {fine_tune_info.get('original_train_size', 'N/A')} rows")
+            print(f" New dataset: {fine_tune_info.get('new_total_size', 'N/A')} rows")
+            print(f" Dataset ratio (F): {fine_tune_info.get('F', 'N/A'):.4f}")
+            print(f" Original epochs: {fine_tune_info.get('original_epochs', 'N/A')}")
+            print(f" Calculated epochs: {fine_tune_info.get('calculated_epochs', 'N/A')}")
+
+        return SessionInfo(
+            session_id=session_id,
+            session_type=response_data.get('session_type', 'embedding_space_finetune'),
+            status=response_data.get('status', 'ready'),
+            jobs={},
+            job_queue_positions={}
+        )
+
     # =========================================================================
     # File Upload
     # =========================================================================
 
     def upload_file_and_create_session(self, file_path: Path, session_name_prefix: str = None, name: str = None, webhooks: Dict[str, str] = None) -> SessionInfo:
         """
-        Upload a CSV file and create a new session.
+        Upload a CSV, Parquet, JSON, or JSONL file and create a new session.
 
         Args:
-            file_path: Path to the CSV file to upload
+            file_path: Path to the CSV, Parquet, JSON, or JSONL file to upload
             session_name_prefix: Optional prefix for the session ID. Session will be named <prefix>-<full-uuid>
             name: Optional name for the embedding space/model (for identification and metadata)
             webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
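Note: the epoch rule documented in fine_tune_embedding_space (F = new rows / old rows, new epochs = original epochs / F) is easy to sanity-check locally against the calculated_epochs value the server reports in fine_tune_info. A minimal sketch of that arithmetic; the rounding and the minimum-epoch floor are assumptions, since only the formula appears in the docstring:

```python
def estimate_fine_tune_epochs(original_rows: int, new_rows: int, original_epochs: int) -> int:
    """Reproduce the documented rule: F = new/old, new epochs = original_epochs / F."""
    f = new_rows / original_rows
    # Rounding and the floor of 1 are assumptions; the server may round or clamp differently.
    return max(1, round(original_epochs / f))

# Worked examples matching the docstring:
print(estimate_fine_tune_epochs(1000, 500, 100))   # 200 -> smaller dataset, more passes
print(estimate_fine_tune_epochs(1000, 2000, 100))  # 50  -> larger dataset, fewer passes
```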
@@ -1491,12 +1687,13 @@ class FeatrixSphereClient:
                 string_list_delimiter: str = "|",
                 important_columns_for_visualization: List[str] = None,
                 metadata: Dict[str, Any] = None,
+                user_metadata: Dict[str, Any] = None,  # User metadata for ES/SP identification (max 32KB)
                 session_name_prefix: str = None,
                 name: str = None,
                 webhooks: Dict[str, str] = None,
                 epochs: int = None) -> SessionInfo:
         """
-        Upload a pandas DataFrame or
+        Upload a pandas DataFrame, CSV file, Parquet file, JSON file, or JSONL file and create a new session.
 
         Special Column: __featrix_train_predictor
         ------------------------------------------
@@ -1504,7 +1701,7 @@ class FeatrixSphereClient:
         which rows are used for single predictor training.
 
         How it works:
-        - Add a boolean column "__featrix_train_predictor" to your DataFrame/CSV before upload
+        - Add a boolean column "__featrix_train_predictor" to your DataFrame/CSV/Parquet/JSON/JSONL before upload
         - Set it to True for rows you want to use for predictor training
         - Set it to False (or any other value) for rows to exclude from predictor training
         - Embedding space training uses ALL rows (ignores this column)
@@ -1538,7 +1735,7 @@ class FeatrixSphereClient:
         Args:
             df: pandas DataFrame to upload (optional if file_path is provided)
             filename: Name to give the uploaded file (default: "data.csv")
-            file_path: Path to CSV file to upload (optional if df is provided)
+            file_path: Path to CSV, Parquet, JSON, or JSONL file to upload (optional if df is provided)
             column_overrides: Dict mapping column names to types ("scalar", "set", "free_string", "free_string_list")
             column_types: Alias for column_overrides (for backward compatibility)
             string_list_delimiter: Delimiter for free_string_list columns (default: "|")
@@ -1579,21 +1776,90 @@ class FeatrixSphereClient:
         if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")
 
-        # Check if it's a
-
-
+        # Check if it's a supported file type
+        file_ext = file_path.lower()
+        if not file_ext.endswith(('.csv', '.csv.gz', '.parquet', '.json', '.jsonl')):
+            raise ValueError("File must be a CSV, Parquet, JSON, or JSONL file (with .csv, .csv.gz, .parquet, .json, or .jsonl extension)")
 
         print(f"Uploading file: {file_path}")
 
         # Read the file content
         if file_path.endswith('.gz'):
-            # Already gzipped
+            # Already gzipped CSV
             with gzip.open(file_path, 'rb') as f:
                 file_content = f.read()
             upload_filename = os.path.basename(file_path)
             content_type = 'application/gzip'
+        elif file_path.lower().endswith(('.json', '.jsonl')):
+            # JSON/JSONL file - read as DataFrame, convert to CSV, then compress
+            print(f"Reading {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} file...")
+            try:
+                from featrix.neural.input_data_file import featrix_wrap_read_json_file
+                json_df = featrix_wrap_read_json_file(file_path)
+                if json_df is None:
+                    raise ValueError(f"Failed to parse {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} file")
+            except ImportError:
+                # Fallback to pandas if featrix wrapper not available
+                if file_path.lower().endswith('.jsonl'):
+                    # JSONL: one JSON object per line
+                    import json
+                    records = []
+                    with open(file_path, 'r', encoding='utf-8') as f:
+                        for line in f:
+                            if line.strip():
+                                records.append(json.loads(line))
+                    json_df = pd.DataFrame(records)
+                else:
+                    # Regular JSON
+                    json_df = pd.read_json(file_path)
+
+            # Clean NaN values before CSV conversion
+            cleaned_df = json_df.where(pd.notna(json_df), None)
+
+            # Convert to CSV and compress
+            csv_buffer = io.StringIO()
+            cleaned_df.to_csv(csv_buffer, index=False)
+            csv_data = csv_buffer.getvalue().encode('utf-8')
+
+            print(f"Compressing {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} (converted to CSV)...")
+            compressed_buffer = io.BytesIO()
+            with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+                gz.write(csv_data)
+            file_content = compressed_buffer.getvalue()
+            upload_filename = os.path.basename(file_path).replace('.jsonl', '.csv.gz').replace('.json', '.csv.gz')
+            content_type = 'application/gzip'
+
+            original_size = len(csv_data)
+            compressed_size = len(file_content)
+            compression_ratio = (1 - compressed_size / original_size) * 100
+            print(f"Converted {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
+        elif file_path.lower().endswith('.parquet'):
+            # Parquet file - read as DataFrame, convert to CSV, then compress
+            print("Reading Parquet file...")
+            parquet_df = pd.read_parquet(file_path)
+
+            # Clean NaN values before CSV conversion
+            cleaned_df = parquet_df.where(pd.notna(parquet_df), None)
+
+            # Convert to CSV and compress
+            csv_buffer = io.StringIO()
+            cleaned_df.to_csv(csv_buffer, index=False)
+            csv_data = csv_buffer.getvalue().encode('utf-8')
+
+            print("Compressing Parquet (converted to CSV)...")
+            compressed_buffer = io.BytesIO()
+            with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+                gz.write(csv_data)
+            file_content = compressed_buffer.getvalue()
+            upload_filename = os.path.basename(file_path).replace('.parquet', '.csv.gz')
+            content_type = 'application/gzip'
+
+            original_size = len(csv_data)
+            compressed_size = len(file_content)
+            compression_ratio = (1 - compressed_size / original_size) * 100
+            print(f"Converted Parquet to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
         else:
-            #
+            # Regular CSV file - read and compress it
             with open(file_path, 'rb') as f:
                 csv_content = f.read()
 
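Note: the upload path above normalizes Parquet, JSON, and JSONL input to gzipped CSV before sending it. A standalone sketch of the same conversion, mirroring the pandas fallback branch (so it assumes pandas, plus pyarrow or fastparquet for Parquet); the input filename is a placeholder:

```python
import gzip
import io
import json

import pandas as pd

def to_csv_gz_bytes(path: str) -> bytes:
    """Convert a CSV/Parquet/JSON/JSONL file to gzipped CSV bytes, as the client does before upload."""
    lower = path.lower()
    if lower.endswith('.parquet'):
        df = pd.read_parquet(path)
    elif lower.endswith('.jsonl'):
        # JSONL: one JSON object per line
        with open(path, 'r', encoding='utf-8') as f:
            records = [json.loads(line) for line in f if line.strip()]
        df = pd.DataFrame(records)
    elif lower.endswith('.json'):
        df = pd.read_json(path)
    else:
        df = pd.read_csv(path)

    # Replace NaN with None before the CSV conversion, matching the client's cleanup step.
    df = df.where(pd.notna(df), None)

    csv_bytes = df.to_csv(index=False).encode('utf-8')
    buf = io.BytesIO()
    with gzip.GzipFile(fileobj=buf, mode='wb') as gz:
        gz.write(csv_bytes)
    return buf.getvalue()

payload = to_csv_gz_bytes("data.parquet")  # placeholder input file
print(f"{len(payload):,} bytes after compression")
```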
@@ -1663,6 +1929,10 @@ class FeatrixSphereClient:
             import json
             data['metadata'] = json.dumps(metadata)
             print(f"Session metadata: {metadata}")
+        if user_metadata:
+            import json
+            data['user_metadata'] = json.dumps(user_metadata)
+            print(f"User metadata: {user_metadata}")
         if session_name_prefix:
             data['session_name_prefix'] = session_name_prefix
             print(f"Session name prefix: {session_name_prefix}")
@@ -3239,6 +3509,24 @@ class FeatrixSphereClient:
         response_data = self._delete_json(f"/session/{session_id}/predictor", params=params, max_retries=max_retries)
         return response_data
 
+    def mark_for_deletion(self, session_id: str, max_retries: int = None) -> Dict[str, Any]:
+        """
+        Mark a session for deletion. The session will be deleted by the garbage collection process.
+
+        Args:
+            session_id: Session ID to mark for deletion
+            max_retries: Number of retries for errors (default: uses client default)
+
+        Returns:
+            Dictionary with confirmation that the session was marked for deletion
+
+        Example:
+            result = client.mark_for_deletion("session_123")
+            print(result)  # {"status": "marked", "session_id": "session_123"}
+        """
+        response_data = self._post_json(f"/compute/session/{session_id}/mark_for_deletion", max_retries=max_retries)
+        return response_data
+
 
     def _create_interactive_training_movie(self, training_metrics, epoch_projections, session_id,
                                            show_embedding_evolution, show_loss_evolution):
@@ -3723,7 +4011,7 @@ class FeatrixSphereClient:
                 name: str = None,
                 session_name_prefix: str = None,
                 epochs: int = 0, batch_size: int = 0, learning_rate: float = 0.001,
-
+                rare_label_value: str = None,
                 class_imbalance: dict = None,
                 optimize_for: str = "balanced",
                 poll_interval: int = 30, max_poll_time: int = 3600,
@@ -3746,7 +4034,7 @@ class FeatrixSphereClient:
             epochs: Number of training epochs (default: 0; automatic)
             batch_size: Training batch size (default: 0; automatic)
             learning_rate: Learning rate for training (default: 0.001)
-
+            rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
             class_imbalance: Expected class ratios/counts from real world for sampled data (default: None)
             optimize_for: Optimization target - "balanced" (F1 score), "precision", or "recall" (default: "balanced")
             poll_interval: Seconds between status checks when job is already running (default: 30)
@@ -3787,8 +4075,8 @@ class FeatrixSphereClient:
             data["name"] = name
         if session_name_prefix:
             data["session_name_prefix"] = session_name_prefix
-        if
-            data["
+        if rare_label_value:
+            data["rare_label_value"] = rare_label_value
         if class_imbalance:
             data["class_imbalance"] = class_imbalance
         if webhooks:
@@ -3821,9 +4109,11 @@ class FeatrixSphereClient:
                 df = None,
                 epochs: int = 0,
                 validation_ignore_columns: List[str] = None,
-
+                rare_label_value: str = None,
                 class_imbalance: dict = None,
                 optimize_for: str = "balanced",
+                cost_false_positive: float = None,
+                cost_false_negative: float = None,
                 poll_interval: int = 30, max_poll_time: int = 3600,
                 verbose: bool = True,
                 webhooks: Dict[str, str] = None) -> Dict[str, Any]:
@@ -4036,14 +4326,14 @@ class FeatrixSphereClient:
 
         If not provided, class weights are computed from your training data distribution.
 
-        Understanding
+        Understanding rare_label_value:
         -----------------------------
-        For binary classification,
-
+        For binary classification, rare_label_value specifies which class is the rare/minority
+        class for computing metrics like precision, recall, and ROC-AUC.
 
         Example: For a credit risk model predicting "good" vs "bad" loans:
 
-
+            rare_label_value="bad"  # "bad" is the rare class we want to detect
 
         This affects how metrics are reported:
         - Precision = True Positives / (True Positives + False Positives)
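Note: rare_label_value only changes which class the reported precision, recall, and ROC-AUC treat as the positive class; it does not change the training data. A small illustration of that bookkeeping using scikit-learn (assumed to be available; the labels and values are placeholders):

```python
from sklearn.metrics import precision_score, recall_score

y_true = ["good", "bad", "bad", "good", "good", "bad", "good", "good"]
y_pred = ["good", "bad", "good", "good", "good", "bad", "bad", "good"]

# With rare_label_value="bad", metrics treat "bad" as the positive class:
precision = precision_score(y_true, y_pred, pos_label="bad")  # TP / (TP + FP)
recall = recall_score(y_true, y_pred, pos_label="bad")        # TP / (TP + FN)
print(f"precision={precision:.2f} recall={recall:.2f}")
```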
@@ -4124,7 +4414,7 @@ class FeatrixSphereClient:
                 session_id=session.session_id,
                 target_column='approved',
                 target_column_type='set',
-
+                rare_label_value='yes'
             )
             ```
 
@@ -4140,7 +4430,7 @@ class FeatrixSphereClient:
                 target_column_type='set',
                 class_imbalance={'approved': 0.97, 'rejected': 0.03},
                 optimize_for='recall',  # Don't miss rejections
-
+                rare_label_value='rejected'
             )
 
             # System will:
@@ -4159,7 +4449,7 @@ class FeatrixSphereClient:
                 session_id=session.session_id,
                 target_column='is_fraud',
                 target_column_type='set',
-
+                rare_label_value='fraud',
                 optimize_for='precision',  # Minimize false alarms
                 class_imbalance={'legitimate': 0.999, 'fraud': 0.001}
             )
@@ -4174,7 +4464,7 @@ class FeatrixSphereClient:
                 session_id=session.session_id,
                 target_column='has_disease',
                 target_column_type='set',
-
+                rare_label_value='positive',
                 optimize_for='recall'  # Don't miss any cases
             )
             ```
@@ -4189,7 +4479,7 @@ class FeatrixSphereClient:
                 target_column='churn',
                 target_column_type='set',
                 validation_ignore_columns=['customer_id', 'signup_date'],
-
+                rare_label_value='churned'
             )
             ```
 
@@ -4215,9 +4505,16 @@ class FeatrixSphereClient:
                 embedding space! If neither provided, uses session's original data file.
             epochs: Number of training epochs (default: 0; automatic)
             validation_ignore_columns: List of column names to exclude from validation queries (default: None)
-
+            rare_label_value: For binary classification, which class is the rare/minority class for metrics (default: None)
             class_imbalance: Expected class ratios/counts from real world for sampled data (default: None)
-            optimize_for: Optimization target - "balanced" (F1 score), "precision", or "recall" (default: "balanced")
+            optimize_for: Optimization target - "balanced" (F1 score), "precision", or "recall" (default: "balanced").
+                Ignored if cost_false_positive and cost_false_negative are provided.
+            cost_false_positive: Cost of a false positive (predicting positive when actually negative).
+                Must be specified together with cost_false_negative. Only valid for target_column_type="set".
+                When provided, overrides optimize_for and uses cost-based optimization.
+            cost_false_negative: Cost of a false negative (predicting negative when actually positive).
+                Must be specified together with cost_false_positive. Only valid for target_column_type="set".
+                When provided, overrides optimize_for and uses cost-based optimization.
             poll_interval: Seconds between status checks when job is already running (default: 30)
             max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
             verbose: Whether to print status updates during polling (default: True)
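Note: the new cost_false_positive / cost_false_negative pair switches the predictor from optimize_for to cost-based optimization. The diff does not show how the server applies the costs; one common interpretation, sketched here purely as an assumption, is to pick the decision threshold that minimizes total misclassification cost on held-out data:

```python
import numpy as np

def min_cost_threshold(y_true: np.ndarray, p_positive: np.ndarray,
                       cost_fp: float, cost_fn: float) -> float:
    """Pick the probability threshold with the lowest total misclassification cost.

    y_true: 1 for the positive (rare) class, 0 otherwise.
    p_positive: predicted probability of the positive class.
    Illustrative only; not necessarily what the Sphere backend does.
    """
    best_t, best_cost = 0.5, float("inf")
    for t in np.linspace(0.01, 0.99, 99):
        pred = (p_positive >= t).astype(int)
        fp = int(((pred == 1) & (y_true == 0)).sum())
        fn = int(((pred == 0) & (y_true == 1)).sum())
        cost = fp * cost_fp + fn * cost_fn
        if cost < best_cost:
            best_t, best_cost = t, cost
    return best_t

# Toy data where a false negative costs 10x more than a false positive.
rng = np.random.default_rng(0)
y = rng.integers(0, 2, size=200)
p = np.clip(0.6 * y + rng.normal(0.2, 0.25, size=200), 0.0, 1.0)
print(min_cost_threshold(y, p, cost_fp=1.0, cost_fn=10.0))
```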
@@ -4234,6 +4531,18 @@ class FeatrixSphereClient:
         if file_path and df is not None:
             raise ValueError("Provide either file_path or df, not both")
 
+        # Validate cost parameters
+        if cost_false_positive is not None or cost_false_negative is not None:
+            if cost_false_positive is None or cost_false_negative is None:
+                raise ValueError("Both cost_false_positive and cost_false_negative must be specified together")
+            if target_column_type != "set":
+                raise ValueError("cost_false_positive and cost_false_negative are only valid for target_column_type='set' (classification), not 'scalar' (regression)")
+            if cost_false_positive <= 0 or cost_false_negative <= 0:
+                raise ValueError("cost_false_positive and cost_false_negative must be positive numbers")
+            if verbose:
+                print(f"💰 Cost-based optimization enabled: FP cost={cost_false_positive}, FN cost={cost_false_negative}")
+                print(f" (optimize_for='{optimize_for}' will be ignored)")
+
         # If DataFrame provided, save to temp file and use file_path logic
         temp_file = None
         if df is not None:
@@ -4264,9 +4573,11 @@ class FeatrixSphereClient:
                 target_column=target_column,
                 target_column_type=target_column_type,
                 epochs=epochs,
-
+                rare_label_value=rare_label_value,
                 class_imbalance=class_imbalance,
                 optimize_for=optimize_for,
+                cost_false_positive=cost_false_positive,
+                cost_false_negative=cost_false_negative,
                 verbose=verbose,
                 webhooks=webhooks
             )
@@ -4277,10 +4588,13 @@ class FeatrixSphereClient:
             "target_column_type": target_column_type,
             "epochs": epochs,
             "validation_ignore_columns": validation_ignore_columns or [],
-            "
+            "rare_label_value": rare_label_value,
             "class_imbalance": class_imbalance,
             "optimize_for": optimize_for
         }
+        if cost_false_positive is not None and cost_false_negative is not None:
+            data["cost_false_positive"] = cost_false_positive
+            data["cost_false_negative"] = cost_false_negative
         if webhooks:
             data['webhooks'] = webhooks
 
@@ -4579,7 +4893,7 @@ class FeatrixSphereClient:
                 predictor_id: str = None, target_column: str = None,
                 batch_size: int = 0, learning_rate: float = None,
                 poll_interval: int = 30, max_poll_time: int = 3600,
-                verbose: bool = True) -> Dict[str, Any]:
+                verbose: bool = True, webhooks: Dict[str, str] = None) -> Dict[str, Any]:
         """
         Continue training an existing single predictor for more epochs.
         Loads the existing predictor and resumes training from where it left off.
@@ -4594,6 +4908,7 @@ class FeatrixSphereClient:
             poll_interval: Seconds between status checks (default: 30)
             max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
             verbose: Whether to print status updates (default: True)
+            webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
 
         Returns:
             Response with continuation start confirmation or completion status
@@ -4625,6 +4940,8 @@ class FeatrixSphereClient:
             data["target_column"] = target_column
         if learning_rate is not None:
             data["learning_rate"] = learning_rate
+        if webhooks:
+            data["webhooks"] = webhooks
 
         if verbose:
             print(f"🔄 Continuing training for predictor on session {session_id}")
@@ -4714,6 +5031,139 @@ class FeatrixSphereClient:
                 print(f"❌ Error starting predictor continuation: {e}")
             raise
 
+    def foundation_model_train_more(self, session_id: str, es_id: str = None, data_passes: int = None,
+                                    epochs: int = None, poll_interval: int = 30, max_poll_time: int = 3600,
+                                    verbose: bool = True, webhooks: Dict[str, str] = None) -> Dict[str, Any]:
+        """
+        Continue training an existing foundation model (embedding space) for more epochs.
+        Loads the existing embedding space and resumes training from where it left off.
+
+        Args:
+            session_id: Session ID containing the trained foundation model
+            es_id: Embedding space ID (optional, uses session's ES if not provided)
+            data_passes: Additional epochs to train (preferred, default: 50)
+            epochs: Additional epochs to train (deprecated, use data_passes instead, for compatibility)
+            poll_interval: Seconds between status checks (default: 30)
+            max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
+            verbose: Whether to print status updates (default: True)
+            webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
+
+        Returns:
+            Response with continuation start confirmation or completion status
+
+        Example:
+            ```python
+            # Continue training for 50 more epochs
+            result = client.foundation_model_train_more(
+                session_id="abc123",
+                data_passes=50
+            )
+            ```
+        """
+        # Support both data_passes and epochs for compatibility
+        if data_passes is None and epochs is None:
+            data_passes = 50  # Default
+        elif data_passes is None:
+            data_passes = epochs  # Use epochs if data_passes not provided
+        # If both provided, data_passes takes precedence
+
+        if data_passes <= 0:
+            raise ValueError("data_passes (or epochs) must be > 0 (specify additional epochs to train)")
+
+        data = {
+            "data_passes": data_passes,
+        }
+
+        if es_id:
+            data["es_id"] = es_id
+        if webhooks:
+            data["webhooks"] = webhooks
+
+        if verbose:
+            print(f"🔄 Continuing training for foundation model on session {session_id}")
+            print(f" Additional epochs: {data_passes}")
+            if es_id:
+                print(f" ES ID: {es_id}")
+
+        try:
+            response_data = self._post_json(f"/compute/session/{session_id}/train_foundation_model_more", data)
+
+            if verbose:
+                print(f"✅ Foundation model continuation started: {response_data.get('message')}")
+
+            # Poll for completion if requested
+            if poll_interval > 0 and max_poll_time > 0:
+                import time
+                start_time = time.time()
+                last_status = ""
+
+                while time.time() - start_time < max_poll_time:
+                    try:
+                        session_info = self.get_session_status(session_id)
+                        jobs = session_info.jobs if hasattr(session_info, 'jobs') else {}
+
+                        # Find continuation jobs
+                        es_jobs = {j_id: j for j_id, j in jobs.items()
+                                   if j.get('type') == 'train_es'}
+
+                        if not es_jobs:
+                            if verbose:
+                                print("✅ No continuation jobs found - training may have completed")
+                            break
+
+                        # Check job statuses
+                        running_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'running']
+                        completed_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'done']
+                        failed_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'failed']
+
+                        current_status = f"Running: {len(running_jobs)}, Done: {len(completed_jobs)}, Failed: {len(failed_jobs)}"
+                        if current_status != last_status and verbose:
+                            print(f"📊 Status: {current_status}")
+                            last_status = current_status
+
+                        if not running_jobs and (completed_jobs or failed_jobs):
+                            if completed_jobs:
+                                if verbose:
+                                    print(f"✅ Foundation model continuation completed successfully!")
+                                return {
+                                    "message": "Foundation model continuation completed successfully",
+                                    "session_id": session_id,
+                                    "status": "completed",
+                                    "additional_epochs": data_passes
+                                }
+                            else:
+                                if verbose:
+                                    print(f"❌ Foundation model continuation failed")
+                                return {
+                                    "message": "Foundation model continuation failed",
+                                    "session_id": session_id,
+                                    "status": "failed",
+                                    "failed_jobs": failed_jobs
+                                }
+
+                        time.sleep(poll_interval)
+                    except Exception as poll_error:
+                        if verbose:
+                            print(f"⚠️ Error during polling: {poll_error}")
+                        time.sleep(poll_interval)
+
+                # Timeout
+                if verbose:
+                    print(f"⏱️ Polling timeout reached ({max_poll_time}s)")
+                return {
+                    "message": "Polling timeout",
+                    "session_id": session_id,
+                    "status": "timeout",
+                    "additional_epochs": data_passes
+                }
+
+            return response_data
+
+        except Exception as e:
+            if verbose:
+                print(f"❌ Error starting foundation model continuation: {e}")
+            raise
+
     def _train_single_predictor_with_file(
         self,
         session_id: str,
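Note: foundation_model_train_more polls the session until the continuation job finishes. Passing poll_interval=0 (or max_poll_time=0) skips the client-side polling loop and returns the immediate response, which pairs naturally with the webhooks parameter. A short usage sketch; the import path, endpoint, session ID, and webhook values are placeholders:

```python
from featrixsphere.client import FeatrixSphereClient  # import path assumed from the package layout

client = FeatrixSphereClient("https://sphere-api.featrix.com")  # placeholder endpoint

# Request 25 more passes over the data and return immediately (no client-side polling).
resp = client.foundation_model_train_more(
    session_id="abc123",   # placeholder session ID
    data_passes=25,
    poll_interval=0,       # 0 disables the polling loop shown above
    webhooks={"webhook_callback_secret": "example-secret"},  # optional; keys as documented above
)
print(resp.get("message"))
```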
@@ -4721,10 +5171,12 @@ class FeatrixSphereClient:
         target_column: str,
         target_column_type: str,
         epochs: int,
-
+        rare_label_value: str,
         class_imbalance: dict,
         optimize_for: str,
-
+        cost_false_positive: float = None,
+        cost_false_negative: float = None,
+        verbose: bool = True,
         webhooks: Dict[str, str] = None
     ) -> Dict[str, Any]:
         """
@@ -4753,12 +5205,16 @@ class FeatrixSphereClient:
             'optimize_for': optimize_for,
         }
 
-        if
-            data['
+        if rare_label_value:
+            data['rare_label_value'] = rare_label_value
 
         if class_imbalance:
             data['class_imbalance'] = json.dumps(class_imbalance)
 
+        if cost_false_positive is not None and cost_false_negative is not None:
+            data['cost_false_positive'] = str(cost_false_positive)
+            data['cost_false_negative'] = str(cost_false_negative)
+
         if webhooks:
             data['webhooks'] = json.dumps(webhooks)
 
@@ -5785,7 +6241,24 @@ class FeatrixSphereClient:
         if not file_path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
 
-
+        # Support CSV, Parquet, JSON, and JSONL files
+        file_path_str = str(file_path).lower()
+        if file_path_str.endswith('.parquet'):
+            df = pd.read_parquet(file_path)
+        elif file_path_str.endswith('.jsonl'):
+            # JSONL: one JSON object per line
+            import json
+            records = []
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    if line.strip():
+                        records.append(json.loads(line))
+            df = pd.DataFrame(records)
+        elif file_path_str.endswith('.json'):
+            # Regular JSON
+            df = pd.read_json(file_path)
+        else:
+            df = pd.read_csv(file_path)
 
         # Convert to JSON Tables format and clean NaNs
         table_data = JSONTablesEncoder.from_dataframe(df)
@@ -5939,11 +6412,11 @@ class FeatrixSphereClient:
     def run_csv_predictions(self, session_id: str, csv_file: str, target_column: str = None,
                             sample_size: int = None, remove_target: bool = True) -> Dict[str, Any]:
         """
-        Run predictions on a CSV file with automatic accuracy calculation.
+        Run predictions on a CSV, Parquet, JSON, or JSONL file with automatic accuracy calculation.
 
         Args:
             session_id: ID of session with trained predictor
-            csv_file: Path to CSV file
+            csv_file: Path to CSV, Parquet, JSON, or JSONL file
             target_column: Name of target column (for accuracy calculation)
             sample_size: Number of records to test (None = all records)
             remove_target: Whether to remove target column from prediction input
@@ -5953,8 +6426,24 @@ class FeatrixSphereClient:
         """
         import pandas as pd
 
-        # Load CSV
-
+        # Load CSV, Parquet, JSON, or JSONL
+        csv_file_lower = csv_file.lower()
+        if csv_file_lower.endswith('.parquet'):
+            df = pd.read_parquet(csv_file)
+        elif csv_file_lower.endswith('.jsonl'):
+            # JSONL: one JSON object per line
+            import json
+            records = []
+            with open(csv_file, 'r', encoding='utf-8') as f:
+                for line in f:
+                    if line.strip():
+                        records.append(json.loads(line))
+            df = pd.DataFrame(records)
+        elif csv_file_lower.endswith('.json'):
+            # Regular JSON
+            df = pd.read_json(csv_file)
+        else:
+            df = pd.read_csv(csv_file)
 
         # Handle target column
         actual_values = None
featrixsphere-0.2.1002/VERSION DELETED
@@ -1 +0,0 @@
-0.2.1001
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/MANIFEST.in RENAMED
File without changes
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/README.md RENAMED
File without changes
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/cli.py RENAMED
File without changes
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere/test_client.py RENAMED
File without changes
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/SOURCES.txt RENAMED
File without changes
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/dependency_links.txt RENAMED
File without changes
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/entry_points.txt RENAMED
File without changes
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/not-zip-safe RENAMED
File without changes
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/requires.txt RENAMED
File without changes
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/featrixsphere.egg-info/top_level.txt RENAMED
File without changes
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/requirements.txt RENAMED
File without changes
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/setup.cfg RENAMED
File without changes
{featrixsphere-0.2.1002 → featrixsphere-0.2.1206}/setup.py RENAMED
File without changes