featrixsphere 0.2.1141__py3-none-any.whl → 0.2.1206__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featrixsphere/__init__.py +1 -1
- featrixsphere/client.py +325 -16
- {featrixsphere-0.2.1141.dist-info → featrixsphere-0.2.1206.dist-info}/METADATA +1 -1
- featrixsphere-0.2.1206.dist-info/RECORD +9 -0
- featrixsphere-0.2.1141.dist-info/RECORD +0 -9
- {featrixsphere-0.2.1141.dist-info → featrixsphere-0.2.1206.dist-info}/WHEEL +0 -0
- {featrixsphere-0.2.1141.dist-info → featrixsphere-0.2.1206.dist-info}/entry_points.txt +0 -0
- {featrixsphere-0.2.1141.dist-info → featrixsphere-0.2.1206.dist-info}/top_level.txt +0 -0
featrixsphere/__init__.py
CHANGED
featrixsphere/client.py
CHANGED
@@ -660,6 +660,77 @@ class FeatrixSphereClient:
             )
             return response.json()

+    def publish_session(self, session_id: str) -> Dict[str, Any]:
+        """
+        Publish a session by moving it to /sphere/published/<sessionId>.
+        Moves both the session file and output directory.
+
+        Args:
+            session_id: Session ID to publish
+
+        Returns:
+            Response with published_path, output_path, and status
+
+        Example:
+            ```python
+            result = client.publish_session("abc123")
+            print(f"Published to: {result['published_path']}")
+            ```
+        """
+        response_data = self._post_json(f"/compute/session/{session_id}/publish", {})
+        return response_data
+
+    def deprecate_session(self, session_id: str, warning_message: str, expiration_date: str) -> Dict[str, Any]:
+        """
+        Deprecate a published session with a warning message and expiration date.
+        The session remains available until the expiration date.
+
+        Args:
+            session_id: Session ID to deprecate
+            warning_message: Warning message to display about deprecation
+            expiration_date: ISO format date string when session will be removed (e.g., "2025-12-31T23:59:59Z")
+
+        Returns:
+            Response with deprecation status
+
+        Example:
+            ```python
+            from datetime import datetime, timedelta
+
+            expiration = (datetime.now() + timedelta(days=90)).isoformat() + "Z"
+            result = client.deprecate_session(
+                session_id="abc123",
+                warning_message="This session will be removed on 2025-12-31",
+                expiration_date=expiration
+            )
+            ```
+        """
+        data = {
+            "warning_message": warning_message,
+            "expiration_date": expiration_date
+        }
+        response_data = self._post_json(f"/compute/session/{session_id}/deprecate", data)
+        return response_data
+
+    def unpublish_session(self, session_id: str) -> Dict[str, Any]:
+        """
+        Unpublish a session by moving it back from /sphere/published/<sessionId>.
+
+        Args:
+            session_id: Session ID to unpublish
+
+        Returns:
+            Response with unpublish status
+
+        Example:
+            ```python
+            result = client.unpublish_session("abc123")
+            print(f"Status: {result['status']}")
+            ```
+        """
+        response_data = self._post_json(f"/compute/session/{session_id}/unpublish", {})
+        return response_data
+
     def get_sessions_for_org(self, name_prefix: str, max_retries: int = None) -> Dict[str, Any]:
         """
         Get all sessions matching a name prefix across all compute nodes.
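Taken together, the three new methods give a session a publish, deprecate, unpublish lifecycle. A minimal sketch of how a caller might drive it, assuming an already-constructed FeatrixSphereClient named `client` and a hypothetical session ID:

```python
from datetime import datetime, timedelta

session_id = "abc123"  # hypothetical session ID

# Move the session to /sphere/published/<sessionId>
published = client.publish_session(session_id)
print(published["published_path"], published["status"])

# Later, mark it deprecated; it stays available until the expiration date
expiration = (datetime.now() + timedelta(days=90)).isoformat() + "Z"
client.deprecate_session(
    session_id=session_id,
    warning_message="Superseded by a newer model; will be removed after the expiration date.",
    expiration_date=expiration,
)

# Or pull it back out of the published area entirely
client.unpublish_session(session_id)
```

The expiration string follows the ISO format shown in the deprecate_session docstring above.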
@@ -1555,10 +1626,10 @@ class FeatrixSphereClient:

     def upload_file_and_create_session(self, file_path: Path, session_name_prefix: str = None, name: str = None, webhooks: Dict[str, str] = None) -> SessionInfo:
         """
-        Upload a CSV file and create a new session.
+        Upload a CSV, Parquet, JSON, or JSONL file and create a new session.

         Args:
-            file_path: Path to the CSV file to upload
+            file_path: Path to the CSV, Parquet, JSON, or JSONL file to upload
             session_name_prefix: Optional prefix for the session ID. Session will be named <prefix>-<full-uuid>
             name: Optional name for the embedding space/model (for identification and metadata)
             webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
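With the widened file support, an upload call only changes in the path it is given. A small usage sketch, assuming an existing `client` (FeatrixSphereClient) and a local events.parquet file, both hypothetical:

```python
from pathlib import Path

session = client.upload_file_and_create_session(
    file_path=Path("events.parquet"),   # .csv, .csv.gz, .parquet, .json, or .jsonl
    session_name_prefix="events",       # session will be named events-<full-uuid>
    name="events embedding space",
)
print(session)  # SessionInfo for the newly created session
```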
@@ -1622,7 +1693,7 @@ class FeatrixSphereClient:
                                      webhooks: Dict[str, str] = None,
                                      epochs: int = None) -> SessionInfo:
         """
-        Upload a pandas DataFrame or
+        Upload a pandas DataFrame, CSV file, Parquet file, JSON file, or JSONL file and create a new session.

         Special Column: __featrix_train_predictor
         ------------------------------------------
@@ -1630,7 +1701,7 @@ class FeatrixSphereClient:
        which rows are used for single predictor training.

        How it works:
-       - Add a boolean column "__featrix_train_predictor" to your DataFrame/CSV before upload
+       - Add a boolean column "__featrix_train_predictor" to your DataFrame/CSV/Parquet/JSON/JSONL before upload
        - Set it to True for rows you want to use for predictor training
        - Set it to False (or any other value) for rows to exclude from predictor training
        - Embedding space training uses ALL rows (ignores this column)
@@ -1664,7 +1735,7 @@ class FeatrixSphereClient:
        Args:
            df: pandas DataFrame to upload (optional if file_path is provided)
            filename: Name to give the uploaded file (default: "data.csv")
-           file_path: Path to CSV file to upload (optional if df is provided)
+           file_path: Path to CSV, Parquet, JSON, or JSONL file to upload (optional if df is provided)
            column_overrides: Dict mapping column names to types ("scalar", "set", "free_string", "free_string_list")
            column_types: Alias for column_overrides (for backward compatibility)
            string_list_delimiter: Delimiter for free_string_list columns (default: "|")
@@ -1705,21 +1776,90 @@ class FeatrixSphereClient:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File not found: {file_path}")

-           # Check if it's a
-
-
+           # Check if it's a supported file type
+           file_ext = file_path.lower()
+           if not file_ext.endswith(('.csv', '.csv.gz', '.parquet', '.json', '.jsonl')):
+               raise ValueError("File must be a CSV, Parquet, JSON, or JSONL file (with .csv, .csv.gz, .parquet, .json, or .jsonl extension)")

            print(f"Uploading file: {file_path}")

            # Read the file content
            if file_path.endswith('.gz'):
-               # Already gzipped
+               # Already gzipped CSV
                with gzip.open(file_path, 'rb') as f:
                    file_content = f.read()
                upload_filename = os.path.basename(file_path)
                content_type = 'application/gzip'
+           elif file_path.lower().endswith(('.json', '.jsonl')):
+               # JSON/JSONL file - read as DataFrame, convert to CSV, then compress
+               print(f"Reading {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} file...")
+               try:
+                   from featrix.neural.input_data_file import featrix_wrap_read_json_file
+                   json_df = featrix_wrap_read_json_file(file_path)
+                   if json_df is None:
+                       raise ValueError(f"Failed to parse {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} file")
+               except ImportError:
+                   # Fallback to pandas if featrix wrapper not available
+                   if file_path.lower().endswith('.jsonl'):
+                       # JSONL: one JSON object per line
+                       import json
+                       records = []
+                       with open(file_path, 'r', encoding='utf-8') as f:
+                           for line in f:
+                               if line.strip():
+                                   records.append(json.loads(line))
+                       json_df = pd.DataFrame(records)
+                   else:
+                       # Regular JSON
+                       json_df = pd.read_json(file_path)
+
+               # Clean NaN values before CSV conversion
+               cleaned_df = json_df.where(pd.notna(json_df), None)
+
+               # Convert to CSV and compress
+               csv_buffer = io.StringIO()
+               cleaned_df.to_csv(csv_buffer, index=False)
+               csv_data = csv_buffer.getvalue().encode('utf-8')
+
+               print(f"Compressing {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} (converted to CSV)...")
+               compressed_buffer = io.BytesIO()
+               with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+                   gz.write(csv_data)
+               file_content = compressed_buffer.getvalue()
+               upload_filename = os.path.basename(file_path).replace('.jsonl', '.csv.gz').replace('.json', '.csv.gz')
+               content_type = 'application/gzip'
+
+               original_size = len(csv_data)
+               compressed_size = len(file_content)
+               compression_ratio = (1 - compressed_size / original_size) * 100
+               print(f"Converted {'JSONL' if file_path.lower().endswith('.jsonl') else 'JSON'} to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
+           elif file_path.lower().endswith('.parquet'):
+               # Parquet file - read as DataFrame, convert to CSV, then compress
+               print("Reading Parquet file...")
+               parquet_df = pd.read_parquet(file_path)
+
+               # Clean NaN values before CSV conversion
+               cleaned_df = parquet_df.where(pd.notna(parquet_df), None)
+
+               # Convert to CSV and compress
+               csv_buffer = io.StringIO()
+               cleaned_df.to_csv(csv_buffer, index=False)
+               csv_data = csv_buffer.getvalue().encode('utf-8')
+
+               print("Compressing Parquet (converted to CSV)...")
+               compressed_buffer = io.BytesIO()
+               with gzip.GzipFile(fileobj=compressed_buffer, mode='wb') as gz:
+                   gz.write(csv_data)
+               file_content = compressed_buffer.getvalue()
+               upload_filename = os.path.basename(file_path).replace('.parquet', '.csv.gz')
+               content_type = 'application/gzip'
+
+               original_size = len(csv_data)
+               compressed_size = len(file_content)
+               compression_ratio = (1 - compressed_size / original_size) * 100
+               print(f"Converted Parquet to CSV and compressed from {original_size:,} to {compressed_size:,} bytes ({compression_ratio:.1f}% reduction)")
            else:
-               #
+               # Regular CSV file - read and compress it
                with open(file_path, 'rb') as f:
                    csv_content = f.read()

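The new branches above never send Parquet or JSON to the server directly; they load the file into a DataFrame, write it back out as CSV, and gzip that before upload. A standalone sketch of the same conversion using only pandas, io, and gzip, mirroring the code in the hunk (the events.parquet path is hypothetical):

```python
import gzip
import io
import os

import pandas as pd

file_path = "events.parquet"

# Read the source file and normalize missing values, as the client does
df = pd.read_parquet(file_path)
cleaned_df = df.where(pd.notna(df), None)

# Serialize to CSV and gzip the bytes
csv_buffer = io.StringIO()
cleaned_df.to_csv(csv_buffer, index=False)
csv_data = csv_buffer.getvalue().encode("utf-8")

compressed_buffer = io.BytesIO()
with gzip.GzipFile(fileobj=compressed_buffer, mode="wb") as gz:
    gz.write(csv_data)

upload_filename = os.path.basename(file_path).replace(".parquet", ".csv.gz")
print(f"{upload_filename}: {len(csv_data):,} -> {len(compressed_buffer.getvalue()):,} bytes")
```

So regardless of the original format, the server always receives a gzipped CSV whose name ends in .csv.gz.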
@@ -4753,7 +4893,7 @@ class FeatrixSphereClient:
                                        predictor_id: str = None, target_column: str = None,
                                        batch_size: int = 0, learning_rate: float = None,
                                        poll_interval: int = 30, max_poll_time: int = 3600,
-                                       verbose: bool = True) -> Dict[str, Any]:
+                                       verbose: bool = True, webhooks: Dict[str, str] = None) -> Dict[str, Any]:
        """
        Continue training an existing single predictor for more epochs.
        Loads the existing predictor and resumes training from where it left off.
@@ -4768,6 +4908,7 @@ class FeatrixSphereClient:
            poll_interval: Seconds between status checks (default: 30)
            max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
            verbose: Whether to print status updates (default: True)
+           webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)

        Returns:
            Response with continuation start confirmation or completion status
@@ -4799,6 +4940,8 @@ class FeatrixSphereClient:
            data["target_column"] = target_column
        if learning_rate is not None:
            data["learning_rate"] = learning_rate
+       if webhooks:
+           data["webhooks"] = webhooks

        if verbose:
            print(f"🔄 Continuing training for predictor on session {session_id}")
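The webhooks payload forwarded here is the same dict accepted by upload_file_and_create_session and, below, foundation_model_train_more. A sketch of what such a dict might look like; the keys come from the docstrings above, while the URLs and secret are placeholders:

```python
webhooks = {
    "webhook_callback_secret": "shared-secret-used-to-sign-callbacks",  # placeholder value
    "s3_backup_url": "s3://example-bucket/featrix-backups/",            # placeholder value
    "model_id_update_url": "https://example.com/hooks/model-id-update", # placeholder value
}

# When provided, the client passes it through unchanged in the request body:
# data["webhooks"] = webhooks
```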
@@ -4888,6 +5031,139 @@ class FeatrixSphereClient:
                print(f"❌ Error starting predictor continuation: {e}")
            raise

+   def foundation_model_train_more(self, session_id: str, es_id: str = None, data_passes: int = None,
+                                   epochs: int = None, poll_interval: int = 30, max_poll_time: int = 3600,
+                                   verbose: bool = True, webhooks: Dict[str, str] = None) -> Dict[str, Any]:
+       """
+       Continue training an existing foundation model (embedding space) for more epochs.
+       Loads the existing embedding space and resumes training from where it left off.
+
+       Args:
+           session_id: Session ID containing the trained foundation model
+           es_id: Embedding space ID (optional, uses session's ES if not provided)
+           data_passes: Additional epochs to train (preferred, default: 50)
+           epochs: Additional epochs to train (deprecated, use data_passes instead, for compatibility)
+           poll_interval: Seconds between status checks (default: 30)
+           max_poll_time: Maximum time to poll in seconds (default: 3600 = 1 hour)
+           verbose: Whether to print status updates (default: True)
+           webhooks: Optional dict with webhook configuration keys (webhook_callback_secret, s3_backup_url, model_id_update_url)
+
+       Returns:
+           Response with continuation start confirmation or completion status
+
+       Example:
+           ```python
+           # Continue training for 50 more epochs
+           result = client.foundation_model_train_more(
+               session_id="abc123",
+               data_passes=50
+           )
+           ```
+       """
+       # Support both data_passes and epochs for compatibility
+       if data_passes is None and epochs is None:
+           data_passes = 50  # Default
+       elif data_passes is None:
+           data_passes = epochs  # Use epochs if data_passes not provided
+       # If both provided, data_passes takes precedence
+
+       if data_passes <= 0:
+           raise ValueError("data_passes (or epochs) must be > 0 (specify additional epochs to train)")
+
+       data = {
+           "data_passes": data_passes,
+       }
+
+       if es_id:
+           data["es_id"] = es_id
+       if webhooks:
+           data["webhooks"] = webhooks
+
+       if verbose:
+           print(f"🔄 Continuing training for foundation model on session {session_id}")
+           print(f"   Additional epochs: {data_passes}")
+           if es_id:
+               print(f"   ES ID: {es_id}")
+
+       try:
+           response_data = self._post_json(f"/compute/session/{session_id}/train_foundation_model_more", data)
+
+           if verbose:
+               print(f"✅ Foundation model continuation started: {response_data.get('message')}")
+
+           # Poll for completion if requested
+           if poll_interval > 0 and max_poll_time > 0:
+               import time
+               start_time = time.time()
+               last_status = ""
+
+               while time.time() - start_time < max_poll_time:
+                   try:
+                       session_info = self.get_session_status(session_id)
+                       jobs = session_info.jobs if hasattr(session_info, 'jobs') else {}
+
+                       # Find continuation jobs
+                       es_jobs = {j_id: j for j_id, j in jobs.items()
+                                  if j.get('type') == 'train_es'}
+
+                       if not es_jobs:
+                           if verbose:
+                               print("✅ No continuation jobs found - training may have completed")
+                           break
+
+                       # Check job statuses
+                       running_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'running']
+                       completed_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'done']
+                       failed_jobs = [j_id for j_id, j in es_jobs.items() if j.get('status') == 'failed']
+
+                       current_status = f"Running: {len(running_jobs)}, Done: {len(completed_jobs)}, Failed: {len(failed_jobs)}"
+                       if current_status != last_status and verbose:
+                           print(f"📊 Status: {current_status}")
+                       last_status = current_status
+
+                       if not running_jobs and (completed_jobs or failed_jobs):
+                           if completed_jobs:
+                               if verbose:
+                                   print(f"✅ Foundation model continuation completed successfully!")
+                               return {
+                                   "message": "Foundation model continuation completed successfully",
+                                   "session_id": session_id,
+                                   "status": "completed",
+                                   "additional_epochs": data_passes
+                               }
+                           else:
+                               if verbose:
+                                   print(f"❌ Foundation model continuation failed")
+                               return {
+                                   "message": "Foundation model continuation failed",
+                                   "session_id": session_id,
+                                   "status": "failed",
+                                   "failed_jobs": failed_jobs
+                               }
+
+                       time.sleep(poll_interval)
+                   except Exception as poll_error:
+                       if verbose:
+                           print(f"⚠️ Error during polling: {poll_error}")
+                       time.sleep(poll_interval)
+
+               # Timeout
+               if verbose:
+                   print(f"⏱️ Polling timeout reached ({max_poll_time}s)")
+               return {
+                   "message": "Polling timeout",
+                   "session_id": session_id,
+                   "status": "timeout",
+                   "additional_epochs": data_passes
+               }
+
+           return response_data
+
+       except Exception as e:
+           if verbose:
+               print(f"❌ Error starting foundation model continuation: {e}")
+           raise
+
    def _train_single_predictor_with_file(
        self,
        session_id: str,
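foundation_model_train_more both starts the continuation and, by default, polls the session's train_es jobs until they complete, fail, or the timeout is reached. A usage sketch assuming an existing `client` and a hypothetical session ID:

```python
# Fire-and-forget: a poll_interval (or max_poll_time) of 0 skips the polling loop,
# so the raw start response is returned immediately
started = client.foundation_model_train_more(
    session_id="abc123",
    data_passes=25,
    poll_interval=0,
)
print(started)

# Blocking call: poll every 60s for up to 2 hours and report the outcome
result = client.foundation_model_train_more(
    session_id="abc123",
    data_passes=50,
    poll_interval=60,
    max_poll_time=7200,
)
print(result["status"])  # "completed", "failed", or "timeout" when polling is enabled
```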
@@ -5965,7 +6241,24 @@ class FeatrixSphereClient:
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

-
+       # Support CSV, Parquet, JSON, and JSONL files
+       file_path_str = str(file_path).lower()
+       if file_path_str.endswith('.parquet'):
+           df = pd.read_parquet(file_path)
+       elif file_path_str.endswith('.jsonl'):
+           # JSONL: one JSON object per line
+           import json
+           records = []
+           with open(file_path, 'r', encoding='utf-8') as f:
+               for line in f:
+                   if line.strip():
+                       records.append(json.loads(line))
+           df = pd.DataFrame(records)
+       elif file_path_str.endswith('.json'):
+           # Regular JSON
+           df = pd.read_json(file_path)
+       else:
+           df = pd.read_csv(file_path)

        # Convert to JSON Tables format and clean NaNs
        table_data = JSONTablesEncoder.from_dataframe(df)
@@ -6119,11 +6412,11 @@ class FeatrixSphereClient:
    def run_csv_predictions(self, session_id: str, csv_file: str, target_column: str = None,
                            sample_size: int = None, remove_target: bool = True) -> Dict[str, Any]:
        """
-       Run predictions on a CSV file with automatic accuracy calculation.
+       Run predictions on a CSV, Parquet, JSON, or JSONL file with automatic accuracy calculation.

        Args:
            session_id: ID of session with trained predictor
-           csv_file: Path to CSV file
+           csv_file: Path to CSV, Parquet, JSON, or JSONL file
            target_column: Name of target column (for accuracy calculation)
            sample_size: Number of records to test (None = all records)
            remove_target: Whether to remove target column from prediction input
@@ -6133,8 +6426,24 @@ class FeatrixSphereClient:
        """
        import pandas as pd

-       # Load CSV
-
+       # Load CSV, Parquet, JSON, or JSONL
+       csv_file_lower = csv_file.lower()
+       if csv_file_lower.endswith('.parquet'):
+           df = pd.read_parquet(csv_file)
+       elif csv_file_lower.endswith('.jsonl'):
+           # JSONL: one JSON object per line
+           import json
+           records = []
+           with open(csv_file, 'r', encoding='utf-8') as f:
+               for line in f:
+                   if line.strip():
+                       records.append(json.loads(line))
+           df = pd.DataFrame(records)
+       elif csv_file_lower.endswith('.json'):
+           # Regular JSON
+           df = pd.read_json(csv_file)
+       else:
+           df = pd.read_csv(csv_file)

        # Handle target column
        actual_values = None
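run_csv_predictions keeps its name and csv_file parameter but now dispatches on the file extension, so the same call works for the other formats. A sketch, again assuming an existing `client`, a trained predictor on the session, and a hypothetical holdout.jsonl file:

```python
results = client.run_csv_predictions(
    session_id="abc123",
    csv_file="holdout.jsonl",   # .csv, .parquet, .json, or .jsonl
    target_column="label",      # used for accuracy calculation
    sample_size=500,            # or None to score every record
)
print(results)
```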
featrixsphere-0.2.1206.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+featrixsphere/__init__.py,sha256=6sjz6sAZEsBAotAiefePXdOHnoBmovpFqVy4vq4wONE,1888
+featrixsphere/cli.py,sha256=AW9O3vCvCNJ2UxVGN66eRmeN7XLSiHJlvK6JLZ9UJXc,13358
+featrixsphere/client.py,sha256=C9b_x2aRrGJJI0UCN2GtAdoSjIsMzAR9EKGbJvAzzPE,374039
+featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
+featrixsphere-0.2.1206.dist-info/METADATA,sha256=SjuRakp3SS59KnY4N7r65ZnlLp7Lg0O1DAq_NgQZ1fo,16232
+featrixsphere-0.2.1206.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+featrixsphere-0.2.1206.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
+featrixsphere-0.2.1206.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
+featrixsphere-0.2.1206.dist-info/RECORD,,
featrixsphere-0.2.1141.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-featrixsphere/__init__.py,sha256=FMxe64cn4iu9Ce5UDkOAtWZQMeWSijwX-tsiTDvblkM,1888
-featrixsphere/cli.py,sha256=AW9O3vCvCNJ2UxVGN66eRmeN7XLSiHJlvK6JLZ9UJXc,13358
-featrixsphere/client.py,sha256=TsiV-nr0VbBS1jJfidk5zrhOx6StolKsSn_txH0wmmg,358958
-featrixsphere/test_client.py,sha256=4SiRbib0ms3poK0UpnUv4G0HFQSzidF3Iswo_J2cjLk,11981
-featrixsphere-0.2.1141.dist-info/METADATA,sha256=27KEfgeXQqUNAlO3HIFhYkJU43YN3RdCjTJ_-viNJow,16232
-featrixsphere-0.2.1141.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-featrixsphere-0.2.1141.dist-info/entry_points.txt,sha256=QreJeYfD_VWvbEqPmMXZ3pqqlFlJ1qZb-NtqnyhEldc,51
-featrixsphere-0.2.1141.dist-info/top_level.txt,sha256=AyN4wjfzlD0hWnDieuEHX0KckphIk_aC73XCG4df5uU,14
-featrixsphere-0.2.1141.dist-info/RECORD,,
{featrixsphere-0.2.1141.dist-info → featrixsphere-0.2.1206.dist-info}/WHEEL
File without changes
{featrixsphere-0.2.1141.dist-info → featrixsphere-0.2.1206.dist-info}/entry_points.txt
File without changes
{featrixsphere-0.2.1141.dist-info → featrixsphere-0.2.1206.dist-info}/top_level.txt
File without changes