BoiiiSplit 0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- boiiisplit-0.1/BoiiiSplit/__init__.py +1 -0
- boiiisplit-0.1/BoiiiSplit/main.py +47 -0
- boiiisplit-0.1/BoiiiSplit.egg-info/PKG-INFO +5 -0
- boiiisplit-0.1/BoiiiSplit.egg-info/SOURCES.txt +9 -0
- boiiisplit-0.1/BoiiiSplit.egg-info/dependency_links.txt +1 -0
- boiiisplit-0.1/BoiiiSplit.egg-info/requires.txt +1 -0
- boiiisplit-0.1/BoiiiSplit.egg-info/top_level.txt +1 -0
- boiiisplit-0.1/PKG-INFO +5 -0
- boiiisplit-0.1/README.md +0 -0
- boiiisplit-0.1/setup.cfg +4 -0
- boiiisplit-0.1/setup.py +12 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .main import BoiiiSplit
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def _align_target(X, y, features_labelled):
    """Return ``y`` as a pandas object whose rows line up positionally with ``X``."""
    target_labelled = isinstance(y, (pd.Series, pd.DataFrame))
    if not isinstance(y, pd.DataFrame):
        y = pd.Series(y)

    # Only align by label when BOTH inputs carried caller-provided labels and
    # those labels are unambiguous. An index synthesised here (array-like
    # input) says nothing about row identity, so such inputs pair up by
    # position instead.
    if (
        features_labelled
        and target_labelled
        and not y.index.equals(X.index)
        and X.index.is_unique
        and y.index.is_unique
    ):
        # Raises KeyError when a feature label is missing from the target.
        y = y.loc[X.index]

    if len(y) != len(X):
        raise ValueError(
            f"X and y must have the same number of rows "
            f"(got {len(X)} and {len(y)})."
        )
    return y


def train_test_cv_split(X, y, test_size=20, cv_size=20, random_state=42):
    """
    Split data into train, test and cross-validation sets using pandas.

    Rows are shuffled once (reproducibly, via ``random_state``) and then cut
    into three consecutive slices: test, CV, and the remainder as train.
    All returned objects have a fresh 0..k-1 index.

    Args:
        X (pd.DataFrame or array-like): Features.
        y (pd.Series, pd.DataFrame or array-like): Target, one row per row
            of ``X``.
        test_size (int or float): Percentage of rows for the test set (0-100).
        cv_size (int or float): Percentage of rows for the
            cross-validation/validation set (0-100).
        random_state: Seed (or random state object) forwarded to
            ``pandas.Series.sample`` for reproducibility.

    Returns:
        tuple: ``(X_train, X_test, X_cv, y_train, y_test, y_cv)``.

    Raises:
        ValueError: If a percentage is negative, if the two percentages sum
            to 100 or more, or if ``X`` and ``y`` differ in length.
        KeyError: If ``X`` and ``y`` are both labelled pandas objects with
            unique but different indexes and ``y`` lacks a label of ``X``.
    """
    # Record this before the conversion below gives array-likes a synthetic
    # RangeIndex that must not be mistaken for real labels.
    features_labelled = isinstance(X, (pd.DataFrame, pd.Series))
    X = pd.DataFrame(X)

    # 1. Verification
    if test_size < 0 or cv_size < 0:
        raise ValueError("test_size and cv_size must be non-negative.")
    if test_size + cv_size >= 100:
        raise ValueError("Sum of test_size and cv_size must be less than 100.")

    y = _align_target(X, y, features_labelled)
    n = len(X)

    # 2. Shuffle by POSITION rather than by label. Looking rows up by label
    # duplicates rows whenever the index has repeated labels; a positional
    # permutation is immune to that and keeps X and y in lock-step.
    order = (
        pd.Series(range(n))
        .sample(frac=1, random_state=random_state)
        .to_numpy()
    )
    X_shuffled = X.iloc[order].reset_index(drop=True)
    y_shuffled = y.iloc[order].reset_index(drop=True)

    # 3. Calculate split points. Multiply before dividing so that integer
    # percentages are computed exactly: n * (57 / 100) evaluates to
    # 56.99999999999999 for n == 100 and would silently drop a row.
    test_cutoff = int(n * test_size // 100)
    cv_cutoff = test_cutoff + int(n * cv_size // 100)

    # 4. Slice the data: [0, test_cutoff) -> test,
    # [test_cutoff, cv_cutoff) -> CV, [cv_cutoff, n) -> train.
    X_test, y_test = X_shuffled.iloc[:test_cutoff], y_shuffled.iloc[:test_cutoff]
    X_cv = X_shuffled.iloc[test_cutoff:cv_cutoff]
    y_cv = y_shuffled.iloc[test_cutoff:cv_cutoff]
    X_train, y_train = X_shuffled.iloc[cv_cutoff:], y_shuffled.iloc[cv_cutoff:]

    return X_train, X_test, X_cv, y_train, y_test, y_cv
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pandas
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
BoiiiSplit
|
boiiisplit-0.1/PKG-INFO
ADDED
boiiisplit-0.1/README.md
ADDED
|
File without changes
|
boiiisplit-0.1/setup.cfg
ADDED