ddi-fw 0.0.170__py3-none-any.whl → 0.0.172__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/core.py +40 -27
- {ddi_fw-0.0.170.dist-info → ddi_fw-0.0.172.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.170.dist-info → ddi_fw-0.0.172.dist-info}/RECORD +5 -5
- {ddi_fw-0.0.170.dist-info → ddi_fw-0.0.172.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.170.dist-info → ddi_fw-0.0.172.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/core.py
CHANGED
@@ -1,4 +1,5 @@
 import glob
+import logging
 from typing import Any, Dict, List, Optional, Type
 import numpy as np
 import pandas as pd
@@ -20,7 +21,6 @@ except ImportError:
         "Failed to import langchain.embeddings module. ")
 
 
-
 def stack(df_column):
     return np.stack(df_column.values)
 
@@ -56,25 +56,20 @@ def generate_sim_matrices_new(df, generated_vectors, columns, key_column="id"):
 
 class BaseDataset(BaseModel):
     dataset_name: str
-    index_path: str
+    index_path: Optional[str] = None
     dataset_splitter_type: Type[DatasetSplitter]
     class_column: str = 'class'
     dataframe: Optional[pd.DataFrame] = None
-    X_train:
-    X_test:
-    y_train:
-    y_test:
-    train_indexes:
-    test_indexes:
-    train_idx_arr:
-    val_idx_arr:
-    # train_idx_arr: Optional[List[np.ndarray]] = None
-    # val_idx_arr: Optional[List[np.ndarray]] = None
+    X_train: Optional[pd.DataFrame | np.ndarray] = None
+    X_test: Optional[pd.DataFrame | np.ndarray] = None
+    y_train: Optional[pd.Series | np.ndarray] = None
+    y_test: Optional[pd.Series | np.ndarray] = None
+    train_indexes: Optional[pd.Index] = None
+    test_indexes: Optional[pd.Index] = None
+    train_idx_arr: Optional[List[np.ndarray]] = None
+    val_idx_arr: Optional[List[np.ndarray]] = None
     columns: List[str] = []
 
-    # feature_process: FeatureProcessor
-    # similarity_matrix_service: SimilarityMatrixService
-
     class Config:
         arbitrary_types_allowed = True
 
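Taken together, these field changes let a dataset be built from pre-split data: index_path is now optional, and the split fields accept pandas objects or NumPy arrays directly. Below is a minimal sketch of that construction path, assuming BaseDataset can be instantiated as-is and that DatasetSplitter is importable from ddi_fw.datasets.dataset_splitter; names and shapes are illustrative, not from the package.

import numpy as np
from ddi_fw.datasets.core import BaseDataset
from ddi_fw.datasets.dataset_splitter import DatasetSplitter

# Pre-split data supplied up front; no index_path or dataframe needed.
ds = BaseDataset(
    dataset_name="demo",
    dataset_splitter_type=DatasetSplitter,  # still a required field
    X_train=np.random.rand(80, 4),
    y_train=np.random.randint(0, 2, size=80),
    X_test=np.random.rand(20, 4),
    y_test=np.random.randint(0, 2, size=20),
)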
@@ -93,7 +88,7 @@ class BaseDataset(BaseModel):
         # items.append([f'{column}_embedding', train_data,
         #               y_train_label, test_data, y_test_label])
         return items
-
+
     @computed_field
     @property
     def dataset_splitter(self) -> DatasetSplitter:
@@ -107,9 +102,22 @@ class BaseDataset(BaseModel):
         pass
 
     def load(self):
+        """
+        Load the dataset. If X_train, y_train, X_test, and y_test are already provided,
+        skip deriving them. Otherwise, derive them from the dataframe and indices.
+        """
+        if self.X_train is not None and self.y_train is not None and self.X_test is not None and self.y_test is not None:
+            # Data is already provided, no need to calculate
+            logging.info(
+                "X_train, y_train, X_test, and y_test are already provided. Skipping calculation.")
+            return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
+
         if self.index_path is None:
             raise Exception(
-                "There is no index path
+                "There is no index path. Please call split_dataset or provide indices.")
+
+        if self.dataframe is None:
+            raise Exception("There is no dataframe to derive data from.")
 
         try:
             train_idx_all, test_idx_all, train_idx_arr, val_idx_arr = self.__get_indexes__(
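Both the early-return path above and the full derivation path now yield the same eight values, so callers can unpack load() uniformly. A sketch, reusing the hypothetical ds from the earlier example (variable names are illustrative):

# load() returns the same 8-tuple whether data was provided or derived.
(X_train, X_test, y_train, y_test,
 train_indexes, test_indexes, train_idx_arr, val_idx_arr) = ds.load()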
@@ -119,9 +127,6 @@ class BaseDataset(BaseModel):
 
         self.prep()
 
-        if self.dataframe is None:
-            raise Exception("There is no dataframe")
-
         train = self.dataframe[self.dataframe.index.isin(train_idx_all)]
         test = self.dataframe[self.dataframe.index.isin(test_idx_all)]
 
@@ -135,7 +140,7 @@ class BaseDataset(BaseModel):
         self.train_idx_arr = train_idx_arr
         self.val_idx_arr = val_idx_arr
 
-        return self.X_train, self.X_test, self.y_train, self.y_test, self.
+        return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
 
     def __get_indexes__(self, path):
         train_index_path = path+'/train_indexes.txt'
@@ -167,14 +172,21 @@ class BaseDataset(BaseModel):
             f.write('\n'.join(str_indexes))
 
     def split_dataset(self, save_indexes: bool = False):
-
+        """
+        Split the dataset into training and testing sets. This method is only available
+        if a dataframe exists. If X_train, y_train, X_test, and y_test are already present,
+        raise an error.
+        """
+        if self.X_train is not None or self.X_test is not None:
+            raise Exception(
+                "X_train and X_test are already present. Splitting is not allowed.")
+
+        if self.dataframe is None:
+            raise Exception("There is no dataframe to split.")
 
         save_path = self.index_path
         self.prep()
 
-        if self.dataframe is None:
-            raise Exception("There is no data")
-
         X = self.dataframe.drop(self.class_column, axis=1)
         y = self.dataframe[self.class_column]
 
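Under the new guard, a dataset constructed from pre-split data (as in the earlier sketch) refuses to split again. A hedged illustration of the expected behavior:

try:
    ds.split_dataset(save_indexes=False)
except Exception as exc:
    # Expected: "X_train and X_test are already present. Splitting is not allowed."
    print(exc)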
@@ -206,8 +218,9 @@ class BaseDataset(BaseModel):
 
 
 class TextDatasetMixin(BaseDataset):
-    embedding_size: Optional[int] = None
-    embedding_dict: Dict[str, Any] = Field(
+    embedding_size: Optional[int] = None
+    embedding_dict: Dict[str, Any] = Field(
+        default_factory=dict, description="Dictionary for embeddings")
     embeddings_pooling_strategy: PoolingStrategy | None = None
 
     def process_text(self):
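The rewritten embedding_dict field uses Field(default_factory=dict, ...), which gives each model instance its own fresh dict rather than a shared mutable default. A self-contained illustration of that pydantic behavior (not code from the package):

from typing import Any, Dict
from pydantic import BaseModel, Field

class Holder(BaseModel):
    # default_factory creates a fresh dict per instance, so mutations
    # on one instance never leak into another.
    embedding_dict: Dict[str, Any] = Field(default_factory=dict)

a, b = Holder(), Holder()
a.embedding_dict["drug"] = [0.1, 0.2]
assert b.embedding_dict == {}  # b keeps its own empty dict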
{ddi_fw-0.0.170.dist-info → ddi_fw-0.0.172.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 ddi_fw/datasets/__init__.py,sha256=_I3iDHARwzmg7_EL5XKtB_TgG1yAkLSOVTujLL9Wz9Q,280
-ddi_fw/datasets/core.py,sha256=
+ddi_fw/datasets/core.py,sha256=jXPEMrlQ685qMEZ-Pj4izOVH7nkE62JtpMsDjfosBeQ,9350
 ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
@@ -97,7 +97,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
 ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
 ddi_fw/vectorization/feature_vector_generation.py,sha256=Z1A_DOBqDFPqLN4YB-3oYlOQWJK-X6Oes6UFjpzR47Q,4760
 ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
+ddi_fw-0.0.172.dist-info/METADATA,sha256=saohphdC9IZ8Fg4_QLDpyzEufhTWY_NBr2GzJqw5imU,2542
+ddi_fw-0.0.172.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ddi_fw-0.0.172.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.172.dist-info/RECORD,,
{ddi_fw-0.0.170.dist-info → ddi_fw-0.0.172.dist-info}/WHEEL
File without changes
{ddi_fw-0.0.170.dist-info → ddi_fw-0.0.172.dist-info}/top_level.txt
File without changes