ddi-fw 0.0.264__tar.gz → 0.0.266__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/PKG-INFO +1 -1
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/pyproject.toml +1 -1
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/datasets/core.py +60 -92
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/ml/ml_helper.py +96 -30
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/pipeline/multi_pipeline.py +2 -12
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/pipeline/pipeline.py +11 -14
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw.egg-info/PKG-INFO +1 -1
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/README.md +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/setup.cfg +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/datasets/__init__.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/datasets/dataset_splitter.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/datasets/db_utils.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/datasets/setup_._py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/langchain/__init__.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/langchain/chroma_storage.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/langchain/embeddings.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/langchain/faiss_storage.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/langchain/storage.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/ml/__init__.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/ml/evaluation_helper.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/ml/model_wrapper.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/ml/pytorch_wrapper.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/ml/tensorflow_wrapper.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/ml/tracking_service.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/ner/__init__.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/ner/mmlrestclient.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/ner/ner.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/pipeline/__init__.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/pipeline/multi_pipeline_org.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/pipeline/ner_pipeline.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/utils/__init__.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/utils/categorical_data_encoding_checker.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/utils/enums.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/utils/json_helper.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/utils/kaggle.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/utils/numpy_utils.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/utils/package_helper.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/utils/py7zr_helper.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/utils/utils.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/utils/zip_helper.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/vectorization/__init__.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/vectorization/feature_vector_generation.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw/vectorization/idf_helper.py +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw.egg-info/SOURCES.txt +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw.egg-info/requires.txt +0 -0
- {ddi_fw-0.0.264 → ddi_fw-0.0.266}/src/ddi_fw.egg-info/top_level.txt +0 -0
```diff
--- ddi_fw-0.0.264/src/ddi_fw/datasets/core.py
+++ ddi_fw-0.0.266/src/ddi_fw/datasets/core.py
@@ -75,12 +75,58 @@ class BaseDataset(BaseModel, abc.ABC):
     val_idx_arr: Optional[List[np.ndarray]] = None
     columns: List[str] = []
     additional_config: Optional[Dict[str, Any]] = None
+    input_processing: Optional[List[Dict[str, Any]]] = None
 
     class Config:
         arbitrary_types_allowed = True
+
+    def process_input_data(self, data, processing_config=None):
+
+        if not processing_config:
+            return data
+        if processing_config.get("flatten", False):
+            print("Flattening data...")
+            data = np.array(data).flatten()
+            print(f"Data shape after flattening: {data.shape}")
+
+        if processing_config.get("stack", False):
+            print("Stacking data...")
+            data = np.stack(data)
+            print(f"Data shape after stacking: {data.shape}")
+        if not isinstance(data, np.ndarray):
+            data = np.array(data)
+        # if processing_config.get("flatten", False):
+        #     data = np.stack(data.flatten().tolist())
+        # Ensure we start with a NumPy array
+
+
+        # Normalize input
+        if processing_config.get("normalize", False):
+            data = data.astype(np.float32)
+            max_val = np.max(data)
+            if max_val > 1:
+                data /= max_val
+
+        # Reshape input (for images etc.)
+        if "reshape" in processing_config:
+            try:
+                target_shape = tuple(processing_config["reshape"])
+                data = data.reshape((-1, *target_shape))
+            except Exception as e:
+                raise ValueError(f"Reshape failed for data with shape {data.shape}: {e}")
+
+
+        return data
 
     # TODO if no columns are given, all features are used; how should this be handled in the pipeline?
     def produce_inputs(self):
+        # Grouping the list by "column" key
+        grouped_data = defaultdict(dict)
+
+        if self.input_processing:
+            for item in self.input_processing:
+                grouped_data[item["column"]] = item
+
         items = []
         if self.X_train is None or self.X_test is None:
             raise Exception("There is no data to produce inputs")
```
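The new `process_input_data` hook is driven by a plain per-column dictionary. The sketch below reproduces what the method does for one plausible configuration; the keys (`stack`, `normalize`, `reshape`, `flatten`) are the ones read above, while the sample column name, values, and shapes are made up for illustration.

```python
import numpy as np

# Hypothetical processing entry for one column; only the keys mirror the
# configuration read by process_input_data above, the values are illustrative.
processing_config = {"column": "smile", "stack": True, "normalize": True, "reshape": [16, 16]}

# Toy column data: a list of 256-dimensional feature vectors.
data = [np.random.rand(256) for _ in range(8)]

data = np.stack(data)              # "stack": True
data = data.astype(np.float32)     # "normalize": True -> divide by max if it exceeds 1
if np.max(data) > 1:
    data /= np.max(data)
data = data.reshape((-1, 16, 16))  # "reshape": [16, 16] -> reshape to (-1, 16, 16)
print(data.shape)                  # (8, 16, 16)
```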
```diff
@@ -90,40 +136,30 @@ class BaseDataset(BaseModel, abc.ABC):
         if self.columns is None or len(self.columns) == 0 or len(self.columns) == 1:
             # If no columns or only one column are provided, do not change the data
             # and use the entire dataset as a single input.
-            train_data, test_data = self.X_train[:, :], self.X_test[:, :]
-
-            train_data,test_data = np.stack(train_data.flatten().tolist()), np.stack(test_data.flatten().tolist())
             column = self.columns[0] if self.columns else 'default'
+            train_data, test_data = self.X_train[:, :], self.X_test[:, :]
+            processing_config = grouped_data[column]
+            train_data = self.process_input_data(train_data, processing_config)
+            test_data = self.process_input_data(test_data, processing_config)
+            # train_data,test_data = np.stack(train_data.flatten().tolist()), np.stack(test_data.flatten().tolist())
             items.append([f'{column}', np.nan_to_num(train_data),
                           y_train_label, np.nan_to_num(test_data), y_test_label])
         else:
             for index, column in enumerate(self.columns):
+                processing_config = grouped_data[column]
                 train_data, test_data = self.X_train[:,
                                                      index], self.X_test[:, index]
                 # TODO merge the line above with the one below; avoid converting via tolist(), it is already a NumPy array, but the warning forced this workaround
-                train_data
+                train_data = self.process_input_data(train_data, processing_config)
+                test_data = self.process_input_data(test_data, processing_config)
+                # train_data,test_data = np.stack(train_data.tolist()), np.stack(test_data.tolist())
                 items.append([f'{column}', np.nan_to_num(train_data),
                               y_train_label, np.nan_to_num(test_data), y_test_label])
 
             # items.append([f'{column}_embedding', train_data,
             #               y_train_label, test_data, y_test_label])
         return items
-
-    def produce_inputs_ex(self):
-        items = []
-        if self.X_train is None or self.X_test is None:
-            raise Exception("There is no data to produce inputs")
-        y_train_label, y_test_label = stack(self.y_train), stack(self.y_test)
-
-        for column in self.columns:
-            train_data, test_data = stack(
-                self.X_train[column]), stack(self.X_test[column])
-            items.append([f'{column}', np.nan_to_num(train_data),
-                          y_train_label, np.nan_to_num(test_data), y_test_label])
-
-            # items.append([f'{column}_embedding', train_data,
-            #               y_train_label, test_data, y_test_label])
-        return items
+
 
     @computed_field
     @property
```
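Since `produce_inputs` looks the processing entry up by column, a dataset can carry one entry per feature column it exposes. A possible `input_processing` list, borrowing column names from the comment in `pipeline.py` further down ('smile', 'enzyme', 'pathway'); the flags themselves are illustrative.

```python
# Per-column processing entries as grouped by produce_inputs via their
# "column" key; names and flag values are placeholders, not package defaults.
input_processing = [
    {"column": "smile",   "stack": True, "normalize": True},
    {"column": "enzyme",  "stack": True},
    {"column": "pathway", "stack": True, "reshape": [28, 28]},
]
```

Columns without an entry fall back to the `defaultdict(dict)` in `grouped_data`, so `process_input_data` receives an empty config and returns their data unchanged.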
```diff
@@ -294,72 +330,7 @@ class TextDatasetMixin(BaseModel):
 
     class Config:
         arbitrary_types_allowed = True
-
-    # def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
-    #     """
-    #     Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
-
-    #     Args:
-    #     - vector_db_persist_directory (str): The path to the directory where the Chroma vector database is stored.
-    #     - vector_db_collection_name (str): The name of the collection to query.
-    #     - embedding_dict (dict): The existing dictionary to update with embeddings.
-
-    #     """
-    #     if vector_db_persist_directory:
-    #         # Initialize the Chroma client and get the collection
-    #         vector_db = chromadb.PersistentClient(
-    #             path=vector_db_persist_directory)
-    #         collection = vector_db.get_collection(vector_db_collection_name)
-    #         # include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
-    #         include: chromadb.Include = ["embeddings","metadatas"]
-    #         dictionary: chromadb.GetResult
-    #         # Fetch the embeddings and metadata
-    #         if column == None:
-    #             dictionary = collection.get(
-    #                 include=include
-    #                 # include=['embeddings', 'metadatas']
-    #             )
-    #             print(
-    #                 f"Embeddings are calculated from {vector_db_collection_name}")
-    #         else:
-    #             dictionary = collection.get(
-    #                 include=include,
-    #                 # include=['embeddings', 'metadatas'],
-    #                 where={
-    #                     "type": {"$eq": f"{column}"}})
-    #             print(
-    #                 f"Embeddings of {column} are calculated from {vector_db_collection_name}")
-
-    #         # Populate the embedding dictionary with embeddings from the vector database
-    #         metadatas = dictionary["metadatas"]
-    #         embeddings = dictionary["embeddings"]
-    #         if metadatas is None or embeddings is None:
-    #             raise ValueError(
-    #                 "The collection does not contain embeddings or metadatas.")
-    #         for metadata, embedding in zip(metadatas, embeddings):
-    #             embedding_dict[metadata["type"]
-    #                            ][metadata["id"]].append(embedding)
-
-    #     else:
-    #         raise ValueError(
-    #             "Persistent directory for the vector DB is not specified.")
-
-    # def __initialize_embedding_dict(self):
-    #     embedding_dict = defaultdict(lambda: defaultdict(list))
-    #     if self.column_embedding_configs:
-    #         for item in self.column_embedding_configs:
-    #             col = item["column"]
-    #             col_db_dir = item["vector_db_persist_directory"]
-    #             col_db_collection = item["vector_db_collection_name"]
-    #             self.__create_or_update_embeddings__(embedding_dict, col_db_dir, col_db_collection, col)
-    #     elif self.vector_db_persist_directory:
-    #         self.__create_or_update_embeddings__(embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
-    #     else:
-    #         logging.warning("There is no configuration of Embeddings")
-    #         raise ValueError(
-    #             "There is no configuration of Embeddings. Please provide a vector database directory and collection name.")
-    #     return embedding_dict
-
+
     def __calculate_embedding_size(self):
         if not self.embedding_dict:
             raise ValueError("Embedding dictionary is not initialized, embedding size cannot be calculated.")
```
```diff
@@ -370,15 +341,12 @@ class TextDatasetMixin(BaseModel):
     def process_text(self):
         logging.info("Processing text data...")
 
-
-        # kwargs = {"columns": self.columns}
-        # if self.ner_threshold:
-        #     for k, v in self.ner_threshold.items():
-        #         kwargs[k] = v
+
         if not self.embedding_dict:
             if self.vector_store_manager is not None:
                 self.embedding_dict = self.vector_store_manager.initialize_embedding_dict()
-
+            else:
+                raise ValueError("Either embedding_dict or vector_store_manager must be provided for text processing.")
         self.__calculate_embedding_size()
 
 
```
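With this change `process_text` fails fast: either an `embedding_dict` or a `vector_store_manager` has to be supplied. A minimal sketch of the dictionary shape implied by the removed commented-out Chroma code above (embeddings grouped by column type, then by id); the ids and vectors here are made up.

```python
import numpy as np
from collections import defaultdict

# embedding_dict[column_type][item_id] -> list of embedding vectors,
# matching the structure the removed commented-out code used to build.
embedding_dict = defaultdict(lambda: defaultdict(list))
embedding_dict["description"]["DB00001"].append(np.random.rand(384))
embedding_dict["description"]["DB00002"].append(np.random.rand(384))
```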
```diff
--- ddi_fw-0.0.264/src/ddi_fw/ml/ml_helper.py
+++ ddi_fw-0.0.266/src/ddi_fw/ml/ml_helper.py
@@ -3,7 +3,7 @@ from ddi_fw.ml.pytorch_wrapper import PTModelWrapper
 from ddi_fw.ml.tensorflow_wrapper import TFModelWrapper
 from ddi_fw.utils.package_helper import get_import
 import numpy as np
-from ddi_fw.ml.evaluation_helper import
+from ddi_fw.ml.evaluation_helper import evaluate
 
 # import tf2onnx
 # import onnx
```
```diff
@@ -48,7 +48,8 @@ class MultiModalRunner:
     def __predict(self, single_results):
         item_dict = {t[0]: t for t in self.items}
         if self.default_model is None and not self.multi_modal:
-            raise Exception(
+            raise Exception(
+                "Default model and multi modal cannot be None at the same time")
 
         if self.multi_modal:
             for m in self.multi_modal:
```
```diff
@@ -56,49 +57,113 @@ class MultiModalRunner:
                 # input_type = m.get('input_type')
                 input = m.get('input')
                 inputs = m.get('inputs')
-
-
+                if m.get("model_type") is None:
+                    model_type = self.default_model.get("model_type")
+                    kwargs = self.default_model.get('params')
+                else:
+                    model_type = get_import(m.get("model_type"))
+                    kwargs = m.get('params')
+
+                if model_type is None:
+                    raise Exception(
+                        "model_type cannot be None, it should be defined in multi_modal or default_model")
+
                 T = self.__create_model(self.library)
                 single_modal = T(self.date, name, model_type,
-
-
-                if input
-                raise
-
+                                 tracking_service=self.tracking_service, **kwargs)
+
+                if input and inputs:
+                    raise ValueError(
+                        "Only one of 'input' or 'inputs' should be defined.")
+                if not input and not inputs:
+                    raise ValueError(
+                        "At least one of 'input' or 'inputs' must be defined.")
+
+                if input and not isinstance(input, str):
+                    raise ValueError(
+                        "'input' should be a single string. For multiple inputs, use 'inputs'.")
+
+                # Get stacking and reshaping config
+                force_stack = m.get("force_stack", True)
+                reshape_dims = m.get("reshape")
+                train_data, train_label, test_data, test_label = None, None, None, None
+                # --- SINGLE INPUT CASE ---
                 if input:
                     item = item_dict[input]
-
-
+                    train_data = item[1]
+                    train_label = item[2]
+                    test_data = item[3]
+                    test_label = item[4]
+
+                    # Optional: force stack single input to simulate extra dimension
+                    if force_stack:
+                        train_data = np.expand_dims(train_data, axis=1)
+                        test_data = np.expand_dims(test_data, axis=1)
+
+                # --- MULTIPLE INPUTS CASE ---
                 elif inputs:
-                    # check keys
                     filtered_dict = {k: item_dict[k]
-
-
+                                     for k in inputs if k in item_dict}
+                    if not filtered_dict:
+                        raise ValueError(
+                            f"No matching inputs found in item_dict for: {inputs}")
+
                     first_input = next(iter(filtered_dict.values()))
                     train_data_list = [f[1] for f in filtered_dict.values()]
                     test_data_list = [f[3] for f in filtered_dict.values()]
-                    train_data = np.stack(train_data_list, axis=1)
-                    test_data = np.stack(test_data_list, axis=1)
                     train_label = first_input[2]
                     test_label = first_input[4]
-
-
+
+                    # Stack across inputs
+                    if len(train_data_list) == 1:
+                        train_data = train_data_list[0]
+                        test_data = test_data_list[0]
+
+                    if force_stack:
+                        train_data = np.stack(train_data_list, axis=1)
+                        test_data = np.stack(test_data_list, axis=1)
+
+                    else:
+                        # train_data = np.concatenate(train_data_list, axis=0)
+                        # test_data = np.concatenate(test_data_list, axis=0)
+                        train_data = np.array(train_data_list).T
+                        test_data = np.array(test_data_list).T
                 else:
                     raise Exception("check configurations")
+
+                # --- OPTIONAL: Reshape if needed ---
+                if reshape_dims:
+                    train_data = train_data.reshape((-1, *reshape_dims))
+                    test_data = test_data.reshape((-1, *reshape_dims))
+
+                # --- Finalize ---
+                single_modal.set_data(
+                    self.train_idx_arr, self.val_idx_arr,
+                    train_data, train_label,
+                    test_data, test_label
+                )
+
                 logs, metrics, prediction = single_modal.fit_and_evaluate()
                 self.result.add_metric(name, metrics)
                 single_results[name] = prediction
-        else:
+        else: # TODO default model maybe?
             print("Default model will be used")
+            if self.default_model is None:
+                raise Exception(
+                    "Default model cannot be None if multi_modal is not defined")
+            if self.default_model.get("model_type") is None:
+                raise Exception(
+                    "model_type cannot be None, it should be defined in default_model")
+
             model_type = get_import(self.default_model.get("model_type"))
             kwargs = self.default_model.get('params')
             for item in self.items:
                 name = item[0]
                 T = self.__create_model(self.library)
                 single_modal = T(self.date, name, model_type,
-
+                                 tracking_service=self.tracking_service, **kwargs)
                 single_modal.set_data(
-
+                    self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
 
                 logs, metrics, prediction = single_modal.fit_and_evaluate()
                 self.result.add_metric(name, metrics)
```
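Each `multi_modal` entry can now omit `model_type` (falling back to `default_model`), must declare exactly one of `input` or `inputs`, and may tune `force_stack` and `reshape`. A hedged sketch of two such entries; the `name` key, import paths, column names, and shapes are placeholders rather than values taken from the package.

```python
# Hypothetical multi_modal configuration as __predict would consume it.
multi_modal = [
    {
        "name": "smile_cnn",                  # assumed key for the modality/model name
        "input": "smile",                     # single column -> np.expand_dims when force_stack
        "model_type": "ddi_fw.ml.SomeModel",  # resolved through get_import(); placeholder path
        "params": {"epochs": 50},
        "force_stack": True,
        "reshape": [16, 16],                  # applied as data.reshape((-1, 16, 16))
    },
    {
        "name": "enzyme_pathway_model",
        "inputs": ["enzyme", "pathway"],      # stacked along axis=1 when force_stack (default True)
        # "model_type" omitted -> model_type and params come from default_model
    },
]
```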
```diff
@@ -116,14 +181,15 @@ class MultiModalRunner:
         combinations = []
         for i in range(2, len(l) + 1):
             combinations.extend(list(itertools.combinations(l, i)))  # all
-
+
         def _f():
             self.__predict(single_results)
             if combinations:
                 self.evaluate_combinations(single_results, combinations)
-
+
         if self.tracking_service:
-            self.tracking_service.run(
+            self.tracking_service.run(
+                run_name=self.prefix, description="***", func=_f, nested_run=False)
         else:
             self.__predict(single_results)
             if combinations:
```
```diff
@@ -138,13 +204,13 @@ class MultiModalRunner:
             def evaluate_combination(artifact_uri=None):
                 self.__evaluate_combinations(
                     single_results, combination, combination_descriptor, artifact_uri
-
-
-            self.tracking_service.run(run_name=combination_descriptor,
-                                      description="***",
-                                      nested_run=True,
+                )
+
+            self.tracking_service.run(run_name=combination_descriptor,
+                                      description="***",
+                                      nested_run=True,
                                       func=evaluate_combination)
-
+
             # with mlflow.start_run(run_name=combination_descriptor, description="***", nested=True) as combination_run:
             #     self.__evaluate_combinations(
             #         single_results, combination, combination_descriptor, combination_run.info.artifact_uri)
```
```diff
--- ddi_fw-0.0.264/src/ddi_fw/pipeline/multi_pipeline.py
+++ ddi_fw-0.0.266/src/ddi_fw/pipeline/multi_pipeline.py
@@ -126,6 +126,7 @@ class MultiPipeline():
         dataset_splitter_type = get_import(dataset_config.get("dataset_splitter_type"))
         columns = dataset_config.get("columns", [])
         additional_config = dataset_config.get("additional_config", {})
+        input_processing = dataset_config.get("input_processing", [])
 
         # Vector database configuration
         vector_database = config.get("vector_databases", {})
```
```diff
@@ -149,19 +150,7 @@ class MultiPipeline():
 
         # Default model configuration
         default_model = config.get("default_model", {})
-        if default_model:
-            default_model_type = get_import(default_model.get("model_type"))
-            default_model_params = default_model.get("params", {})
-
         multi_modal = config.get("multi_modal")
-
-
-
-        #ner move it to related dataset
-
-        # ner_data_file = config.get("ner_data_file")
-        # ner_threshold = config.get("ner_threshold")
-
 
         combination_type = None
         kwargs_combination_params=None
```
```diff
@@ -185,6 +174,7 @@ class MultiPipeline():
             dataset_additional_config=additional_config,
             dataset_splitter_type=dataset_splitter_type,
             columns=columns,
+            input_processing=input_processing,
             vector_store_manager_type=vector_store_manager_type,
             column_embedding_configs=column_embedding_configs,
             vector_db_persist_directory=vector_db_persist_directory,
```
```diff
--- ddi_fw-0.0.264/src/ddi_fw/pipeline/pipeline.py
+++ ddi_fw-0.0.266/src/ddi_fw/pipeline/pipeline.py
@@ -22,6 +22,7 @@ class Pipeline(BaseModel):
     dataset_additional_config: Optional[Dict[str, Any]] = None
     dataset_splitter_type: Type[DatasetSplitter] = DatasetSplitter
     columns: Optional[List[str]] = None
+    input_processing: Optional[List[Dict[str, Any]]] = None
     embedding_dict: Optional[Dict[str, Any]] = None
     column_embedding_configs: Optional[List] = None
     vector_db_persist_directory: Optional[str] = None
```
```diff
@@ -76,13 +77,6 @@ class Pipeline(BaseModel):
         # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
         kwargs = {"columns": self.columns,
                   "additional_config": self.dataset_additional_config}
-        # Move this into the DDIMDL dataset
-        # if self.ner_threshold:
-        #     for k, v in self.ner_threshold.items():
-        #         kwargs[k] = v
-
-        # ner_df = CTakesNER(df=None).load(
-        #     filename=self.ner_data_file) if self.ner_data_file else None
 
         dataset_splitter = self.dataset_splitter_type()
         pooling_strategy = self.embedding_pooling_strategy_type(
```
```diff
@@ -97,12 +91,12 @@ class Pipeline(BaseModel):
         if self.vector_db_collection_name is not None:
             params["collection_name"] = self.vector_db_collection_name
 
-
-
+        vector_store_manager = self.vector_store_manager_type(
+            **params) if self.vector_store_manager_type else None
         if issubclass(self.dataset_type, TextDatasetMixin):
-
+
             dataset = self.dataset_type(
-                vector_store_manager
+                vector_store_manager=vector_store_manager,
                 embedding_dict=self.embedding_dict,
                 pooling_strategy=pooling_strategy,
                 column_embedding_configs=self.column_embedding_configs,
```
```diff
@@ -111,17 +105,20 @@ class Pipeline(BaseModel):
                 dataset_splitter_type=self.dataset_splitter_type,
                 **kwargs)
 
-        elif self.dataset_type == BaseDataset:
+        elif self.dataset_type == BaseDataset: ## !!! check it
             dataset = self.dataset_type(
                 dataset_splitter_type=self.dataset_splitter_type,
                 **kwargs)
         else:
-            dataset = self.dataset_type(
+            dataset = self.dataset_type(
+                dataset_splitter_type=self.dataset_splitter_type, **kwargs)
+
+        dataset.input_processing = self.input_processing
 
         # X_train, X_test, y_train, y_test, train_indexes, test_indexes, train_idx_arr, val_idx_arr = dataset.load()
 
         dataset.load()
-
+
         self._dataset = dataset
         self._train_idx_arr = dataset.train_idx_arr
         self._val_idx_arr = dataset.val_idx_arr
```
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|