ddi-fw 0.0.263__tar.gz → 0.0.265__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/PKG-INFO +1 -1
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/pyproject.toml +1 -1
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/datasets/core.py +57 -93
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/langchain/faiss_storage.py +45 -1
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/ml/ml_helper.py +96 -30
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/pipeline/multi_pipeline.py +2 -11
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/pipeline/pipeline.py +11 -14
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw.egg-info/PKG-INFO +1 -1
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/README.md +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/setup.cfg +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/datasets/__init__.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/datasets/dataset_splitter.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/datasets/db_utils.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/datasets/setup_._py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/langchain/__init__.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/langchain/chroma_storage.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/langchain/embeddings.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/langchain/storage.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/ml/__init__.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/ml/evaluation_helper.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/ml/model_wrapper.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/ml/pytorch_wrapper.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/ml/tensorflow_wrapper.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/ml/tracking_service.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/ner/__init__.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/ner/mmlrestclient.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/ner/ner.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/pipeline/__init__.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/pipeline/multi_pipeline_org.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/pipeline/ner_pipeline.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/utils/__init__.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/utils/categorical_data_encoding_checker.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/utils/enums.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/utils/json_helper.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/utils/kaggle.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/utils/numpy_utils.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/utils/package_helper.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/utils/py7zr_helper.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/utils/utils.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/utils/zip_helper.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/vectorization/__init__.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/vectorization/feature_vector_generation.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw/vectorization/idf_helper.py +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw.egg-info/SOURCES.txt +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw.egg-info/requires.txt +0 -0
- {ddi_fw-0.0.263 → ddi_fw-0.0.265}/src/ddi_fw.egg-info/top_level.txt +0 -0
src/ddi_fw/datasets/core.py

@@ -75,12 +75,54 @@ class BaseDataset(BaseModel, abc.ABC):
     val_idx_arr: Optional[List[np.ndarray]] = None
     columns: List[str] = []
     additional_config: Optional[Dict[str, Any]] = None
+    input_processing: Optional[List[Dict[str, Any]]] = None
 
     class Config:
         arbitrary_types_allowed = True
+
+    def process_input_data(self,data, processing_config=None):
+
+        if not processing_config:
+            return data
+
+        if processing_config.get("stack", False):
+            print("Stacking data...")
+            data = np.stack(data)
+            print(f"Data shape after stacking: {data.shape}")
+        if not isinstance(data, np.ndarray):
+            data = np.array(data)
+        # if processing_config.get("flatten", False):
+        #     data = np.stack(data.flatten().tolist())
+        # Ensure we start with a NumPy array
+
+
+        # Normalize input
+        if processing_config.get("normalize", False):
+            data = data.astype(np.float32)
+            max_val = np.max(data)
+            if max_val > 1:
+                data /= max_val
+
+        # Reshape input (for images etc.)
+        if "reshape" in processing_config:
+            try:
+                target_shape = tuple(processing_config["reshape"])
+                data = data.reshape((-1, *target_shape))
+            except Exception as e:
+                raise ValueError(f"Reshape failed for data with shape {data.shape}: {e}")
+
+
+        return data
 
     # TODO: if no columns are given, all features are taken; how should this be handled in the pipeline?
     def produce_inputs(self):
+        # Grouping the list by "column" key
+        grouped_data = defaultdict(dict)
+
+        if self.input_processing:
+            for item in self.input_processing:
+                grouped_data[item["column"]] = item
+
         items = []
         if self.X_train is None or self.X_test is None:
             raise Exception("There is no data to produce inputs")
@@ -90,40 +132,30 @@ class BaseDataset(BaseModel, abc.ABC):
         if self.columns is None or len(self.columns) == 0 or len(self.columns) == 1:
             # If no columns or only one column are provided, do not change the data
             # and use the entire dataset as a single input.
-            train_data, test_data = self.X_train[:, :], self.X_test[:, :]
-
-            train_data,test_data = np.stack(train_data.flatten().tolist()), np.stack(test_data.flatten().tolist())
             column = self.columns[0] if self.columns else 'default'
-
+            train_data, test_data = self.X_train[:, :], self.X_test[:, :]
+            processing_config = grouped_data[column]
+            train_data = self.process_input_data(train_data, processing_config)
+            test_data = self.process_input_data(test_data, processing_config)
+            # train_data,test_data = np.stack(train_data.flatten().tolist()), np.stack(test_data.flatten().tolist())
+            items.append([f'{column}', np.nan_to_num(train_data),
                           y_train_label, np.nan_to_num(test_data), y_test_label])
         else:
             for index, column in enumerate(self.columns):
+                processing_config = grouped_data[column]
                 train_data, test_data = self.X_train[:,
                                                      index], self.X_test[:, index]
                 # TODO: merge the line above and the one below into a single line; don't convert to a NumPy array with tolist(), it already is one, but it is done this way because otherwise a warning is raised
-                train_data
+                train_data = self.process_input_data(train_data, processing_config)
+                test_data = self.process_input_data(test_data, processing_config)
+                # train_data,test_data = np.stack(train_data.tolist()), np.stack(test_data.tolist())
                 items.append([f'{column}', np.nan_to_num(train_data),
                               y_train_label, np.nan_to_num(test_data), y_test_label])
 
         # items.append([f'{column}_embedding', train_data,
         #               y_train_label, test_data, y_test_label])
         return items
-
-
-    def produce_inputs_ex(self):
-        items = []
-        if self.X_train is None or self.X_test is None:
-            raise Exception("There is no data to produce inputs")
-        y_train_label, y_test_label = stack(self.y_train), stack(self.y_test)
-
-        for column in self.columns:
-            train_data, test_data = stack(
-                self.X_train[column]), stack(self.X_test[column])
-            items.append([f'{column}', np.nan_to_num(train_data),
-                          y_train_label, np.nan_to_num(test_data), y_test_label])
-
-            # items.append([f'{column}_embedding', train_data,
-            #               y_train_label, test_data, y_test_label])
-            return items
+
 
     @computed_field
     @property
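Taken together, the two core.py hunks above let each dataset column carry its own preprocessing recipe: `produce_inputs` groups the new `input_processing` list by `"column"` and hands the matching entry to `process_input_data`. A minimal sketch of how such a configuration behaves, assuming hypothetical column names and shapes (only the `stack`, `normalize`, and `reshape` keys are read by the code above):

```python
import numpy as np

# Hypothetical per-column processing entries, keyed by "column" exactly as
# produce_inputs() groups them into grouped_data.
input_processing = [
    {"column": "smile", "stack": True, "normalize": True, "reshape": [28, 28]},
    {"column": "target", "stack": True},
]

# Rough replay of what process_input_data() does for the first entry.
cfg = input_processing[0]
data = np.random.rand(10, 784) * 255      # stand-in for one column of X_train
if cfg.get("stack", False):
    data = np.stack(data)                 # no-op for a regular ndarray, matters for object arrays
if cfg.get("normalize", False):
    data = data.astype(np.float32)
    if data.max() > 1:
        data /= data.max()                # scales values into [0, 1]
if "reshape" in cfg:
    data = data.reshape((-1, *cfg["reshape"]))
print(data.shape)                         # (10, 28, 28)
```

Columns without an entry fall through the `defaultdict` and receive an empty config, so `process_input_data` returns them unchanged.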
@@ -294,72 +326,7 @@ class TextDatasetMixin(BaseModel):
 
     class Config:
         arbitrary_types_allowed = True
-
-    # def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
-    #     """
-    #     Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
-
-    #     Args:
-    #     - vector_db_persist_directory (str): The path to the directory where the Chroma vector database is stored.
-    #     - vector_db_collection_name (str): The name of the collection to query.
-    #     - embedding_dict (dict): The existing dictionary to update with embeddings.
-
-    #     """
-    #     if vector_db_persist_directory:
-    #         # Initialize the Chroma client and get the collection
-    #         vector_db = chromadb.PersistentClient(
-    #             path=vector_db_persist_directory)
-    #         collection = vector_db.get_collection(vector_db_collection_name)
-    #         # include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
-    #         include: chromadb.Include = ["embeddings","metadatas"]
-    #         dictionary: chromadb.GetResult
-    #         # Fetch the embeddings and metadata
-    #         if column == None:
-    #             dictionary = collection.get(
-    #                 include=include
-    #                 # include=['embeddings', 'metadatas']
-    #             )
-    #             print(
-    #                 f"Embeddings are calculated from {vector_db_collection_name}")
-    #         else:
-    #             dictionary = collection.get(
-    #                 include=include,
-    #                 # include=['embeddings', 'metadatas'],
-    #                 where={
-    #                     "type": {"$eq": f"{column}"}})
-    #             print(
-    #                 f"Embeddings of {column} are calculated from {vector_db_collection_name}")
-
-    #         # Populate the embedding dictionary with embeddings from the vector database
-    #         metadatas = dictionary["metadatas"]
-    #         embeddings = dictionary["embeddings"]
-    #         if metadatas is None or embeddings is None:
-    #             raise ValueError(
-    #                 "The collection does not contain embeddings or metadatas.")
-    #         for metadata, embedding in zip(metadatas, embeddings):
-    #             embedding_dict[metadata["type"]
-    #                            ][metadata["id"]].append(embedding)
-
-    #     else:
-    #         raise ValueError(
-    #             "Persistent directory for the vector DB is not specified.")
-
-    # def __initialize_embedding_dict(self):
-    #     embedding_dict = defaultdict(lambda: defaultdict(list))
-    #     if self.column_embedding_configs:
-    #         for item in self.column_embedding_configs:
-    #             col = item["column"]
-    #             col_db_dir = item["vector_db_persist_directory"]
-    #             col_db_collection = item["vector_db_collection_name"]
-    #             self.__create_or_update_embeddings__(embedding_dict, col_db_dir, col_db_collection, col)
-    #     elif self.vector_db_persist_directory:
-    #         self.__create_or_update_embeddings__(embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
-    #     else:
-    #         logging.warning("There is no configuration of Embeddings")
-    #         raise ValueError(
-    #             "There is no configuration of Embeddings. Please provide a vector database directory and collection name.")
-    #     return embedding_dict
-
+
     def __calculate_embedding_size(self):
         if not self.embedding_dict:
             raise ValueError("Embedding dictionary is not initialized, embedding size cannot be calculated.")
@@ -370,15 +337,12 @@ class TextDatasetMixin(BaseModel):
     def process_text(self):
         logging.info("Processing text data...")
 
-
-        # kwargs = {"columns": self.columns}
-        # if self.ner_threshold:
-        #     for k, v in self.ner_threshold.items():
-        #         kwargs[k] = v
+
         if not self.embedding_dict:
             if self.vector_store_manager is not None:
                 self.embedding_dict = self.vector_store_manager.initialize_embedding_dict()
-
+            else:
+                raise ValueError("Either embedding_dict or vector_store_manager must be provided for text processing.")
         self.__calculate_embedding_size()
 
 
src/ddi_fw/langchain/faiss_storage.py

@@ -460,4 +460,48 @@ def generate_embeddings(
     vector_store_manager.generate_vector_store(docs)
 
     # Optionally persist/save
-    vector_store_manager.save(persist_dir)
+    vector_store_manager.save(persist_dir)
+
+
+import os
+import json
+
+def generate_embeddings_for_json_object(
+    obj_json: dict,
+    vector_store_manager_type: Type[BaseVectorStoreManager],
+    persist_root: str = "./embeddings",
+    new_model_names: Optional[List] = None,
+    docs=None
+):
+    """
+    Generate embeddings for all collections in the given JSON object, storing them in a container folder.
+
+    Args:
+        obj_json: JSON object with 'id', 'name', and 'collections' keys.
+        vector_store_manager_type: The vector store manager class to use.
+        persist_root: Root directory for all embeddings.
+        new_model_names: Optional list of model names to filter collections.
+        docs: Documents to embed (if needed).
+    """
+    obj_id = obj_json.get("id")
+    obj_name = obj_json.get("name")
+    collections = obj_json.get("collections", [])
+
+    if not obj_id:
+        raise ValueError("JSON object must have an 'id' field.")
+    if not collections:
+        raise ValueError("No collections found in the given JSON object.")
+
+    # Create container directory for this object
+    container_dir = os.path.join(persist_root, str(obj_id))
+    os.makedirs(container_dir, exist_ok=True)
+
+    # Call your existing function
+    generate_embeddings(
+        docs=docs,
+        vector_store_manager_type=vector_store_manager_type,
+        config_file=None,
+        new_model_names=new_model_names,
+        collections=collections,
+        persist_directory=container_dir
+    )
src/ddi_fw/ml/ml_helper.py

@@ -3,7 +3,7 @@ from ddi_fw.ml.pytorch_wrapper import PTModelWrapper
 from ddi_fw.ml.tensorflow_wrapper import TFModelWrapper
 from ddi_fw.utils.package_helper import get_import
 import numpy as np
-from ddi_fw.ml.evaluation_helper import
+from ddi_fw.ml.evaluation_helper import evaluate
 
 # import tf2onnx
 # import onnx
@@ -48,7 +48,8 @@ class MultiModalRunner:
     def __predict(self, single_results):
         item_dict = {t[0]: t for t in self.items}
         if self.default_model is None and not self.multi_modal:
-            raise Exception(
+            raise Exception(
+                "Default model and multi modal cannot be None at the same time")
 
         if self.multi_modal:
             for m in self.multi_modal:
@@ -56,49 +57,113 @@ class MultiModalRunner:
                 # input_type = m.get('input_type')
                 input = m.get('input')
                 inputs = m.get('inputs')
-
-
+                if m.get("model_type") is None:
+                    model_type = self.default_model.get("model_type")
+                    kwargs = self.default_model.get('params')
+                else:
+                    model_type = get_import(m.get("model_type"))
+                    kwargs = m.get('params')
+
+                if model_type is None:
+                    raise Exception(
+                        "model_type cannot be None, it should be defined in multi_modal or default_model")
+
                 T = self.__create_model(self.library)
                 single_modal = T(self.date, name, model_type,
-
-
-                if input
-                    raise
-
+                                 tracking_service=self.tracking_service, **kwargs)
+
+                if input and inputs:
+                    raise ValueError(
+                        "Only one of 'input' or 'inputs' should be defined.")
+                if not input and not inputs:
+                    raise ValueError(
+                        "At least one of 'input' or 'inputs' must be defined.")
+
+                if input and not isinstance(input, str):
+                    raise ValueError(
+                        "'input' should be a single string. For multiple inputs, use 'inputs'.")
+
+                # Get stacking and reshaping config
+                force_stack = m.get("force_stack", True)
+                reshape_dims = m.get("reshape")
+                train_data, train_label, test_data, test_label = None, None, None, None
+                # --- SINGLE INPUT CASE ---
                 if input:
                     item = item_dict[input]
-
-
+                    train_data = item[1]
+                    train_label = item[2]
+                    test_data = item[3]
+                    test_label = item[4]
+
+                    # Optional: force stack single input to simulate extra dimension
+                    if force_stack:
+                        train_data = np.expand_dims(train_data, axis=1)
+                        test_data = np.expand_dims(test_data, axis=1)
+
+                # --- MULTIPLE INPUTS CASE ---
                 elif inputs:
-                    # check keys
                     filtered_dict = {k: item_dict[k]
-
-
+                                     for k in inputs if k in item_dict}
+                    if not filtered_dict:
+                        raise ValueError(
+                            f"No matching inputs found in item_dict for: {inputs}")
+
                     first_input = next(iter(filtered_dict.values()))
                     train_data_list = [f[1] for f in filtered_dict.values()]
                     test_data_list = [f[3] for f in filtered_dict.values()]
-                    train_data = np.stack(train_data_list, axis=1)
-                    test_data = np.stack(test_data_list, axis=1)
                     train_label = first_input[2]
                     test_label = first_input[4]
-
-
+
+                    # Stack across inputs
+                    if len(train_data_list) == 1:
+                        train_data = train_data_list[0]
+                        test_data = test_data_list[0]
+
+                    if force_stack:
+                        train_data = np.stack(train_data_list, axis=1)
+                        test_data = np.stack(test_data_list, axis=1)
+
+                    else:
+                        # train_data = np.concatenate(train_data_list, axis=0)
+                        # test_data = np.concatenate(test_data_list, axis=0)
+                        train_data = np.array(train_data_list).T
+                        test_data = np.array(test_data_list).T
                 else:
                     raise Exception("check configurations")
+
+                # --- OPTIONAL: Reshape if needed ---
+                if reshape_dims:
+                    train_data = train_data.reshape((-1, *reshape_dims))
+                    test_data = test_data.reshape((-1, *reshape_dims))
+
+                # --- Finalize ---
+                single_modal.set_data(
+                    self.train_idx_arr, self.val_idx_arr,
+                    train_data, train_label,
+                    test_data, test_label
+                )
+
                 logs, metrics, prediction = single_modal.fit_and_evaluate()
                 self.result.add_metric(name, metrics)
                 single_results[name] = prediction
-        else:
+        else:  # TODO default model maybe?
             print("Default model will be used")
+            if self.default_model is None:
+                raise Exception(
+                    "Default model cannot be None if multi_modal is not defined")
+            if self.default_model.get("model_type") is None:
+                raise Exception(
+                    "model_type cannot be None, it should be defined in default_model")
+
             model_type = get_import(self.default_model.get("model_type"))
             kwargs = self.default_model.get('params')
             for item in self.items:
                 name = item[0]
                 T = self.__create_model(self.library)
                 single_modal = T(self.date, name, model_type,
-
+                                 tracking_service=self.tracking_service, **kwargs)
                 single_modal.set_data(
-
+                    self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
 
                 logs, metrics, prediction = single_modal.fit_and_evaluate()
                 self.result.add_metric(name, metrics)
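With this rewrite, each `multi_modal` entry can omit `model_type` (falling back to `default_model`), must name exactly one of `input` or `inputs`, and can control stacking and reshaping per modality. A hedged sketch of what one entry might look like; the keys mirror the `m.get(...)` calls above, while the modality names, model path, and dimensions are placeholders:

```python
multi_modal = [
    {
        "name": "smile_target_cnn",                    # hypothetical modality name
        "inputs": ["smile", "target"],                 # with force_stack=True: np.stack(..., axis=1)
        "model_type": "ddi_fw.ml.SomeModel",           # resolved via get_import(); placeholder path
        "params": {"epochs": 50},                      # forwarded to the model wrapper as **kwargs
        "force_stack": True,                           # default is True when the key is absent
        "reshape": [2, 1024],                          # optional: applied as data.reshape((-1, 2, 1024))
    },
    {
        # No "model_type": model type and params are taken from default_model instead.
        "name": "enzyme_only",
        "input": "enzyme",                             # single input must be a plain string
    },
]
```

For two inputs of shape `(N, 1024)` each, `force_stack` yields `(N, 2, 1024)`; for a single input, it instead adds an axis via `np.expand_dims(..., axis=1)`.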
@@ -116,14 +181,15 @@ class MultiModalRunner:
         combinations = []
         for i in range(2, len(l) + 1):
             combinations.extend(list(itertools.combinations(l, i)))  # all
-
+
         def _f():
             self.__predict(single_results)
             if combinations:
                 self.evaluate_combinations(single_results, combinations)
-
+
         if self.tracking_service:
-            self.tracking_service.run(
+            self.tracking_service.run(
+                run_name=self.prefix, description="***", func=_f, nested_run=False)
         else:
             self.__predict(single_results)
             if combinations:
@@ -138,13 +204,13 @@ class MultiModalRunner:
         def evaluate_combination(artifact_uri=None):
             self.__evaluate_combinations(
                 single_results, combination, combination_descriptor, artifact_uri
-
-
-        self.tracking_service.run(run_name=combination_descriptor,
-                                  description="***",
-                                  nested_run=True,
+            )
+
+        self.tracking_service.run(run_name=combination_descriptor,
+                                  description="***",
+                                  nested_run=True,
                                   func=evaluate_combination)
-
+
         # with mlflow.start_run(run_name=combination_descriptor, description="***", nested=True) as combination_run:
         #     self.__evaluate_combinations(
         #         single_results, combination, combination_descriptor, combination_run.info.artifact_uri)
src/ddi_fw/pipeline/multi_pipeline.py

@@ -126,6 +126,7 @@ class MultiPipeline():
         dataset_splitter_type = get_import(dataset_config.get("dataset_splitter_type"))
         columns = dataset_config.get("columns", [])
         additional_config = dataset_config.get("additional_config", {})
+        input_processing = dataset_config.get("input_processing", [])
 
         # Vector database configuration
         vector_database = config.get("vector_databases", {})
@@ -149,18 +150,7 @@ class MultiPipeline():
 
         # Default model configuration
         default_model = config.get("default_model", {})
-        default_model_type = get_import(default_model.get("model_type"))
-        default_model_params = default_model.get("params", {})
-
         multi_modal = config.get("multi_modal")
-
-
-
-        # ner: move it to the related dataset
-
-        # ner_data_file = config.get("ner_data_file")
-        # ner_threshold = config.get("ner_threshold")
-
 
         combination_type = None
         kwargs_combination_params=None
@@ -184,6 +174,7 @@ class MultiPipeline():
             dataset_additional_config=additional_config,
             dataset_splitter_type=dataset_splitter_type,
             columns=columns,
+            input_processing=input_processing,
             vector_store_manager_type=vector_store_manager_type,
             column_embedding_configs=column_embedding_configs,
             vector_db_persist_directory=vector_db_persist_directory,
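On the MultiPipeline side, the per-column preprocessing block is read from the dataset section of the experiment configuration and forwarded to `Pipeline(input_processing=...)`, which in turn assigns it to `dataset.input_processing` before `dataset.load()` (see the pipeline.py hunks below). A hedged sketch of the relevant `dataset_config` mapping; how this mapping is nested inside the full config file is not shown in this diff, and the column names and dotted paths are illustrative:

```python
dataset_config = {
    "dataset_splitter_type": "ddi_fw.datasets.dataset_splitter.DatasetSplitter",  # resolved via get_import()
    "columns": ["smile", "target"],
    "additional_config": {},
    # New in 0.0.265: per-column preprocessing, consumed by BaseDataset.produce_inputs()
    "input_processing": [
        {"column": "smile", "stack": True, "normalize": True},
        {"column": "target", "stack": True, "reshape": [32, 32]},
    ],
}
```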
src/ddi_fw/pipeline/pipeline.py

@@ -22,6 +22,7 @@ class Pipeline(BaseModel):
     dataset_additional_config: Optional[Dict[str, Any]] = None
     dataset_splitter_type: Type[DatasetSplitter] = DatasetSplitter
     columns: Optional[List[str]] = None
+    input_processing: Optional[List[Dict[str, Any]]] = None
     embedding_dict: Optional[Dict[str, Any]] = None
     column_embedding_configs: Optional[List] = None
     vector_db_persist_directory: Optional[str] = None
@@ -76,13 +77,6 @@ class Pipeline(BaseModel):
         # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
         kwargs = {"columns": self.columns,
                   "additional_config": self.dataset_additional_config}
-        # move this to the DDIMDL dataset
-        # if self.ner_threshold:
-        #     for k, v in self.ner_threshold.items():
-        #         kwargs[k] = v
-
-        # ner_df = CTakesNER(df=None).load(
-        #     filename=self.ner_data_file) if self.ner_data_file else None
 
         dataset_splitter = self.dataset_splitter_type()
         pooling_strategy = self.embedding_pooling_strategy_type(
@@ -97,12 +91,12 @@ class Pipeline(BaseModel):
         if self.vector_db_collection_name is not None:
             params["collection_name"] = self.vector_db_collection_name
 
-
-
+        vector_store_manager = self.vector_store_manager_type(
+            **params) if self.vector_store_manager_type else None
         if issubclass(self.dataset_type, TextDatasetMixin):
-
+
             dataset = self.dataset_type(
-                vector_store_manager
+                vector_store_manager=vector_store_manager,
                 embedding_dict=self.embedding_dict,
                 pooling_strategy=pooling_strategy,
                 column_embedding_configs=self.column_embedding_configs,
@@ -111,17 +105,20 @@ class Pipeline(BaseModel):
                 dataset_splitter_type=self.dataset_splitter_type,
                 **kwargs)
 
-        elif self.dataset_type == BaseDataset:
+        elif self.dataset_type == BaseDataset:  ## !!! check it
             dataset = self.dataset_type(
                 dataset_splitter_type=self.dataset_splitter_type,
                 **kwargs)
         else:
-            dataset = self.dataset_type(
+            dataset = self.dataset_type(
+                dataset_splitter_type=self.dataset_splitter_type, **kwargs)
+
+        dataset.input_processing = self.input_processing
 
         # X_train, X_test, y_train, y_test, train_indexes, test_indexes, train_idx_arr, val_idx_arr = dataset.load()
 
         dataset.load()
-
+
         self._dataset = dataset
         self._train_idx_arr = dataset.train_idx_arr
         self._val_idx_arr = dataset.val_idx_arr