ddi-fw 0.0.266__py3-none-any.whl → 0.0.267__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/__init__.py +3 -4
- ddi_fw/datasets/core.py +1 -0
- ddi_fw/datasets/processor.py +158 -0
- ddi_fw/ml/ml_helper.py +36 -22
- {ddi_fw-0.0.266.dist-info → ddi_fw-0.0.267.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.266.dist-info → ddi_fw-0.0.267.dist-info}/RECORD +8 -7
- {ddi_fw-0.0.266.dist-info → ddi_fw-0.0.267.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.266.dist-info → ddi_fw-0.0.267.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/__init__.py
CHANGED
```diff
@@ -1,5 +1,4 @@
-from .core import BaseDataset,TextDatasetMixin
+from .core import BaseDataset, TextDatasetMixin
 from .dataset_splitter import DatasetSplitter
-
-
-
+from .processor import BaseInputProcessor, DefaultInputProcessor, ConcatInputProcessor
+__all__ = ['BaseDataset', 'TextDatasetMixin', 'DatasetSplitter']
```
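The processor classes are now re-exported at the package level, although `__all__` still lists only the original three names, so a wildcard import would not pick them up. A minimal usage sketch, assuming ddi-fw 0.0.267 is installed:

```python
# Hypothetical usage sketch of the new re-exports. Because __all__ omits the
# processor names, `from ddi_fw.datasets import *` would not export them,
# but explicit imports like these still work.
from ddi_fw.datasets import (
    BaseInputProcessor,
    ConcatInputProcessor,
    DefaultInputProcessor,
)

processor = DefaultInputProcessor()
assert isinstance(processor, BaseInputProcessor)
```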
ddi_fw/datasets/core.py
CHANGED
```diff
@@ -119,6 +119,7 @@ class BaseDataset(BaseModel, abc.ABC):
         return data
 
     # TODO: if no columns are given, all features are taken; how should this be handled in the pipeline?
+    # TODO: use the processor class here
     def produce_inputs(self):
         # Grouping the list by "column" key
         grouped_data = defaultdict(dict)
```
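The core.py hunk only adds a TODO above `produce_inputs`, but its context lines show the grouping pattern the method relies on. A minimal sketch of that `defaultdict` grouping idiom; the record fields below are illustrative, not taken from the package:

```python
from collections import defaultdict

# Illustrative records; the real field names in ddi_fw may differ.
items = [
    {"column": "smiles", "drug_pair": "DB0001_DB0002", "value": [0.1, 0.2]},
    {"column": "smiles", "drug_pair": "DB0001_DB0003", "value": [0.3, 0.4]},
    {"column": "target", "drug_pair": "DB0001_DB0002", "value": [1.0]},
]

# Group the flat list by its "column" key, as produce_inputs does.
grouped_data = defaultdict(dict)
for item in items:
    grouped_data[item["column"]][item["drug_pair"]] = item["value"]

print(dict(grouped_data)["target"])  # {'DB0001_DB0002': [1.0]}
```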
ddi_fw/datasets/processor.py
ADDED
```diff
@@ -0,0 +1,158 @@
+from typing import Optional
+import numpy as np
+
+
+class BaseInputProcessor:
+    def process1(self, data, processing_config=None):
+        raise NotImplementedError("Input processors must implement the process method.")
+    def process2(self, data, processing_config=None):
+        raise NotImplementedError("Input processors must implement the process method.")
+
+class DefaultInputProcessor(BaseInputProcessor):
+    def __init__(self):
+        pass
+
+    def process2(self, data, processing_config=None):
+        """
+        Processes input data according to the provided config.
+        Supports stacking, reshaping, and can use item_dict for advanced logic.
+        """
+        if processing_config is None:
+            raise ValueError("processing_config must be provided.")
+
+        force_stack = processing_config.get("force_stack", False)
+        reshape_dims = processing_config.get("reshape")
+        if type(data) is not list:
+
+            # Optional: force stack single input to simulate extra dimension
+            if force_stack:
+                data = np.expand_dims(data, axis=1)
+        else:
+            # --- MULTIPLE INPUTS CASE ---
+            # Stack across inputs
+            if len(data) == 1:
+                data = data[0]
+
+            if force_stack:
+                data = np.stack(data, axis=1)
+
+            else:
+                data = np.array(data).T
+
+
+        # --- OPTIONAL: Reshape if needed ---
+        if reshape_dims:
+            data = data.reshape((-1, *reshape_dims))
+
+        return data
+
+
+    def process1(self, data, processing_config=None):
+        if not processing_config:
+            return data
+        if processing_config.get("flatten", False):
+            print("Flattening data...")
+            data = np.array(data).flatten()
+            print(f"Data shape after flattening: {data.shape}")
+
+        if processing_config.get("stack", False):
+            print("Stacking data...")
+            data = np.stack(data)
+            print(f"Data shape after stacking: {data.shape}")
+        if not isinstance(data, np.ndarray):
+            data = np.array(data)
+        # if processing_config.get("flatten", False):
+        #     data = np.stack(data.flatten().tolist())
+        # Ensure we start with a NumPy array
+
+
+        # Normalize input
+        if processing_config.get("normalize", False):
+            data = data.astype(np.float32)
+            max_val = np.max(data)
+            if max_val > 1:
+                data /= max_val
+
+        # Reshape input (for images etc.)
+        if "reshape" in processing_config:
+            try:
+                target_shape = tuple(processing_config["reshape"])
+                data = data.reshape((-1, *target_shape))
+            except Exception as e:
+                raise ValueError(f"Reshape failed for data with shape {data.shape}: {e}")
+
+
+        return data
+
+
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics.pairwise import cosine_similarity
+
+class ConcatInputProcessor(BaseInputProcessor):
+    def __init__(self, dataset, id_column, embedding_column, top_k=1):
+        self.ds = dataset  # Reference to the dataset instance
+        self.id_column = id_column
+        self.embedding_column = embedding_column
+        self.top_k = top_k
+        self.embeddings_array = None
+        self.id_list = None
+        self.id_to_idx = None
+        self.similarity_matrix = None
+        self.top_k_similar_df = None
+
+    def _prepare_embeddings(self, ids: Optional[list] = None):
+        if ids is None:
+            ids = self.ds.drugs_df[self.id_column].tolist()
+        df = pd.DataFrame.from_dict(self.ds.embedding_dict)
+        df = df[df.index.isin(ids)]
+        if self.embedding_column not in df.columns:
+            raise ValueError(f"Column '{self.embedding_column}' not found in embedding_dict.")
+        df['embeddings'] = df[self.embedding_column].apply(self.ds.pooling_strategy.apply)
+        df = df.dropna(subset=['embeddings'])
+        self.embeddings_array = np.stack(df['embeddings'].values).astype('float32')
+        self.id_list = list(df.index)
+        self.id_to_idx = {drug_id: idx for idx, drug_id in enumerate(self.id_list)}
+
+    def _compute_similarity_matrix(self):
+        self.similarity_matrix = cosine_similarity(self.embeddings_array)
+
+    def get_top_k_similar(self, top_k=None):
+        if top_k is None:
+            top_k = self.top_k
+        arr = self.similarity_matrix.copy()
+        np.fill_diagonal(arr, -np.inf)
+        top_k_idx = np.argpartition(arr, -top_k, axis=1)[:, -top_k:]
+        sorted_top_k_idx = np.argsort(arr[np.arange(arr.shape[0])[:, None], top_k_idx], axis=1)[:, ::-1]
+        final_top_k_idx = np.take_along_axis(top_k_idx, sorted_top_k_idx, axis=1)
+        top_k_ids_list = [[self.id_list[idx] for idx in row] for row in final_top_k_idx]
+        return pd.DataFrame({"drug_id": self.id_list, "top_similar_ids": top_k_ids_list}).set_index("drug_id")
+
+    def process(self, data, processing_config=None):
+        """
+        For each input vector, concatenate it with its top-k most similar vectors.
+        Assumes 'data' is a DataFrame with an id column and an embedding column.
+        """
+        # Prepare embeddings and similarity matrix if not already done
+        if self.embeddings_array is None or self.similarity_matrix is None:
+            self._prepare_embeddings()
+            self._compute_similarity_matrix()
+            self.top_k_similar_df = self.get_top_k_similar(self.top_k)
+
+        if self.top_k_similar_df is None:
+            raise ValueError("Top-k similar DataFrame not computed.")
+        # For each row in data, concatenate its embedding with its top-k similar embeddings
+        result = []
+        for idx, row in data.iterrows():
+            drug_id = row[self.id_column]
+            embedding = row[self.embedding_column]
+            similar_ids = self.top_k_similar_df.loc[drug_id, "top_similar_ids"]
+            similar_embeddings = []
+            for sim_id in similar_ids:
+                sim_idx = self.id_to_idx.get(sim_id)
+                if sim_idx is not None:
+                    similar_embeddings.append(self.embeddings_array[sim_idx])
+            concat_embedding = np.concatenate([embedding] + similar_embeddings)
+            result.append(concat_embedding)
+        return np.stack(result)
```
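A short usage sketch of `DefaultInputProcessor.process2`, assuming the module is importable as `ddi_fw.datasets.processor`; the arrays are illustrative:

```python
import numpy as np
from ddi_fw.datasets.processor import DefaultInputProcessor

proc = DefaultInputProcessor()

# Illustrative inputs: two modalities, 4 samples, 8 features each.
modality_a = np.random.rand(4, 8)
modality_b = np.random.rand(4, 8)

# Multiple-inputs case: force_stack stacks along a new axis 1.
stacked = proc.process2([modality_a, modality_b], {"force_stack": True})
print(stacked.shape)  # (4, 2, 8)

# Single-input case: force_stack inserts a length-1 axis instead.
single = proc.process2(modality_a, {"force_stack": True})
print(single.shape)  # (4, 1, 8)

# Without force_stack, multiple inputs go through np.array(data).T, which
# only round-trips cleanly for 1-D inputs of equal length.
flat = proc.process2([np.arange(4), np.arange(4)], {"force_stack": False})
print(flat.shape)  # (4, 2)
```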
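`ConcatInputProcessor` augments each embedding with its top-k cosine-similar neighbours. It expects a dataset object exposing `drugs_df`, `embedding_dict`, and a `pooling_strategy` with an `apply` method, and it defines `process` rather than the `process1`/`process2` methods declared on the base class. A hedged sketch with minimal hypothetical stand-ins for that interface:

```python
import numpy as np
import pandas as pd
from ddi_fw.datasets.processor import ConcatInputProcessor

# Hypothetical stand-ins inferred from the attribute accesses above;
# they are not documented ddi_fw API.
class IdentityPooling:
    def apply(self, emb):
        return np.asarray(emb, dtype="float32")

class MockDataset:
    drugs_df = pd.DataFrame({"drug_id": ["a", "b", "c"]})
    embedding_dict = {
        "smiles": {"a": [1.0, 0.0], "b": [0.9, 0.1], "c": [0.0, 1.0]}
    }
    pooling_strategy = IdentityPooling()

proc = ConcatInputProcessor(MockDataset(), id_column="drug_id",
                            embedding_column="smiles", top_k=1)

data = pd.DataFrame({
    "drug_id": ["a", "b", "c"],
    "smiles": [np.array([1.0, 0.0]), np.array([0.9, 0.1]), np.array([0.0, 1.0])],
})

# Each 2-d embedding is concatenated with its single nearest neighbour.
out = proc.process(data)
print(out.shape)  # (3, 4)
```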
ddi_fw/ml/ml_helper.py
CHANGED
```diff
@@ -84,9 +84,18 @@ class MultiModalRunner:
                 "'input' should be a single string. For multiple inputs, use 'inputs'.")
 
         # Get stacking and reshaping config
+        processor_type = m.get("processor", "ddi_fw.datasets.processor.DefaultInputProcessor")
+        processor = get_import(processor_type)  # Ensure the processor type is valid
         force_stack = m.get("force_stack", True)
         reshape_dims = m.get("reshape")
         train_data, train_label, test_data, test_label = None, None, None, None
+
+        # Prepare processing config with all context
+        processing_config = {
+            "force_stack": force_stack,
+            "reshape": reshape_dims
+        }
+
         # --- SINGLE INPUT CASE ---
         if input:
             item = item_dict[input]
@@ -94,11 +103,12 @@ class MultiModalRunner:
             train_label = item[2]
             test_data = item[3]
             test_label = item[4]
+
 
-            # Optional: force stack single input to simulate extra dimension
-            if force_stack:
-                train_data = np.expand_dims(train_data, axis=1)
-                test_data = np.expand_dims(test_data, axis=1)
+            # # Optional: force stack single input to simulate extra dimension
+            # if force_stack:
+            #     train_data = np.expand_dims(train_data, axis=1)
+            #     test_data = np.expand_dims(test_data, axis=1)
 
         # --- MULTIPLE INPUTS CASE ---
         elif inputs:
@@ -109,33 +119,37 @@ class MultiModalRunner:
                     f"No matching inputs found in item_dict for: {inputs}")
 
             first_input = next(iter(filtered_dict.values()))
-            train_data_list = [f[1] for f in filtered_dict.values()]
-            test_data_list = [f[3] for f in filtered_dict.values()]
+            train_data = [f[1] for f in filtered_dict.values()]
+            test_data = [f[3] for f in filtered_dict.values()]
             train_label = first_input[2]
             test_label = first_input[4]
 
-            # Stack across inputs
-            if len(train_data_list) == 1:
-                train_data = train_data_list[0]
-                test_data = test_data_list[0]
+            # # Stack across inputs
+            # if len(train_data_list) == 1:
+            #     train_data = train_data_list[0]
+            #     test_data = test_data_list[0]
 
-            if force_stack:
-                train_data = np.stack(train_data_list, axis=1)
-                test_data = np.stack(test_data_list, axis=1)
+            # if force_stack:
+            #     train_data = np.stack(train_data_list, axis=1)
+            #     test_data = np.stack(test_data_list, axis=1)
 
-            else:
-                # train_data = np.concatenate(train_data_list, axis=0)
-                # test_data = np.concatenate(test_data_list, axis=0)
-                train_data = np.array(train_data_list).T
-                test_data = np.array(test_data_list).T
+            # else:
+            #     # train_data = np.concatenate(train_data_list, axis=0)
+            #     # test_data = np.concatenate(test_data_list, axis=0)
+            #     train_data = np.array(train_data_list).T
+            #     test_data = np.array(test_data_list).T
         else:
             raise Exception("check configurations")
 
-        # --- OPTIONAL: Reshape if needed ---
-        if reshape_dims:
-            train_data = train_data.reshape((-1, *reshape_dims))
-            test_data = test_data.reshape((-1, *reshape_dims))
 
+        train_data = processor().process2(train_data, processing_config)
+        test_data = processor().process2(test_data, processing_config)
+        # # --- OPTIONAL: Reshape if needed ---
+        # if reshape_dims:
+        #     train_data = train_data.reshape((-1, *reshape_dims))
+        #     test_data = test_data.reshape((-1, *reshape_dims))
+
+
        # --- Finalize ---
         single_modal.set_data(
             self.train_idx_arr, self.val_idx_arr,
```
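The net effect of the ml_helper.py change is that stacking and reshaping move out of `MultiModalRunner` into a processor class resolved from the per-model config via `get_import`. A sketch of what a config entry might look like; only `processor`, `force_stack`, and `reshape` are read by the hunks above, the remaining keys are illustrative placeholders:

```python
# Hypothetical per-model configuration entry `m` consumed by MultiModalRunner.
m = {
    "name": "mlp_on_stacked_embeddings",   # placeholder key
    "inputs": ["smiles", "target"],        # placeholder key
    "processor": "ddi_fw.datasets.processor.DefaultInputProcessor",
    "force_stack": True,
    "reshape": None,
}

# In outline, the runner now does:
#   processor = get_import(m.get("processor", ...))   # resolve dotted path to a class
#   processing_config = {"force_stack": m.get("force_stack", True),
#                        "reshape": m.get("reshape")}
#   train_data = processor().process2(train_data, processing_config)
#   test_data = processor().process2(test_data, processing_config)
```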
{ddi_fw-0.0.266.dist-info → ddi_fw-0.0.267.dist-info}/RECORD
CHANGED
```diff
@@ -1,7 +1,8 @@
-ddi_fw/datasets/__init__.py,sha256=
-ddi_fw/datasets/core.py,sha256=
+ddi_fw/datasets/__init__.py,sha256=VBOLp6g2M86DXo1hgNfzz4CNpiJDHgtHqPns6cftbHc,250
+ddi_fw/datasets/core.py,sha256=g8p_lU7XOYGxjHajLPWqfWfw-NRHuludmdlvXs7d1cc,15122
 ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
 ddi_fw/datasets/db_utils.py,sha256=xRj28U_uXTRPHcz3yIICczFUHXUPiAOZtAj5BM6kH44,6465
+ddi_fw/datasets/processor.py,sha256=Cwuy7T8domLoNPphGdFgc9gL2qeDRnt_kub4i7LdXJ4,6524
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
 ddi_fw/langchain/__init__.py,sha256=97Y4lYuxShWqx5hfDbzf8VyV0HrM76fDlNp5xXusKQU,445
 ddi_fw/langchain/chroma_storage.py,sha256=fOxoJoaqqyOKqtfUtlq2zJd-XY03rARTDvrPE_9nY2I,15855
@@ -11,7 +12,7 @@ ddi_fw/langchain/sentence_splitter.py,sha256=NCcDdDWDnwZTZDqarg-5gSbcDFoAM_sxcgH
 ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
 ddi_fw/ml/__init__.py,sha256=FteYEawCkVQOaK-cTv2VrHZ2ZnfeFr31BD6VucO7_DQ,268
 ddi_fw/ml/evaluation_helper.py,sha256=2-7CLSgGTqLEk4HkgCVIOt-GxfLAn6SBozJghAtHb5M,11581
-ddi_fw/ml/ml_helper.py,sha256=
+ddi_fw/ml/ml_helper.py,sha256=MO6bn0NW8sj8yc_HY5F-LZBU8XZJ57g8fOfcjfHNBkE,11377
 ddi_fw/ml/model_wrapper.py,sha256=38uBdHI4H_sjDKPWuhGXovUy_L1tpSNm5tEqCtwmlpY,973
 ddi_fw/ml/pytorch_wrapper.py,sha256=pe6UsjP2XeTgLxDnIUiodoyhJTGCxV27wD4Cjxysu2Q,8553
 ddi_fw/ml/tensorflow_wrapper.py,sha256=_mOXMpIkXx7lJySC2wtCDIDhSdtA8bQVEjKwJ5NQ7Io,16782
@@ -38,7 +39,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
 ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
 ddi_fw/vectorization/feature_vector_generation.py,sha256=92bhZw4Qxh0hqPK-bPHm9bUO7pg2p4cStQYtVrOtetE,7919
 ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
+ddi_fw-0.0.267.dist-info/METADATA,sha256=1IkpdIfCr5lRjvJ_KVWx_dnzd_2o1bAs7D74uVLR9cg,2623
+ddi_fw-0.0.267.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ddi_fw-0.0.267.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.267.dist-info/RECORD,,
```
{ddi_fw-0.0.266.dist-info → ddi_fw-0.0.267.dist-info}/WHEEL
File without changes
{ddi_fw-0.0.266.dist-info → ddi_fw-0.0.267.dist-info}/top_level.txt
File without changes