ddi-fw 0.0.266__tar.gz → 0.0.267__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/PKG-INFO +1 -1
  2. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/pyproject.toml +1 -1
  3. ddi_fw-0.0.267/src/ddi_fw/datasets/__init__.py +4 -0
  4. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/datasets/core.py +1 -0
  5. ddi_fw-0.0.267/src/ddi_fw/datasets/processor.py +158 -0
  6. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/ml/ml_helper.py +36 -22
  7. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw.egg-info/PKG-INFO +1 -1
  8. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw.egg-info/SOURCES.txt +1 -0
  9. ddi_fw-0.0.266/src/ddi_fw/datasets/__init__.py +0 -5
  10. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/README.md +0 -0
  11. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/setup.cfg +0 -0
  12. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/datasets/dataset_splitter.py +0 -0
  13. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/datasets/db_utils.py +0 -0
  14. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/datasets/setup_._py +0 -0
  15. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/langchain/__init__.py +0 -0
  16. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/langchain/chroma_storage.py +0 -0
  17. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/langchain/embeddings.py +0 -0
  18. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/langchain/faiss_storage.py +0 -0
  19. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
  20. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/langchain/storage.py +0 -0
  21. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/ml/__init__.py +0 -0
  22. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/ml/evaluation_helper.py +0 -0
  23. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/ml/model_wrapper.py +0 -0
  24. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/ml/pytorch_wrapper.py +0 -0
  25. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/ml/tensorflow_wrapper.py +0 -0
  26. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/ml/tracking_service.py +0 -0
  27. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/ner/__init__.py +0 -0
  28. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/ner/mmlrestclient.py +0 -0
  29. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/ner/ner.py +0 -0
  30. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/pipeline/__init__.py +0 -0
  31. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +0 -0
  32. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/pipeline/multi_pipeline.py +0 -0
  33. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/pipeline/multi_pipeline_org.py +0 -0
  34. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/pipeline/ner_pipeline.py +0 -0
  35. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/pipeline/pipeline.py +0 -0
  36. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/utils/__init__.py +0 -0
  37. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/utils/categorical_data_encoding_checker.py +0 -0
  38. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/utils/enums.py +0 -0
  39. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/utils/json_helper.py +0 -0
  40. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/utils/kaggle.py +0 -0
  41. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/utils/numpy_utils.py +0 -0
  42. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/utils/package_helper.py +0 -0
  43. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/utils/py7zr_helper.py +0 -0
  44. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/utils/utils.py +0 -0
  45. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/utils/zip_helper.py +0 -0
  46. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/vectorization/__init__.py +0 -0
  47. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/vectorization/feature_vector_generation.py +0 -0
  48. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/vectorization/idf_helper.py +0 -0
  49. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
  50. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw.egg-info/requires.txt +0 -0
  51. {ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw.egg-info/top_level.txt +0 -0
{ddi_fw-0.0.266 → ddi_fw-0.0.267}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.266
+Version: 0.0.267
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
{ddi_fw-0.0.266 → ddi_fw-0.0.267}/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ddi_fw"
-version = "0.0.266"
+version = "0.0.267"
 description = "Do not use :)"
 readme = "README.md"
 authors = [
ddi_fw-0.0.267/src/ddi_fw/datasets/__init__.py
@@ -0,0 +1,4 @@
+from .core import BaseDataset, TextDatasetMixin
+from .dataset_splitter import DatasetSplitter
+from .processor import BaseInputProcessor, DefaultInputProcessor, ConcatInputProcessor
+__all__ = ['BaseDataset', 'TextDatasetMixin', 'DatasetSplitter']
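For orientation: the new datasets/__init__.py re-exports the processor classes but leaves __all__ unchanged, so a star import still exposes only the original three names. A minimal import sketch (illustrative, assuming ddi_fw 0.0.267 is installed; not taken from the package's own docs):

    # Explicit imports work because the names are bound in ddi_fw.datasets,
    # but `from ddi_fw.datasets import *` would skip the processor classes,
    # since __all__ still lists only BaseDataset, TextDatasetMixin, DatasetSplitter.
    from ddi_fw.datasets import BaseDataset, TextDatasetMixin, DatasetSplitter
    from ddi_fw.datasets import BaseInputProcessor, DefaultInputProcessor, ConcatInputProcessor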
{ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/datasets/core.py
@@ -119,6 +119,7 @@ class BaseDataset(BaseModel, abc.ABC):
         return data
 
     # TODO if no columns are given, all features are taken; how should this be handled in the pipeline?
+    # TODO use the processor class here
     def produce_inputs(self):
        # Grouping the list by "column" key
        grouped_data = defaultdict(dict)
ddi_fw-0.0.267/src/ddi_fw/datasets/processor.py
@@ -0,0 +1,158 @@
+from typing import Optional
+import numpy as np
+
+
+class BaseInputProcessor:
+    def process1(self, data, processing_config=None):
+        raise NotImplementedError("Input processors must implement the process method.")
+    def process2(self, data, processing_config=None):
+        raise NotImplementedError("Input processors must implement the process method.")
+
+class DefaultInputProcessor(BaseInputProcessor):
+    def __init__(self):
+        pass
+
+    def process2(self, data, processing_config=None):
+        """
+        Processes input data according to the provided config.
+        Supports stacking, reshaping, and can use item_dict for advanced logic.
+        """
+        if processing_config is None:
+            raise ValueError("processing_config must be provided.")
+
+        force_stack = processing_config.get("force_stack", False)
+        reshape_dims = processing_config.get("reshape")
+        if type(data) is not list:
+
+            # Optional: force stack single input to simulate extra dimension
+            if force_stack:
+                data = np.expand_dims(data, axis=1)
+        else:
+            # --- MULTIPLE INPUTS CASE ---
+            # Stack across inputs
+            if len(data) == 1:
+                data = data[0]
+
+            if force_stack:
+                data = np.stack(data, axis=1)
+
+            else:
+                data = np.array(data).T
+
+
+        # --- OPTIONAL: Reshape if needed ---
+        if reshape_dims:
+            data = data.reshape((-1, *reshape_dims))
+
+        return data
+
+
+    def process1(self, data, processing_config=None):
+        if not processing_config:
+            return data
+        if processing_config.get("flatten", False):
+            print("Flattening data...")
+            data = np.array(data).flatten()
+            print(f"Data shape after flattening: {data.shape}")
+
+        if processing_config.get("stack", False):
+            print("Stacking data...")
+            data = np.stack(data)
+            print(f"Data shape after stacking: {data.shape}")
+        if not isinstance(data, np.ndarray):
+            data = np.array(data)
+        # if processing_config.get("flatten", False):
+        #     data = np.stack(data.flatten().tolist())
+        # Ensure we start with a NumPy array
+
+
+        # Normalize input
+        if processing_config.get("normalize", False):
+            data = data.astype(np.float32)
+            max_val = np.max(data)
+            if max_val > 1:
+                data /= max_val
+
+        # Reshape input (for images etc.)
+        if "reshape" in processing_config:
+            try:
+                target_shape = tuple(processing_config["reshape"])
+                data = data.reshape((-1, *target_shape))
+            except Exception as e:
+                raise ValueError(f"Reshape failed for data with shape {data.shape}: {e}")
+
+
+        return data
+
+
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics.pairwise import cosine_similarity
+
+class ConcatInputProcessor(BaseInputProcessor):
+    def __init__(self, dataset, id_column, embedding_column, top_k=1):
+        self.ds = dataset  # Reference to the dataset instance
+        self.id_column = id_column
+        self.embedding_column = embedding_column
+        self.top_k = top_k
+        self.embeddings_array = None
+        self.id_list = None
+        self.id_to_idx = None
+        self.similarity_matrix = None
+        self.top_k_similar_df = None
+
+    def _prepare_embeddings(self, ids: Optional[list] = None):
+        if ids is None:
+            ids = self.ds.drugs_df[self.id_column].tolist()
+        df = pd.DataFrame.from_dict(self.ds.embedding_dict)
+        df = df[df.index.isin(ids)]
+        if self.embedding_column not in df.columns:
+            raise ValueError(f"Column '{self.embedding_column}' not found in embedding_dict.")
+        df['embeddings'] = df[self.embedding_column].apply(self.ds.pooling_strategy.apply)
+        df = df.dropna(subset=['embeddings'])
+        self.embeddings_array = np.stack(df['embeddings'].values).astype('float32')
+        self.id_list = list(df.index)
+        self.id_to_idx = {drug_id: idx for idx, drug_id in enumerate(self.id_list)}
+
+    def _compute_similarity_matrix(self):
+        self.similarity_matrix = cosine_similarity(self.embeddings_array)
+
+    def get_top_k_similar(self, top_k=None):
+        if top_k is None:
+            top_k = self.top_k
+        arr = self.similarity_matrix.copy()
+        np.fill_diagonal(arr, -np.inf)
+        top_k_idx = np.argpartition(arr, -top_k, axis=1)[:, -top_k:]
+        sorted_top_k_idx = np.argsort(arr[np.arange(arr.shape[0])[:, None], top_k_idx], axis=1)[:, ::-1]
+        final_top_k_idx = np.take_along_axis(top_k_idx, sorted_top_k_idx, axis=1)
+        top_k_ids_list = [[self.id_list[idx] for idx in row] for row in final_top_k_idx]
+        return pd.DataFrame({"drug_id": self.id_list, "top_similar_ids": top_k_ids_list}).set_index("drug_id")
+
+    def process(self, data, processing_config=None):
+        """
+        For each input vector, concatenate it with its top-k most similar vectors.
+        Assumes 'data' is a DataFrame with an id column and an embedding column.
+        """
+        # Prepare embeddings and similarity matrix if not already done
+        if self.embeddings_array is None or self.similarity_matrix is None:
+            self._prepare_embeddings()
+            self._compute_similarity_matrix()
+            self.top_k_similar_df = self.get_top_k_similar(self.top_k)
+
+        if self.top_k_similar_df is None:
+            raise ValueError("Top-k similar DataFrame not computed.")
+        # For each row in data, concatenate its embedding with its top-k similar embeddings
+        result = []
+        for idx, row in data.iterrows():
+            drug_id = row[self.id_column]
+            embedding = row[self.embedding_column]
+            similar_ids = self.top_k_similar_df.loc[drug_id, "top_similar_ids"]
+            similar_embeddings = []
+            for sim_id in similar_ids:
+                sim_idx = self.id_to_idx.get(sim_id)
+                if sim_idx is not None:
+                    similar_embeddings.append(self.embeddings_array[sim_idx])
+            concat_embedding = np.concatenate([embedding] + similar_embeddings)
+            result.append(concat_embedding)
+        return np.stack(result)
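For orientation, a minimal sketch of how DefaultInputProcessor.process2 appears to be intended to be called, based only on the code above; the shapes and variable names are illustrative and not taken from the package's docs:

    import numpy as np
    from ddi_fw.datasets.processor import DefaultInputProcessor  # new in 0.0.267

    # Two modalities, 4 samples each, 8 features per modality (toy data).
    train_inputs = [np.random.rand(4, 8), np.random.rand(4, 8)]

    processor = DefaultInputProcessor()
    # force_stack=True takes the np.stack(..., axis=1) branch for lists of arrays.
    stacked = processor.process2(train_inputs, {"force_stack": True})
    print(stacked.shape)  # (4, 2, 8): samples x modalities x features

Note that ConcatInputProcessor implements process rather than process1/process2, and its constructor requires a dataset plus column names, so as released it would not work as a processor that is instantiated with no arguments and driven through process2 (which is how MultiModalRunner uses processors in the next hunk).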
{ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw/ml/ml_helper.py
@@ -84,9 +84,18 @@ class MultiModalRunner:
                     "'input' should be a single string. For multiple inputs, use 'inputs'.")
 
             # Get stacking and reshaping config
+            processor_type = m.get("processor", "ddi_fw.datasets.processor.DefaultInputProcessor")
+            processor = get_import(processor_type)  # Ensure the processor type is valid
             force_stack = m.get("force_stack", True)
             reshape_dims = m.get("reshape")
             train_data, train_label, test_data, test_label = None, None, None, None
+
+            # Prepare processing config with all context
+            processing_config = {
+                "force_stack": force_stack,
+                "reshape": reshape_dims
+            }
+
             # --- SINGLE INPUT CASE ---
             if input:
                 item = item_dict[input]
@@ -94,11 +103,12 @@ class MultiModalRunner:
                 train_label = item[2]
                 test_data = item[3]
                 test_label = item[4]
+
 
-                # Optional: force stack single input to simulate extra dimension
-                if force_stack:
-                    train_data = np.expand_dims(train_data, axis=1)
-                    test_data = np.expand_dims(test_data, axis=1)
+                # # Optional: force stack single input to simulate extra dimension
+                # if force_stack:
+                #     train_data = np.expand_dims(train_data, axis=1)
+                #     test_data = np.expand_dims(test_data, axis=1)
 
             # --- MULTIPLE INPUTS CASE ---
             elif inputs:
@@ -109,33 +119,37 @@ class MultiModalRunner:
                         f"No matching inputs found in item_dict for: {inputs}")
 
                 first_input = next(iter(filtered_dict.values()))
-                train_data_list = [f[1] for f in filtered_dict.values()]
-                test_data_list = [f[3] for f in filtered_dict.values()]
+                train_data = [f[1] for f in filtered_dict.values()]
+                test_data = [f[3] for f in filtered_dict.values()]
                 train_label = first_input[2]
                 test_label = first_input[4]
 
-                # Stack across inputs
-                if len(train_data_list) == 1:
-                    train_data = train_data_list[0]
-                    test_data = test_data_list[0]
+                # # Stack across inputs
+                # if len(train_data_list) == 1:
+                #     train_data = train_data_list[0]
+                #     test_data = test_data_list[0]
 
-                if force_stack:
-                    train_data = np.stack(train_data_list, axis=1)
-                    test_data = np.stack(test_data_list, axis=1)
+                # if force_stack:
+                #     train_data = np.stack(train_data_list, axis=1)
+                #     test_data = np.stack(test_data_list, axis=1)
 
-                else:
-                    # train_data = np.concatenate(train_data_list, axis=0)
-                    # test_data = np.concatenate(test_data_list, axis=0)
-                    train_data = np.array(train_data_list).T
-                    test_data = np.array(test_data_list).T
+                # else:
+                #     # train_data = np.concatenate(train_data_list, axis=0)
+                #     # test_data = np.concatenate(test_data_list, axis=0)
+                #     train_data = np.array(train_data_list).T
+                #     test_data = np.array(test_data_list).T
             else:
                 raise Exception("check configurations")
 
-            # --- OPTIONAL: Reshape if needed ---
-            if reshape_dims:
-                train_data = train_data.reshape((-1, *reshape_dims))
-                test_data = test_data.reshape((-1, *reshape_dims))
 
+            train_data = processor().process2(train_data, processing_config)
+            test_data = processor().process2(test_data, processing_config)
+            # # --- OPTIONAL: Reshape if needed ---
+            # if reshape_dims:
+            #     train_data = train_data.reshape((-1, *reshape_dims))
+            #     test_data = test_data.reshape((-1, *reshape_dims))
+
+
             # --- Finalize ---
             single_modal.set_data(
                 self.train_idx_arr, self.val_idx_arr,
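The practical effect of the ml_helper.py change is that each modality entry can now name its own input processor, with stacking and reshaping delegated to that processor's process2. A hedged sketch of such an entry; "processor", "force_stack" and "reshape" are the keys read in the hunk above, every other key is hypothetical context:

    # Hypothetical modality configuration for MultiModalRunner.
    # Only "processor", "force_stack" and "reshape" appear in the diff above;
    # "inputs" is assumed from the surrounding `input`/`inputs` handling.
    modality = {
        "inputs": ["smiles", "targets"],
        "processor": "ddi_fw.datasets.processor.DefaultInputProcessor",  # default in 0.0.267
        "force_stack": True,
        "reshape": None,
    }

The previous inline expand_dims/stack/reshape logic survives only as commented-out code, so the produced train/test arrays now depend entirely on the configured processor.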
{ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.266
+Version: 0.0.267
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
{ddi_fw-0.0.266 → ddi_fw-0.0.267}/src/ddi_fw.egg-info/SOURCES.txt
@@ -9,6 +9,7 @@ src/ddi_fw/datasets/__init__.py
 src/ddi_fw/datasets/core.py
 src/ddi_fw/datasets/dataset_splitter.py
 src/ddi_fw/datasets/db_utils.py
+src/ddi_fw/datasets/processor.py
 src/ddi_fw/datasets/setup_._py
 src/ddi_fw/langchain/__init__.py
 src/ddi_fw/langchain/chroma_storage.py
ddi_fw-0.0.266/src/ddi_fw/datasets/__init__.py
@@ -1,5 +0,0 @@
-from .core import BaseDataset,TextDatasetMixin
-from .dataset_splitter import DatasetSplitter
-__all__ = ['BaseDataset', 'TextDatasetMixin', 'DatasetSplitter']
-
-