ddi-fw 0.0.149__py3-none-any.whl → 0.0.151__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,12 @@
+ from typing import Any, Dict, List, Optional, Type, Union
  import numpy as np
  import pandas as pd
  import chromadb
  from collections import defaultdict
+ from chromadb.api.types import IncludeEnum
+
+ from pydantic import BaseModel
+ from ddi_fw.datasets.core import TextDatasetMixin
  from ddi_fw.ner.ner import CTakesNER
  from ddi_fw.langchain.embeddings import PoolingStrategy
  from ddi_fw.datasets import BaseDataset, DDIMDLDataset
@@ -10,44 +15,83 @@ import mlflow
  from ddi_fw.ml import MultiModalRunner
  
  
- class Pipeline:
-     def __init__(self,
-                  library='tensorflow',
-                  experiment_name=None,
-                  experiment_description=None,
-                  experiment_tags=None,
-                  artifact_location=None,
-                  tracking_uri=None,
-                  dataset_type: BaseDataset = None,
-                  columns=None,
-                  embedding_dict=None,
-                  column_embedding_configs=None,
-                  vector_db_persist_directory=None,
-                  vector_db_collection_name=None,
-                  embedding_pooling_strategy_type: PoolingStrategy = None,
-                  ner_data_file=None,
-                  ner_threshold=None,
-                  combinations=None,
-                  model=None,
-                  multi_modal = None ):
-         self.library = library
-         self.experiment_name = experiment_name
-         self.experiment_description = experiment_description
-         self.experiment_tags = experiment_tags
-         self.artifact_location = artifact_location
-         self.tracking_uri = tracking_uri
-         self.dataset_type = dataset_type
-         self.columns = columns
-         self.embedding_dict = embedding_dict
-         self.column_embedding_configs = column_embedding_configs
-         self.vector_db_persist_directory = vector_db_persist_directory
-         self.vector_db_collection_name = vector_db_collection_name
-         self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
-         self.ner_data_file = ner_data_file
-         self.ner_threshold = ner_threshold
-         self.combinations = combinations
-         self.model = model
-         self.multi_modal = multi_modal
+ class Pipeline(BaseModel):
+     library: str = 'tensorflow'
+     experiment_name: str
+     experiment_description: str
+     experiment_tags: Optional[Dict[str, Any]] = None
+     artifact_location: Optional[str] = None
+     tracking_uri: Optional[str] = None
+     dataset_type: Type[BaseDataset]
+     columns: Optional[List[str]] = None
+     embedding_dict: Optional[Dict[str, Any]] = None
+     column_embedding_configs: Optional[Dict] = None
+     vector_db_persist_directory: Optional[str] = None
+     vector_db_collection_name: Optional[str] = None
+     embedding_pooling_strategy_type: Type[PoolingStrategy] | None = None
+     ner_data_file: Optional[str] = None
+     ner_threshold: Optional[dict] = None
+     combinations: Optional[List[str]] = None
+     model: Optional[Any] = None
+     multi_modal: Optional[Any] = None
+     use_mlflow: bool = True
+     _items: List = []
+     _train_idx_arr: List | None = []
+     _val_idx_arr: List | None = []
+
+     @property
+     def items(self) -> List:
+         return self._items
+
+     @property
+     def train_idx_arr(self) -> List | None:
+         return self._train_idx_arr
+
+     @property
+     def val_idx_arr(self) -> List | None:
+         return self._val_idx_arr
+
+     class Config:
+         arbitrary_types_allowed = True
+
+ # class Pipeline:
+ #     def __init__(self,
+ #                  library='tensorflow',
+ #                  experiment_name=None,
+ #                  experiment_description=None,
+ #                  experiment_tags=None,
+ #                  artifact_location=None,
+ #                  tracking_uri=None,
+ #                  dataset_type: BaseDataset = None,
+ #                  columns=None,
+ #                  embedding_dict=None,
+ #                  column_embedding_configs=None,
+ #                  vector_db_persist_directory=None,
+ #                  vector_db_collection_name=None,
+ #                  embedding_pooling_strategy_type: PoolingStrategy = None,
+ #                  ner_data_file=None,
+ #                  ner_threshold=None,
+ #                  combinations=None,
+ #                  model=None,
+ #                  multi_modal = None ):
+ #         self.library = library
+ #         self.experiment_name = experiment_name
+ #         self.experiment_description = experiment_description
+ #         self.experiment_tags = experiment_tags
+ #         self.artifact_location = artifact_location
+ #         self.tracking_uri = tracking_uri
+ #         self.dataset_type = dataset_type
+ #         self.columns = columns
+ #         self.embedding_dict = embedding_dict
+ #         self.column_embedding_configs = column_embedding_configs
+ #         self.vector_db_persist_directory = vector_db_persist_directory
+ #         self.vector_db_collection_name = vector_db_collection_name
+ #         self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
+ #         self.ner_data_file = ner_data_file
+ #         self.ner_threshold = ner_threshold
+ #         self.combinations = combinations
+ #         self.model = model
+ #         self.multi_modal = multi_modal
  
      def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
          """
@@ -64,29 +108,50 @@ class Pipeline:
              vector_db = chromadb.PersistentClient(
                  path=vector_db_persist_directory)
              collection = vector_db.get_collection(vector_db_collection_name)
- 
+             include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
+             dictionary: chromadb.GetResult
              # Fetch the embeddings and metadata
              if column == None:
                  dictionary = collection.get(
-                     include=['embeddings', 'metadatas'])
+                     include=include
+                     # include=['embeddings', 'metadatas']
+                 )
                  print(
                      f"Embeddings are calculated from {vector_db_collection_name}")
              else:
-                 dictionary = collection.get(include=['embeddings', 'metadatas'], where={
-                     "type": {"$eq": f"{column}"}})
+                 dictionary = collection.get(
+                     include=include,
+                     # include=['embeddings', 'metadatas'],
+                     where={
+                         "type": {"$eq": f"{column}"}})
                  print(
                      f"Embeddings of {column} are calculated from {vector_db_collection_name}")
+
+             # if metadatas == None or embeddings == None:
+             if 'embeddings' not in dictionary or 'metadatas' not in dictionary or not dictionary['embeddings'] or not dictionary['metadatas']:
+                 raise ValueError(
+                     "The collection does not contain embeddings or metadatas.")
+
              # Populate the embedding dictionary with embeddings from the vector database
-             for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
+             metadatas = dictionary["metadatas"]
+             embeddings = dictionary["embeddings"]
+
+             for metadata, embedding in zip(metadatas, embeddings):
                  embedding_dict[metadata["type"]
                                 ][metadata["id"]].append(embedding)
  
-             # return dictionary['embeddings'].shape[1]
          else:
              raise ValueError(
                  "Persistent directory for the vector DB is not specified.")
  
      def build(self):
+         if self.embedding_pooling_strategy_type is not None and not isinstance(self.embedding_pooling_strategy_type, type):
+             raise TypeError(
+                 "self.embedding_pooling_strategy_type must be a class, not an instance")
+         if not isinstance(self.dataset_type, type):
+             raise TypeError(
+                 "self.dataset_type must be a class, not an instance")
+
          # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
          kwargs = {"columns": self.columns}
          if self.ner_threshold:
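
In `__create_or_update_embeddings__`, the bare string literals formerly passed to `collection.get(include=...)` are replaced with `chromadb.api.types.IncludeEnum` members, the result is annotated as `chromadb.GetResult`, and an explicit guard rejects collections missing embeddings or metadatas. A standalone sketch of the same fetch pattern (the path, collection name, and metadata value are placeholders; assumes chromadb>=0.5.15 as pinned in METADATA):

```python
# Minimal sketch of the enum-based fetch; all names are placeholders.
import chromadb
from chromadb.api.types import IncludeEnum

client = chromadb.PersistentClient(path="./chroma")
collection = client.get_collection("embeddings")

result = collection.get(
    include=[IncludeEnum.embeddings, IncludeEnum.metadatas],
    where={"type": {"$eq": "description"}},  # metadata filter, as in the pipeline
)
# result is a chromadb.GetResult (a TypedDict); check that the "embeddings"
# and "metadatas" fields are present and non-empty before iterating,
# as the updated method now does.
```
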
@@ -103,90 +168,89 @@ class Pipeline:
                      col_db_collection = item["vector_db_collection_name"]
                      self.__create_or_update_embeddings__(
                          embedding_dict, col_db_dir, col_db_collection, col)
- 
+
              elif self.vector_db_persist_directory:
                  self.__create_or_update_embeddings__(
                      embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
- 
+
              else:
                  print(
                      f"There is no configuration of Embeddings")
  
-             # if self.embedding_dict == None:
-             #     if self.vector_db_persist_directory:
-             #         self.vector_db = chromadb.PersistentClient(
-             #             path=self.vector_db_persist_directory)
-             #         self.collection = self.vector_db.get_collection(
-             #             self.vector_db_collection_name)
-             #         dictionary = self.collection.get(
-             #             include=['embeddings', 'metadatas'])
- 
-             #         embedding_dict = defaultdict(lambda: defaultdict(list))
- 
-             #         for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
-             #             embedding_dict[metadata["type"]
-             #                            ][metadata["id"]].append(embedding)
- 
-             #         embedding_size = dictionary['embeddings'].shape[1]
          else:
              embedding_dict = self.embedding_dict
              # TODO make generic
              # embedding_size = list(embedding_dict['all_text'].values())[
              # 0][0].shape
-         key, value = next(iter(embedding_dict.items()))
-         embedding_size = value[next(iter(value))][0].shape[0]
-         pooling_strategy = self.embedding_pooling_strategy_type()
  
-         self.ner_df = CTakesNER().load(
-             filename=self.ner_data_file) if self.ner_data_file else None
+         # self.ner_df = CTakesNER(df=None).load(
+         #     filename=self.ner_data_file) if self.ner_data_file else None
+
+         if issubclass(self.dataset_type, TextDatasetMixin):
+             key, value = next(iter(embedding_dict.items()))
+             embedding_size = value[next(iter(value))][0].shape[0]
+             pooling_strategy = self.embedding_pooling_strategy_type(
+             ) if self.embedding_pooling_strategy_type else None
+
+             dataset = self.dataset_type(
+                 embedding_dict=embedding_dict,
+                 embedding_size=embedding_size,
+                 embeddings_pooling_strategy=pooling_strategy,
+                 **kwargs)
+         else:
+             dataset = self.dataset_type(**kwargs)
+
+         X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = dataset.load()
  
-         self.dataset = self.dataset_type(
-             embedding_dict=embedding_dict,
-             embedding_size=embedding_size,
-             embeddings_pooling_strategy=pooling_strategy,
-             ner_df=self.ner_df, **kwargs)
+         dataframe = dataset.dataframe
  
-         X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset.load()
+         if dataframe is None:  # if the dataframe is None, it means that the dataset is not loaded
+             raise ValueError("The dataset is not loaded")
  
-         self.dataframe = self.dataset.dataframe
          # dataframe.dropna()
-         self.X_train = self.dataset.X_train
-         self.X_test = self.dataset.X_test
-         self.y_train = self.dataset.y_train
-         self.y_test = self.dataset.y_test
-         self.train_idx_arr = self.dataset.train_idx_arr
-         self.val_idx_arr = self.dataset.val_idx_arr
+         X_train = dataset.X_train
+         X_test = dataset.X_test
+         y_train = dataset.y_train
+         y_test = dataset.y_test
+         self._train_idx_arr = dataset.train_idx_arr
+         self._val_idx_arr = dataset.val_idx_arr
          # Logic to set up the experiment
          # column name, train data, train label, test data, test label
-         self.items = self.dataset.produce_inputs()
+         self._items = dataset.produce_inputs()
  
-         unique_classes = pd.unique(self.dataframe['event_category'])
-         event_num = len(unique_classes)
+         # unique_classes = pd.unique(dataframe[dataset.class_column])
+         # event_num = len(unique_classes)
          # droprate = 0.3
-         vector_size = self.dataset.drugs_df.shape[0]
+         # vector_size = self.dataset.drugs_df.shape[0]
  
          print("Building the experiment with the following settings:")
          print(
-             f"Name: {self.experiment_name}, Dataset: {self.dataset}, Model: {self.model}")
+             f"Name: {self.experiment_name}, Dataset: {dataset}, Model: {self.model}")
          # Implement additional build logic as needed
          return self
  
      def run(self):
-         mlflow.set_tracking_uri(self.tracking_uri)
+         if self.use_mlflow:
+             if self.tracking_uri is None:
+                 raise ValueError("Tracking uri should be specified")
+             mlflow.set_tracking_uri(self.tracking_uri)
  
-         if mlflow.get_experiment_by_name(self.experiment_name) == None:
-             mlflow.create_experiment(
-                 self.experiment_name, self.artifact_location)
-         mlflow.set_experiment_tags(self.experiment_tags)
-         mlflow.set_experiment(self.experiment_name)
+             if mlflow.get_experiment_by_name(self.experiment_name) == None:
+                 mlflow.create_experiment(
+                     self.experiment_name, self.artifact_location)
+             if self.experiment_tags is not None:
+                 mlflow.set_experiment_tags(self.experiment_tags)
+             mlflow.set_experiment(self.experiment_name)
  
          y_test_label = self.items[0][4]
-         multi_modal_runner = MultiModalRunner(library=self.library, multi_modal = self.multi_modal)
+         multi_modal_runner = MultiModalRunner(
+             library=self.library, multi_modal=self.multi_modal)
          # multi_modal_runner = MultiModalRunner(
          #     library=self.library, model_func=model_func, batch_size=batch_size, epochs=epochs)
          # multi_modal = TFMultiModal(
          #     model_func=model_func, batch_size=batch_size, epochs=epochs) # 100
          multi_modal_runner.set_data(
              self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
-         result = multi_modal_runner.predict(self.combinations)
+         combinations = self.combinations if self.combinations is not None else []
+         result = multi_modal_runner.predict(combinations)
          return result
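
Two behavioral changes stand out in `build()` and `run()`: dataset construction now dispatches on `issubclass(self.dataset_type, TextDatasetMixin)`, so only text datasets receive the embedding dictionary and pooling strategy, and every MLflow call is gated behind the new `use_mlflow` flag, with experiment tags set only when provided. A condensed sketch of the gating pattern in isolation (URI, experiment name, and tags are placeholder values):

```python
# Standalone sketch of the optional-MLflow pattern from run();
# tracking_uri, experiment_name, and experiment_tags are placeholders.
import mlflow

use_mlflow = True
tracking_uri = "http://localhost:5000"
experiment_name = "ddi-experiment"
experiment_tags = {"stage": "dev"}

if use_mlflow:
    if tracking_uri is None:
        raise ValueError("Tracking uri should be specified")
    mlflow.set_tracking_uri(tracking_uri)
    # create the experiment only if it does not exist yet
    if mlflow.get_experiment_by_name(experiment_name) is None:
        mlflow.create_experiment(experiment_name)
    if experiment_tags is not None:  # tags are optional now
        mlflow.set_experiment_tags(experiment_tags)
    mlflow.set_experiment(experiment_name)
# When use_mlflow is False, the pipeline proceeds straight to prediction
# without touching any MLflow state.
```
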
@@ -1,4 +1,3 @@
- from compress_json import compress, decompress
  import json
  import sys
  
@@ -9,17 +8,4 @@ def minify(folder, file_name):
      json_string = json.dumps(json_data, separators=(',', ":")) # Compact JSON structure
      file_name = str(file_name).replace(".json", "") # remove .json from end of file_name string
      new_file_name = folder+"/{0}_minify.json".format(file_name)
-     open(new_file_name, "w+", 1).write(json_string) # open and write json_string to file
- 
- json_file = f'C:\\Users\\kivanc\\Downloads\\metrics.json'
- data_file = f'C:\\Users\\kivanc\\Downloads\\data.json'
- 
- minify('C:\\Users\\kivanc\\Downloads','metrics.json')
- 
- # with open(json_file, 'r', encoding="utf8") as f:
- #     data = json.load(f)
- 
- # compressed = compress(data) # the result is a list (array)
- 
- # with open(data_file, "w") as fd:
- #     fd.write(json.dumps(compressed)) # convert into string if needed
+     open(new_file_name, "w+", 1).write(json_string) # open and write json_string to file
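
The removed module-level statements mean importing this module no longer executes `minify` against a hard-coded `C:\Users\kivanc\Downloads` path; callers now invoke it explicitly. A hedged usage sketch (the RECORD diff suggests this function now lives in `ddi_fw/utils/json_helper.py`, though this hunk's filename is not shown; the paths are placeholders):

```python
# Hypothetical call site; the module path is inferred from the RECORD diff
# and the folder/file arguments are placeholders.
from ddi_fw.utils.json_helper import minify

# Reads metrics.json from the folder, strips whitespace via
# json.dumps(..., separators=(',', ':')), and writes metrics_minify.json.
minify("/tmp/reports", "metrics.json")
```
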
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: ddi_fw
- Version: 0.0.149
+ Version: 0.0.151
  Summary: Do not use :)
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -22,6 +22,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
+ Requires-Dist: pydantic==2.10.6
  Requires-Dist: importlib-resources==6.4.5
  Requires-Dist: python-stopwatch==1.1.11
  Requires-Dist: lxml==5.3.0
@@ -35,7 +36,7 @@ Requires-Dist: rdkit==2023.3.3
  Requires-Dist: scikit-learn==1.5.2
  Requires-Dist: scipy==1.13.1
  Requires-Dist: accelerate>=0.33.0
- Requires-Dist: sentence-transformers>=3.0.1
+ Requires-Dist: sentence-transformers<=3.3.1,>=3.0.1
  Requires-Dist: transformers>=4.42.4
  Requires-Dist: stanza==1.9.2
  Requires-Dist: tokenizers>=0.19.1
@@ -49,3 +50,5 @@ Requires-Dist: chromadb>=0.5.15
  Requires-Dist: langchain_community==0.3.3
  Requires-Dist: datasets==3.0.2
  Requires-Dist: unstructured==0.16.3
+ Requires-Dist: tensorflow<2.18.0,>=2.17.0
+ Requires-Dist: tf-keras==2.17.0
@@ -1,11 +1,13 @@
- ddi_fw/datasets/__init__.py,sha256=M74mFusbhvnoJw_F9Vljhtum5JYRu_rWe50zBeorVYQ,399
- ddi_fw/datasets/core.py,sha256=z-H-UWIwLjnJVC7QwZQIK4tC8v00XJtjSeA1cYssxAI,17110
+ ddi_fw/datasets/__init__.py,sha256=yDsRQD_9Ijpm_Rl2wSDwdutG5Q_wca_UBPEvm7nBx04,444
+ ddi_fw/datasets/core.py,sha256=JA6WJz3VCUfxI85rYE7ZBqC4pnn7L8NSS9-EgjLw710,7968
+ ddi_fw/datasets/dataset_splitter.py,sha256=lLIelXv-8rCK0tbwLNgHBHYUO_65HT-_kErAlZhRQVE,1662
  ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
- ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
+ ddi_fw/datasets/embedding_generator.py,sha256=jiDKwLaPMaQkloxQkuCrhl-A-2OdvocmkSzjWtUnk4g,2255
  ddi_fw/datasets/feature_vector_generation.py,sha256=gvjpEzkgVV8dp4V8NMMv59u0v-1tNAJ7v83R-keWGoA,4748
  ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
  ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
- ddi_fw/datasets/ddi_mdl/base.py,sha256=ynM99u8rx82F0dzlkEXcPGsHHOXEvIbPxiZ9GCi-8wo,6165
+ ddi_fw/datasets/ddi_mdl/base.py,sha256=usw3AhBCjdYwZx9MMnyNaUYTEyYXoRSO4fNJJHxnPuk,9312
+ ddi_fw/datasets/ddi_mdl/debug.log,sha256=eWz05j8RFqZuHFDTCF7Rck5w4rvtTanFN21iZsgxO7Y,115
  ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
  ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
  ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt,sha256=XVlDqYATckrQwNSXqMSKVBqyoN_Hg8SK6CL-XMdLADY,102176
@@ -70,43 +72,32 @@ ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9P
  ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
  ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
  ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
- ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
+ ddi_fw/langchain/embeddings.py,sha256=XzIYgmqnAO93pnavKRDhYDoz0RhDn-RoC7CDc0yAvbM,7572
  ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
  ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
  ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
- ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
- ddi_fw/ml/ml_helper.py,sha256=xSEa_UNpaFyrPswlQcDfZSI2x5nZLStOiKoP54SYkCM,6454
- ddi_fw/ml/model_wrapper.py,sha256=kc01_TVJuriUvNI6ABnLngnJWvmG_Y7-XJ6XMusLJ8U,1088
- ddi_fw/ml/pytorch_wrapper.py,sha256=AkG-2sKDXr0IBhgmkbjG0i20OuwQv3mhdvqp6UvJDCA,3716
- ddi_fw/ml/tensorflow_wrapper.py,sha256=DkW3aVWsPrzA87eGz5XTkiPBRb-Sb-z4tvOUcAZc2r0,6396
+ ddi_fw/ml/evaluation_helper.py,sha256=JFATMquaQVa2gckxmEivCztZmivWBAAP7EpJ8PVeI3c,7626
+ ddi_fw/ml/ml_helper.py,sha256=E6ef7f1UnQl6JBUdGDbbbI4FIS-904VGypT7tI0a598,8545
+ ddi_fw/ml/model_wrapper.py,sha256=kabPXuo7S8tGkp9a00V04n4rXDmv7dD8wYGMjotISRc,1050
+ ddi_fw/ml/pytorch_wrapper.py,sha256=pe6UsjP2XeTgLxDnIUiodoyhJTGCxV27wD4Cjxysu2Q,8553
+ ddi_fw/ml/tensorflow_wrapper.py,sha256=jt6h9Q-wF0mkbnvV6yCCl1SpUd2paHK70Bu6EFrkmd0,10112
  ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
  ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6859
- ddi_fw/ner/ner.py,sha256=BEs9AFljAxOQrC2BEP1raSzRoypcfELS5UTdl4bjTqw,15863
+ ddi_fw/ner/ner.py,sha256=FHyyX53Xwpdw8Hec261dyN88yD7Z9LmJua2mIrQLguI,17967
  ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
- ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=qIst7vxHaOAhRv4lgozszwa3b1QE4aIrN74t41Xnvr4,1637
- ddi_fw/pipeline/multi_pipeline.py,sha256=G8ONZdfwjGZRI2PrzMOaET6w5AUcmgYzMtaV6j5Hbz0,5981
- ddi_fw/pipeline/ner_pipeline.py,sha256=wB7hz4YCOv7UAz6bGE6sSpPXXIdoOflOVK5UCc1fO-o,5586
- ddi_fw/pipeline/pipeline.py,sha256=-1zGbSJapmUSx9xltJLQajmUCeZdT-9Ow0cC6JZ92y0,8984
- ddi_fw/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ddi_fw/test/basic_test.py,sha256=fEOGcZm1ObnsDvMiXNmdmz6YCeUrGc8V0DwlSwGhsq8,376
- ddi_fw/test/combination_test.py,sha256=TWNE8sf-DSh1Q9-yRaRBc774Sn1kSMGXLwQhd2_Qynk,324
- ddi_fw/test/compress_json_test.py,sha256=BGny56YqiG-pzhMoDzLKQBQI1E7o3jU0S7VYWtclAx4,1045
- ddi_fw/test/date_test.py,sha256=QmJ97ennS9LxLl8mGBkM2ob8_KWEFmiLakZTI9zQxxo,532
- ddi_fw/test/idf_score.py,sha256=YsAur-F1T3eFxn9KrcK3VXCvrsV_LXrpHxPjMKZeQZ8,1523
- ddi_fw/test/jaccard_similarity.py,sha256=pf6SNI52RCUZ0otx_1cz7A0p7kyfoCZv13Tbc_rxfuw,2382
- ddi_fw/test/mlfow_test.py,sha256=L2hJAeIU5PDSxsyWTtV6PY0bfaWerWUJ1buni9BTjXo,4853
- ddi_fw/test/sklearn-tfidf.py,sha256=cjtg27vLskEMXgrsqUR_EapRGVd4xgwOQ9zYsu72zjs,657
- ddi_fw/test/test.py,sha256=zJh9ZBcZl-vZIFDvuftcRrRV8WAwtiFVhPPd6Et4OU4,2997
- ddi_fw/test/torch_cuda_test.py,sha256=R-4VGVErl_Ufk54DoZbgL_YXWoCYFyanIVWd6P39IEk,312
- ddi_fw/test/type_guarding_test.py,sha256=KxjyBxohDu7lwpejalCj-REjtJ-k1S1wQbOB6TGY0O8,766
+ ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
+ ddi_fw/pipeline/multi_pipeline.py,sha256=D_BZ3ciHbVGuuB7m7cEmVQHESruh1gqhA-vxCMfNKj0,5407
+ ddi_fw/pipeline/ner_pipeline.py,sha256=q1aKjb54Ra1HzZ7dARvBw6lB37je9R-POEf2h6QT_nU,6018
+ ddi_fw/pipeline/pipeline.py,sha256=NPew1lESAiuXUKR4Ob9R4LwRh2Xe1qfnqZDfmuMuC7k,11253
  ddi_fw/utils/__init__.py,sha256=77563ikqAtdzjjgRlLp5OAsJBbpLA1Cao8iecGaVUXQ,354
  ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
+ ddi_fw/utils/json_helper.py,sha256=BVU6wmJgdXPxyqLPu3Ck_9Es5RrP1PDanKvE-OSj1D4,571
  ddi_fw/utils/kaggle.py,sha256=wKRJ18KpQ6P-CubpZklEgsDtyFpR9RUL1_HyyF6ttEE,2425
  ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5I,1056
  ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
  ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
- ddi_fw-0.0.149.dist-info/METADATA,sha256=ujgYzxc29yv5J7ltzxgEN1MSrTV5SkN3NNBseekoLEA,1965
- ddi_fw-0.0.149.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- ddi_fw-0.0.149.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
- ddi_fw-0.0.149.dist-info/RECORD,,
+ ddi_fw-0.0.151.dist-info/METADATA,sha256=cTz-LpUrPhCU0uKQ2A9oE3lm5uaI3ra3nFHufSoi8hA,2082
+ ddi_fw-0.0.151.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+ ddi_fw-0.0.151.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ ddi_fw-0.0.151.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.6.0)
+ Generator: setuptools (78.0.2)
  Root-Is-Purelib: true
  Tag: py3-none-any
  
ddi_fw/test/__init__.py DELETED
File without changes
ddi_fw/test/basic_test.py DELETED
@@ -1,15 +0,0 @@
- import json
- 
- 
- class Metrics():
-     def __init__(self, precision, recall, roc_aupr, roc_auc):
-         self.precision = precision
-         self.recall = recall
-         self.roc_aupr = roc_aupr
-         self.roc_auc = roc_auc
- 
- 
- m = Metrics( 0.96, 0.96, {"micro": 0.99, "macro": 0.88}, {"micro": 0.99, "macro": 0.88})
- 
- as_json = json.dumps(m.__dict__)
- print(as_json)
ddi_fw/test/combination_test.py DELETED
@@ -1,12 +0,0 @@
- import itertools
- 
- l = ['e1','e2','e3','e4','e5']
- all_combinations = []
- for i in range(2, len(l) + 1):
-     all_combinations.extend(list(itertools.combinations(l, i)))
- 
- print(all_combinations)
- 
- for combination in all_combinations:
-     combination_descriptor = '-'.join(combination)
-     print(combination_descriptor)
ddi_fw/test/date_test.py DELETED
@@ -1,15 +0,0 @@
- from datetime import datetime, timezone
- 
- local_datetime = datetime.now()
- utc_datetime = datetime.now(timezone.utc)
- 
- local_iso_str = datetime.strftime(local_datetime, "%Y-%m-%dT%H:%M:%S.%f")[:-3]
- utc_iso_str = datetime.strftime(utc_datetime, "%Y-%m-%dT%H:%M:%S.%f")[:-3]
- 
- print(f"local dt: {local_iso_str}, tzname: {local_datetime.tzname()}")
- print(f" utc dt: {utc_iso_str}, tzname: {utc_datetime.tzname()}")
- 
- print("\n")
- 
- print(f"local dt: {local_datetime.isoformat()}")
- print(f" utc dt: {utc_datetime.isoformat()}")
ddi_fw/test/idf_score.py DELETED
@@ -1,54 +0,0 @@
- import pandas as pd
- import numpy as np
- 
- # from ddi_fw.datasets.feature_vector_generation import find_distinct_elements
- def find_distinct_elements(frame):
-     y = set()
-     for x in frame:
-         if x is not None:
-             for k in x:
-                 y.add(k)
-     return y
- 
- def calculate_idf(series):
-     idf_scores = {}
-     distinct_items = find_distinct_elements(series)
-     sorted_distinct_items = sorted(distinct_items)
-     total_document_number = len(all_data)
-     for item in sorted_distinct_items:
-         document_freq = series.map(set([item]).issubset).sum()
-         idf = np.log(total_document_number/document_freq)
-         idf_scores[item] = idf
-     return idf_scores
- 
- 
- item1 = 'T001|T002|T001|T001'
- item2 = 'T002|T003'
- item3 = 'T004|T005'
- 
- 
- all_data = [item1, item2, item3]
- 
- df = pd.DataFrame(all_data, columns=['tui_description'])
- 
- df['tui_description'] = df['tui_description'].apply(
-     lambda x: x.split('|') if x is not None else [])
- 
- print(df.head())
- 
- idf_scores = calculate_idf(df['tui_description'])
- idf_scores_sorted_desc = sorted(idf_scores.items(), key=lambda x:x[1], reverse=True)
- threshold = 1
- keys_over_threshold = [k for k,v in idf_scores.items() if v > threshold]
- 
- print(idf_scores_sorted_desc)
- print(keys_over_threshold)
- 
- 
- def remove_items_by_idf_score(items):
-     return [item for item in items if item in keys_over_threshold]
- 
- df['tui_description'] = df['tui_description'].apply(
-     remove_items_by_idf_score)
- 
- print(df)