ddi-fw 0.0.149__py3-none-any.whl → 0.0.150__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/pipeline/pipeline.py CHANGED
@@ -1,7 +1,12 @@
+ from typing import Any, Dict, List, Optional, Type, Union
  import numpy as np
  import pandas as pd
  import chromadb
  from collections import defaultdict
+ from chromadb.api.types import IncludeEnum
+
+ from pydantic import BaseModel
+ from ddi_fw.datasets.core import TextDatasetMixin
  from ddi_fw.ner.ner import CTakesNER
  from ddi_fw.langchain.embeddings import PoolingStrategy
  from ddi_fw.datasets import BaseDataset, DDIMDLDataset
@@ -10,44 +15,81 @@ import mlflow
  from ddi_fw.ml import MultiModalRunner


- class Pipeline:
-     def __init__(self,
-                  library='tensorflow',
-                  experiment_name=None,
-                  experiment_description=None,
-                  experiment_tags=None,
-                  artifact_location=None,
-                  tracking_uri=None,
-                  dataset_type: BaseDataset = None,
-                  columns=None,
-                  embedding_dict=None,
-                  column_embedding_configs=None,
-                  vector_db_persist_directory=None,
-                  vector_db_collection_name=None,
-                  embedding_pooling_strategy_type: PoolingStrategy = None,
-                  ner_data_file=None,
-                  ner_threshold=None,
-                  combinations=None,
-                  model=None,
-                  multi_modal = None ):
-         self.library = library
-         self.experiment_name = experiment_name
-         self.experiment_description = experiment_description
-         self.experiment_tags = experiment_tags
-         self.artifact_location = artifact_location
-         self.tracking_uri = tracking_uri
-         self.dataset_type = dataset_type
-         self.columns = columns
-         self.embedding_dict = embedding_dict
-         self.column_embedding_configs = column_embedding_configs
-         self.vector_db_persist_directory = vector_db_persist_directory
-         self.vector_db_collection_name = vector_db_collection_name
-         self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
-         self.ner_data_file = ner_data_file
-         self.ner_threshold = ner_threshold
-         self.combinations = combinations
-         self.model = model
-         self.multi_modal = multi_modal
+ class Pipeline(BaseModel):
+     library: str = 'tensorflow'
+     experiment_name: str
+     experiment_description: str
+     experiment_tags: Optional[Dict[str, Any]] = None
+     artifact_location: Optional[str] = None
+     tracking_uri: Optional[str] = None
+     dataset_type: Type[BaseDataset]
+     columns: Optional[List[str]] = None
+     embedding_dict: Optional[Dict[str, Any]] = None
+     column_embedding_configs: Optional[Dict] = None
+     vector_db_persist_directory: Optional[str] = None
+     vector_db_collection_name: Optional[str] = None
+     embedding_pooling_strategy_type: Type[PoolingStrategy] | None = None
+     ner_data_file: Optional[str] = None
+     ner_threshold: Optional[dict] = None
+     combinations: Optional[List[str]] = None
+     model: Optional[Any] = None
+     multi_modal: Optional[Any] = None
+     use_mlflow: bool = True
+     _items: List = []
+     _train_idx_arr: List | None = []
+     _val_idx_arr: List | None = []
+
+     @property
+     def items(self) -> List:
+         return self._items
+     @property
+     def train_idx_arr(self) -> List | None:
+         return self._train_idx_arr
+     @property
+     def val_idx_arr(self) -> List | None:
+         return self._val_idx_arr
+
+     class Config:
+         arbitrary_types_allowed = True
+
+ # class Pipeline:
+ #     def __init__(self,
+ #                  library='tensorflow',
+ #                  experiment_name=None,
+ #                  experiment_description=None,
+ #                  experiment_tags=None,
+ #                  artifact_location=None,
+ #                  tracking_uri=None,
+ #                  dataset_type: BaseDataset = None,
+ #                  columns=None,
+ #                  embedding_dict=None,
+ #                  column_embedding_configs=None,
+ #                  vector_db_persist_directory=None,
+ #                  vector_db_collection_name=None,
+ #                  embedding_pooling_strategy_type: PoolingStrategy = None,
+ #                  ner_data_file=None,
+ #                  ner_threshold=None,
+ #                  combinations=None,
+ #                  model=None,
+ #                  multi_modal = None ):
+ #         self.library = library
+ #         self.experiment_name = experiment_name
+ #         self.experiment_description = experiment_description
+ #         self.experiment_tags = experiment_tags
+ #         self.artifact_location = artifact_location
+ #         self.tracking_uri = tracking_uri
+ #         self.dataset_type = dataset_type
+ #         self.columns = columns
+ #         self.embedding_dict = embedding_dict
+ #         self.column_embedding_configs = column_embedding_configs
+ #         self.vector_db_persist_directory = vector_db_persist_directory
+ #         self.vector_db_collection_name = vector_db_collection_name
+ #         self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
+ #         self.ner_data_file = ner_data_file
+ #         self.ner_threshold = ner_threshold
+ #         self.combinations = combinations
+ #         self.model = model
+ #         self.multi_modal = multi_modal

      def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
          """
@@ -64,29 +106,47 @@ class Pipeline:
              vector_db = chromadb.PersistentClient(
                  path=vector_db_persist_directory)
              collection = vector_db.get_collection(vector_db_collection_name)
-
+             include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
+             dictionary: chromadb.GetResult
              # Fetch the embeddings and metadata
              if column == None:
                  dictionary = collection.get(
-                     include=['embeddings', 'metadatas'])
+                     include=include
+                     # include=['embeddings', 'metadatas']
+                 )
                  print(
                      f"Embeddings are calculated from {vector_db_collection_name}")
              else:
-                 dictionary = collection.get(include=['embeddings', 'metadatas'], where={
-                     "type": {"$eq": f"{column}"}})
+                 dictionary = collection.get(
+                     include=include,
+                     # include=['embeddings', 'metadatas'],
+                     where={
+                         "type": {"$eq": f"{column}"}})
                  print(
                      f"Embeddings of {column} are calculated from {vector_db_collection_name}")
+
              # Populate the embedding dictionary with embeddings from the vector database
-             for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
+             metadatas = dictionary["metadatas"]
+             embeddings = dictionary["embeddings"]
+             if metadatas == None or embeddings == None:
+                 raise ValueError(
+                     "The collection does not contain embeddings or metadatas.")
+             for metadata, embedding in zip(metadatas, embeddings):
                  embedding_dict[metadata["type"]
                                 ][metadata["id"]].append(embedding)

-             # return dictionary['embeddings'].shape[1]
          else:
              raise ValueError(
                  "Persistent directory for the vector DB is not specified.")

      def build(self):
+         if self.embedding_pooling_strategy_type is not None and not isinstance(self.embedding_pooling_strategy_type, type):
+             raise TypeError(
+                 "self.embedding_pooling_strategy_type must be a class, not an instance")
+         if not isinstance(self.dataset_type, type):
+             raise TypeError(
+                 "self.dataset_type must be a class, not an instance")
+
          # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
          kwargs = {"columns": self.columns}
          if self.ner_threshold:
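
The string literals replaced above are swapped for chromadb's typed include API (IncludeEnum members instead of raw strings). A standalone sketch of the same fetch pattern; the persist directory and collection name here are hypothetical:

    import chromadb
    from chromadb.api.types import IncludeEnum

    client = chromadb.PersistentClient(path="./chroma_db")    # hypothetical path
    collection = client.get_collection("ddi_embeddings")      # hypothetical name
    result = collection.get(
        include=[IncludeEnum.embeddings, IncludeEnum.metadatas],
        where={"type": {"$eq": "smile"}},  # same per-column filter shape build() uses
    )
    for metadata, embedding in zip(result["metadatas"], result["embeddings"]):
        print(metadata["type"], metadata["id"], len(embedding))
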
@@ -103,90 +163,88 @@ class Pipeline:
                      col_db_collection = item["vector_db_collection_name"]
                      self.__create_or_update_embeddings__(
                          embedding_dict, col_db_dir, col_db_collection, col)
-
+
              elif self.vector_db_persist_directory:
                  self.__create_or_update_embeddings__(
                      embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
-
+
              else:
                  print(
                      f"There is no configuration of Embeddings")

-         # if self.embedding_dict == None:
-         #     if self.vector_db_persist_directory:
-         #         self.vector_db = chromadb.PersistentClient(
-         #             path=self.vector_db_persist_directory)
-         #         self.collection = self.vector_db.get_collection(
-         #             self.vector_db_collection_name)
-         #         dictionary = self.collection.get(
-         #             include=['embeddings', 'metadatas'])
-
-         #         embedding_dict = defaultdict(lambda: defaultdict(list))
-
-         #         for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
-         #             embedding_dict[metadata["type"]
-         #                            ][metadata["id"]].append(embedding)
-
-         #         embedding_size = dictionary['embeddings'].shape[1]
          else:
              embedding_dict = self.embedding_dict
              # TODO make generic
              # embedding_size = list(embedding_dict['all_text'].values())[
              # 0][0].shape
-         key, value = next(iter(embedding_dict.items()))
-         embedding_size = value[next(iter(value))][0].shape[0]
-         pooling_strategy = self.embedding_pooling_strategy_type()

-         self.ner_df = CTakesNER().load(
-             filename=self.ner_data_file) if self.ner_data_file else None
+         # self.ner_df = CTakesNER(df=None).load(
+         #     filename=self.ner_data_file) if self.ner_data_file else None

-         self.dataset = self.dataset_type(
-             embedding_dict=embedding_dict,
-             embedding_size=embedding_size,
-             embeddings_pooling_strategy=pooling_strategy,
-             ner_df=self.ner_df, **kwargs)
+         if issubclass(self.dataset_type, TextDatasetMixin):
+             key, value = next(iter(embedding_dict.items()))
+             embedding_size = value[next(iter(value))][0].shape[0]
+             pooling_strategy = self.embedding_pooling_strategy_type() if self.embedding_pooling_strategy_type else None
+
+             dataset = self.dataset_type(
+                 embedding_dict=embedding_dict,
+                 embedding_size=embedding_size,
+                 embeddings_pooling_strategy=pooling_strategy,
+                 **kwargs)
+         else:
+             dataset = self.dataset_type(**kwargs)

-         X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset.load()
+         X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = dataset.load()
+
+         dataframe = dataset.dataframe
+
+         if dataframe is None:  # if the dataframe is None, it means that the dataset is not loaded
+             raise ValueError("The dataset is not loaded")

-         self.dataframe = self.dataset.dataframe
          # dataframe.dropna()
-         self.X_train = self.dataset.X_train
-         self.X_test = self.dataset.X_test
-         self.y_train = self.dataset.y_train
-         self.y_test = self.dataset.y_test
-         self.train_idx_arr = self.dataset.train_idx_arr
-         self.val_idx_arr = self.dataset.val_idx_arr
+         X_train = dataset.X_train
+         X_test = dataset.X_test
+         y_train = dataset.y_train
+         y_test = dataset.y_test
+         self._train_idx_arr = dataset.train_idx_arr
+         self._val_idx_arr = dataset.val_idx_arr
          # Logic to set up the experiment
          # column name, train data, train label, test data, test label
-         self.items = self.dataset.produce_inputs()
+         self._items = dataset.produce_inputs()

-         unique_classes = pd.unique(self.dataframe['event_category'])
-         event_num = len(unique_classes)
+         # unique_classes = pd.unique(dataframe[dataset.class_column])
+         # event_num = len(unique_classes)
          # droprate = 0.3
-         vector_size = self.dataset.drugs_df.shape[0]
+         # vector_size = self.dataset.drugs_df.shape[0]

          print("Building the experiment with the following settings:")
          print(
-             f"Name: {self.experiment_name}, Dataset: {self.dataset}, Model: {self.model}")
+             f"Name: {self.experiment_name}, Dataset: {dataset}, Model: {self.model}")
          # Implement additional build logic as needed
          return self

      def run(self):
-         mlflow.set_tracking_uri(self.tracking_uri)
-
-         if mlflow.get_experiment_by_name(self.experiment_name) == None:
-             mlflow.create_experiment(
-                 self.experiment_name, self.artifact_location)
-         mlflow.set_experiment_tags(self.experiment_tags)
-         mlflow.set_experiment(self.experiment_name)
+         if self.use_mlflow:
+             if self.tracking_uri is None:
+                 raise ValueError("Tracking uri should be specified")
+             mlflow.set_tracking_uri(self.tracking_uri)
+
+             if mlflow.get_experiment_by_name(self.experiment_name) == None:
+                 mlflow.create_experiment(
+                     self.experiment_name, self.artifact_location)
+             if self.experiment_tags is not None:
+                 mlflow.set_experiment_tags(self.experiment_tags)
+             mlflow.set_experiment(self.experiment_name)

          y_test_label = self.items[0][4]
-         multi_modal_runner = MultiModalRunner(library=self.library, multi_modal = self.multi_modal)
+         multi_modal_runner = MultiModalRunner(
+             library=self.library, multi_modal=self.multi_modal)
          # multi_modal_runner = MultiModalRunner(
          #     library=self.library, model_func=model_func, batch_size=batch_size, epochs=epochs)
          # multi_modal = TFMultiModal(
          #     model_func=model_func, batch_size=batch_size, epochs=epochs) # 100
          multi_modal_runner.set_data(
              self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
-         result = multi_modal_runner.predict(self.combinations)
+         combinations = self.combinations if self.combinations is not None else []
+         result = multi_modal_runner.predict(combinations)
          return result
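
Net effect of the run() changes: mlflow setup is skipped entirely when use_mlflow is False, and a tracked run now fails fast if tracking_uri is unset. A hedged usage sketch, reusing the imports from the sketch above (embedding and dataset configuration elided; values illustrative):

    # Tracked run: run() raises if tracking_uri is None while use_mlflow=True.
    tracked = Pipeline(
        experiment_name="ddi-exp",
        experiment_description="tracked run",
        dataset_type=DDIMDLDataset,
        tracking_uri="http://localhost:5000",
        experiment_tags={"release": "0.0.150"},  # applied only when not None
    )
    result = tracked.build().run()

    # Untracked run: pydantic's model_copy gives a variant with mlflow disabled.
    untracked = tracked.model_copy(update={"use_mlflow": False, "tracking_uri": None})
    result = untracked.build().run()
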
@@ -1,4 +1,3 @@
- from compress_json import compress, decompress
  import json
  import sys

@@ -9,17 +8,4 @@ def minify(folder, file_name):
      json_string = json.dumps(json_data, separators=(',', ":"))  # Compact JSON structure
      file_name = str(file_name).replace(".json", "")  # remove .json from end of file_name string
      new_file_name = folder+"/{0}_minify.json".format(file_name)
-     open(new_file_name, "w+", 1).write(json_string)  # open and write json_string to file
-
- json_file = f'C:\\Users\\kivanc\\Downloads\\metrics.json'
- data_file = f'C:\\Users\\kivanc\\Downloads\\data.json'
-
- minify('C:\\Users\\kivanc\\Downloads','metrics.json')
-
- # with open(json_file, 'r', encoding="utf8") as f:
- #     data = json.load(f)
-
- # compressed = compress(data)  # the result is a list (array)
-
- # with open(data_file, "w") as fd:
- #     fd.write(json.dumps(compressed))  # convert into string if needed
+     open(new_file_name, "w+", 1).write(json_string)  # open and write json_string to file
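
This change drops the unused compress_json import and the module-level scratch code (hard-coded C:\Users\kivanc paths) that executed on import; only the minify helper remains. Assuming this is the file that survives as ddi_fw/utils/json_helper.py in 0.0.150, usage mirrors the removed call, with a hypothetical folder:

    # Writes <folder>/metrics_minify.json alongside the original file.
    minify("./data", "metrics.json")
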
ddi_fw-0.0.150.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: ddi_fw
- Version: 0.0.149
+ Version: 0.0.150
  Summary: Do not use :)
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -22,6 +22,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
+ Requires-Dist: pydantic==2.10.6
  Requires-Dist: importlib-resources==6.4.5
  Requires-Dist: python-stopwatch==1.1.11
  Requires-Dist: lxml==5.3.0
@@ -35,7 +36,7 @@ Requires-Dist: rdkit==2023.3.3
  Requires-Dist: scikit-learn==1.5.2
  Requires-Dist: scipy==1.13.1
  Requires-Dist: accelerate>=0.33.0
- Requires-Dist: sentence-transformers>=3.0.1
+ Requires-Dist: sentence-transformers<=3.3.1,>=3.0.1
  Requires-Dist: transformers>=4.42.4
  Requires-Dist: stanza==1.9.2
  Requires-Dist: tokenizers>=0.19.1
@@ -49,3 +50,5 @@ Requires-Dist: chromadb>=0.5.15
  Requires-Dist: langchain_community==0.3.3
  Requires-Dist: datasets==3.0.2
  Requires-Dist: unstructured==0.16.3
+ Requires-Dist: tensorflow<2.18.0,>=2.17.0
+ Requires-Dist: tf-keras==2.17.0
ddi_fw-0.0.150.dist-info/RECORD CHANGED
@@ -1,11 +1,13 @@
- ddi_fw/datasets/__init__.py,sha256=M74mFusbhvnoJw_F9Vljhtum5JYRu_rWe50zBeorVYQ,399
- ddi_fw/datasets/core.py,sha256=z-H-UWIwLjnJVC7QwZQIK4tC8v00XJtjSeA1cYssxAI,17110
+ ddi_fw/datasets/__init__.py,sha256=yDsRQD_9Ijpm_Rl2wSDwdutG5Q_wca_UBPEvm7nBx04,444
+ ddi_fw/datasets/core.py,sha256=JA6WJz3VCUfxI85rYE7ZBqC4pnn7L8NSS9-EgjLw710,7968
+ ddi_fw/datasets/dataset_splitter.py,sha256=lLIelXv-8rCK0tbwLNgHBHYUO_65HT-_kErAlZhRQVE,1662
  ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
- ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
+ ddi_fw/datasets/embedding_generator.py,sha256=jiDKwLaPMaQkloxQkuCrhl-A-2OdvocmkSzjWtUnk4g,2255
  ddi_fw/datasets/feature_vector_generation.py,sha256=gvjpEzkgVV8dp4V8NMMv59u0v-1tNAJ7v83R-keWGoA,4748
  ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
  ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
- ddi_fw/datasets/ddi_mdl/base.py,sha256=ynM99u8rx82F0dzlkEXcPGsHHOXEvIbPxiZ9GCi-8wo,6165
+ ddi_fw/datasets/ddi_mdl/base.py,sha256=usw3AhBCjdYwZx9MMnyNaUYTEyYXoRSO4fNJJHxnPuk,9312
+ ddi_fw/datasets/ddi_mdl/debug.log,sha256=eWz05j8RFqZuHFDTCF7Rck5w4rvtTanFN21iZsgxO7Y,115
  ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
  ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
  ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt,sha256=XVlDqYATckrQwNSXqMSKVBqyoN_Hg8SK6CL-XMdLADY,102176
@@ -70,43 +72,32 @@ ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9P
  ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
  ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
  ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
- ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
+ ddi_fw/langchain/embeddings.py,sha256=XzIYgmqnAO93pnavKRDhYDoz0RhDn-RoC7CDc0yAvbM,7572
  ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
  ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
  ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
- ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
- ddi_fw/ml/ml_helper.py,sha256=xSEa_UNpaFyrPswlQcDfZSI2x5nZLStOiKoP54SYkCM,6454
- ddi_fw/ml/model_wrapper.py,sha256=kc01_TVJuriUvNI6ABnLngnJWvmG_Y7-XJ6XMusLJ8U,1088
- ddi_fw/ml/pytorch_wrapper.py,sha256=AkG-2sKDXr0IBhgmkbjG0i20OuwQv3mhdvqp6UvJDCA,3716
- ddi_fw/ml/tensorflow_wrapper.py,sha256=DkW3aVWsPrzA87eGz5XTkiPBRb-Sb-z4tvOUcAZc2r0,6396
+ ddi_fw/ml/evaluation_helper.py,sha256=JFATMquaQVa2gckxmEivCztZmivWBAAP7EpJ8PVeI3c,7626
+ ddi_fw/ml/ml_helper.py,sha256=E6ef7f1UnQl6JBUdGDbbbI4FIS-904VGypT7tI0a598,8545
+ ddi_fw/ml/model_wrapper.py,sha256=kabPXuo7S8tGkp9a00V04n4rXDmv7dD8wYGMjotISRc,1050
+ ddi_fw/ml/pytorch_wrapper.py,sha256=pe6UsjP2XeTgLxDnIUiodoyhJTGCxV27wD4Cjxysu2Q,8553
+ ddi_fw/ml/tensorflow_wrapper.py,sha256=jt6h9Q-wF0mkbnvV6yCCl1SpUd2paHK70Bu6EFrkmd0,10112
  ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
  ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6859
- ddi_fw/ner/ner.py,sha256=BEs9AFljAxOQrC2BEP1raSzRoypcfELS5UTdl4bjTqw,15863
+ ddi_fw/ner/ner.py,sha256=FHyyX53Xwpdw8Hec261dyN88yD7Z9LmJua2mIrQLguI,17967
  ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
- ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=qIst7vxHaOAhRv4lgozszwa3b1QE4aIrN74t41Xnvr4,1637
- ddi_fw/pipeline/multi_pipeline.py,sha256=G8ONZdfwjGZRI2PrzMOaET6w5AUcmgYzMtaV6j5Hbz0,5981
- ddi_fw/pipeline/ner_pipeline.py,sha256=wB7hz4YCOv7UAz6bGE6sSpPXXIdoOflOVK5UCc1fO-o,5586
- ddi_fw/pipeline/pipeline.py,sha256=-1zGbSJapmUSx9xltJLQajmUCeZdT-9Ow0cC6JZ92y0,8984
- ddi_fw/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ddi_fw/test/basic_test.py,sha256=fEOGcZm1ObnsDvMiXNmdmz6YCeUrGc8V0DwlSwGhsq8,376
- ddi_fw/test/combination_test.py,sha256=TWNE8sf-DSh1Q9-yRaRBc774Sn1kSMGXLwQhd2_Qynk,324
- ddi_fw/test/compress_json_test.py,sha256=BGny56YqiG-pzhMoDzLKQBQI1E7o3jU0S7VYWtclAx4,1045
- ddi_fw/test/date_test.py,sha256=QmJ97ennS9LxLl8mGBkM2ob8_KWEFmiLakZTI9zQxxo,532
- ddi_fw/test/idf_score.py,sha256=YsAur-F1T3eFxn9KrcK3VXCvrsV_LXrpHxPjMKZeQZ8,1523
- ddi_fw/test/jaccard_similarity.py,sha256=pf6SNI52RCUZ0otx_1cz7A0p7kyfoCZv13Tbc_rxfuw,2382
- ddi_fw/test/mlfow_test.py,sha256=L2hJAeIU5PDSxsyWTtV6PY0bfaWerWUJ1buni9BTjXo,4853
- ddi_fw/test/sklearn-tfidf.py,sha256=cjtg27vLskEMXgrsqUR_EapRGVd4xgwOQ9zYsu72zjs,657
- ddi_fw/test/test.py,sha256=zJh9ZBcZl-vZIFDvuftcRrRV8WAwtiFVhPPd6Et4OU4,2997
- ddi_fw/test/torch_cuda_test.py,sha256=R-4VGVErl_Ufk54DoZbgL_YXWoCYFyanIVWd6P39IEk,312
- ddi_fw/test/type_guarding_test.py,sha256=KxjyBxohDu7lwpejalCj-REjtJ-k1S1wQbOB6TGY0O8,766
+ ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
+ ddi_fw/pipeline/multi_pipeline.py,sha256=D_BZ3ciHbVGuuB7m7cEmVQHESruh1gqhA-vxCMfNKj0,5407
+ ddi_fw/pipeline/ner_pipeline.py,sha256=q1aKjb54Ra1HzZ7dARvBw6lB37je9R-POEf2h6QT_nU,6018
+ ddi_fw/pipeline/pipeline.py,sha256=Xp5_cPj0SZ6b1lRWepwKCHoCbhEnzSZexm56CtvO_4Y,11073
  ddi_fw/utils/__init__.py,sha256=77563ikqAtdzjjgRlLp5OAsJBbpLA1Cao8iecGaVUXQ,354
  ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
+ ddi_fw/utils/json_helper.py,sha256=BVU6wmJgdXPxyqLPu3Ck_9Es5RrP1PDanKvE-OSj1D4,571
  ddi_fw/utils/kaggle.py,sha256=wKRJ18KpQ6P-CubpZklEgsDtyFpR9RUL1_HyyF6ttEE,2425
  ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5I,1056
  ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
  ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
- ddi_fw-0.0.149.dist-info/METADATA,sha256=ujgYzxc29yv5J7ltzxgEN1MSrTV5SkN3NNBseekoLEA,1965
- ddi_fw-0.0.149.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- ddi_fw-0.0.149.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
- ddi_fw-0.0.149.dist-info/RECORD,,
+ ddi_fw-0.0.150.dist-info/METADATA,sha256=QGoZpcrDypCUbyMgSXEe2vdWBeYmLG5gSw6qnyWKQLc,2082
+ ddi_fw-0.0.150.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+ ddi_fw-0.0.150.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ ddi_fw-0.0.150.dist-info/RECORD,,
ddi_fw-0.0.150.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.6.0)
+ Generator: setuptools (78.0.2)
  Root-Is-Purelib: true
  Tag: py3-none-any

ddi_fw/test/__init__.py DELETED
File without changes
ddi_fw/test/basic_test.py DELETED
@@ -1,15 +0,0 @@
- import json
-
-
- class Metrics():
-     def __init__(self, precision, recall, roc_aupr, roc_auc):
-         self.precision = precision
-         self.recall = recall
-         self.roc_aupr = roc_aupr
-         self.roc_auc = roc_auc
-
-
- m = Metrics( 0.96, 0.96, {"micro": 0.99, "macro": 0.88}, {"micro": 0.99, "macro": 0.88})
-
- as_json = json.dumps(m.__dict__)
- print(as_json)
ddi_fw/test/combination_test.py DELETED
@@ -1,12 +0,0 @@
- import itertools
-
- l = ['e1','e2','e3','e4','e5']
- all_combinations = []
- for i in range(2, len(l) + 1):
-     all_combinations.extend(list(itertools.combinations(l, i)))
-
- print(all_combinations)
-
- for combination in all_combinations:
-     combination_descriptor = '-'.join(combination)
-     print(combination_descriptor)
ddi_fw/test/date_test.py DELETED
@@ -1,15 +0,0 @@
- from datetime import datetime, timezone
-
- local_datetime = datetime.now()
- utc_datetime = datetime.now(timezone.utc)
-
- local_iso_str = datetime.strftime(local_datetime, "%Y-%m-%dT%H:%M:%S.%f")[:-3]
- utc_iso_str = datetime.strftime(utc_datetime, "%Y-%m-%dT%H:%M:%S.%f")[:-3]
-
- print(f"local dt: {local_iso_str}, tzname: {local_datetime.tzname()}")
- print(f" utc dt: {utc_iso_str}, tzname: {utc_datetime.tzname()}")
-
- print("\n")
-
- print(f"local dt: {local_datetime.isoformat()}")
- print(f" utc dt: {utc_datetime.isoformat()}")
ddi_fw/test/idf_score.py DELETED
@@ -1,54 +0,0 @@
- import pandas as pd
- import numpy as np
-
- # from ddi_fw.datasets.feature_vector_generation import find_distinct_elements
- def find_distinct_elements(frame):
-     y = set()
-     for x in frame:
-         if x is not None:
-             for k in x:
-                 y.add(k)
-     return y
-
- def calculate_idf(series):
-     idf_scores = {}
-     distinct_items = find_distinct_elements(series)
-     sorted_distinct_items = sorted(distinct_items)
-     total_document_number = len(all_data)
-     for item in sorted_distinct_items:
-         document_freq = series.map(set([item]).issubset).sum()
-         idf = np.log(total_document_number/document_freq)
-         idf_scores[item] = idf
-     return idf_scores
-
-
- item1 = 'T001|T002|T001|T001'
- item2 = 'T002|T003'
- item3 = 'T004|T005'
-
-
- all_data = [item1, item2, item3]
-
- df = pd.DataFrame(all_data, columns=['tui_description'])
-
- df['tui_description'] = df['tui_description'].apply(
-     lambda x: x.split('|') if x is not None else [])
-
- print(df.head())
-
- idf_scores = calculate_idf(df['tui_description'])
- idf_scores_sorted_desc = sorted(idf_scores.items(), key=lambda x:x[1], reverse=True)
- threshold = 1
- keys_over_threshold = [k for k,v in idf_scores.items() if v > threshold]
-
- print(idf_scores_sorted_desc)
- print(keys_over_threshold)
-
-
- def remove_items_by_idf_score(items):
-     return [item for item in items if item in keys_over_threshold]
-
- df['tui_description'] = df['tui_description'].apply(
-     remove_items_by_idf_score)
-
- print(df)
ddi_fw/test/jaccard_similarity.py DELETED
@@ -1,85 +0,0 @@
- import pandas as pd
-
- # data = {'A': [1, 1, 1, 0, 0],
- #         'B': [0, 1, 1, 1, 0],
- #         'C': [0, 0, 1, 1, 1]}
-
- # df = pd.DataFrame(data)
-
-
- # from scipy.spatial.distance import pdist, squareform
-
- # jaccard_dist = pdist(df.values, metric='jaccard')
- # jaccard_dist_matrix = squareform(jaccard_dist)
-
- # print(jaccard_dist_matrix)
-
-
- # import pandas as pd
- # from scipy.spatial.distance import euclidean, pdist, squareform
-
-
- # def similarity_func(u, v):
- #     return 1/(1+euclidean(u,v))
-
- # DF_var = pd.DataFrame.from_dict({"s1":[1.2,3.4,10.2],"s2":[1.4,3.1,10.7],"s3":[2.1,3.7,11.3],"s4":[1.5,3.2,10.9]})
- # DF_var.index = ["g1","g2","g3"]
-
- # dists = pdist(DF_var, similarity_func)
- # DF_euclid = pd.DataFrame(squareform(dists), columns=DF_var.index, index=DF_var.index)
-
- # print(DF_euclid)
-
-
- from sklearn.metrics import jaccard_score
- import seaborn as sns
- import matplotlib.pyplot as plt
-
- data = [[0, 1, 0], [0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1]]
-
- similarity_matrix = []
- for i in range(len(data)):
-     row = []
-     for j in range(len(data)):
-         row.append(jaccard_score(data[i], data[j]))
-     similarity_matrix.append(row)
-
- sns.heatmap(pd.DataFrame(similarity_matrix), annot=True, cmap="YlGnBu")
- plt.show()
-
-
- # https://stackoverflow.com/questions/35639571/python-pandas-distance-matrix-using-jaccard-similarity
- import pandas as pd
- entries = [
-     {'id':'1', 'category1':'100', 'category2': '0', 'category3':'100'},
-     {'id':'2', 'category1':'100', 'category2': '0', 'category3':'100'},
-     {'id':'3', 'category1':'0', 'category2': '100', 'category3':'100'},
-     {'id':'4', 'category1':'100', 'category2': '100', 'category3':'100'},
-     {'id':'5', 'category1':'100', 'category2': '0', 'category3':'100'}
- ]
- df = pd.DataFrame(entries)
-
- from scipy.spatial.distance import squareform
- from scipy.spatial.distance import pdist, jaccard
-
- res = 1 - pdist(df[['category1','category2','category3']], 'jaccard')
- # squareform(res)
- distance = pd.DataFrame(squareform(res), index=df.index, columns= df.index)
- print(distance)
-
- entries2 = [
-     {'id':'1', 'cat':['p1','p2','p3']},
-     {'id':'2', 'cat':['p3','p4','p5']},
-     {'id':'3', 'cat':['p5','p6','p7']},
- ]
- df2 = pd.DataFrame(entries2)
-
- c = df2['cat']
-
- y = set()
-
- for x in c:
-     for k in x:
-         y.add(k)
-
- print(y)