ddi-fw 0.0.149__py3-none-any.whl → 0.0.151__py3-none-any.whl
This diff shows the changes between two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- ddi_fw/datasets/__init__.py +1 -1
- ddi_fw/datasets/core.py +147 -341
- ddi_fw/datasets/dataset_splitter.py +39 -0
- ddi_fw/datasets/ddi_mdl/base.py +194 -130
- ddi_fw/datasets/ddi_mdl/debug.log +1 -0
- ddi_fw/datasets/embedding_generator.py +2 -1
- ddi_fw/langchain/embeddings.py +1 -0
- ddi_fw/ml/evaluation_helper.py +47 -178
- ddi_fw/ml/ml_helper.py +125 -81
- ddi_fw/ml/model_wrapper.py +2 -2
- ddi_fw/ml/pytorch_wrapper.py +175 -72
- ddi_fw/ml/tensorflow_wrapper.py +131 -39
- ddi_fw/ner/ner.py +93 -39
- ddi_fw/pipeline/multi_modal_combination_strategy.py +4 -2
- ddi_fw/pipeline/multi_pipeline.py +2 -15
- ddi_fw/pipeline/ner_pipeline.py +15 -6
- ddi_fw/pipeline/pipeline.py +157 -93
- ddi_fw/{test/compress_json_test.py → utils/json_helper.py} +1 -15
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/METADATA +6 -3
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/RECORD +22 -31
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/WHEEL +1 -1
- ddi_fw/test/__init__.py +0 -0
- ddi_fw/test/basic_test.py +0 -15
- ddi_fw/test/combination_test.py +0 -12
- ddi_fw/test/date_test.py +0 -15
- ddi_fw/test/idf_score.py +0 -54
- ddi_fw/test/jaccard_similarity.py +0 -85
- ddi_fw/test/mlfow_test.py +0 -165
- ddi_fw/test/sklearn-tfidf.py +0 -16
- ddi_fw/test/test.py +0 -93
- ddi_fw/test/torch_cuda_test.py +0 -9
- ddi_fw/test/type_guarding_test.py +0 -18
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/top_level.txt +0 -0
ddi_fw/pipeline/pipeline.py
CHANGED
@@ -1,7 +1,12 @@
+from typing import Any, Dict, List, Optional, Type, Union
 import numpy as np
 import pandas as pd
 import chromadb
 from collections import defaultdict
+from chromadb.api.types import IncludeEnum
+
+from pydantic import BaseModel
+from ddi_fw.datasets.core import TextDatasetMixin
 from ddi_fw.ner.ner import CTakesNER
 from ddi_fw.langchain.embeddings import PoolingStrategy
 from ddi_fw.datasets import BaseDataset, DDIMDLDataset
@@ -10,44 +15,83 @@ import mlflow
 from ddi_fw.ml import MultiModalRunner
 
 
-class Pipeline:
-    def __init__(self,
-                 library='tensorflow',
-                 experiment_name=None,
-                 experiment_description=None,
-                 experiment_tags=None,
-                 artifact_location=None,
-                 tracking_uri=None,
-                 dataset_type: BaseDataset = None,
-                 columns=None,
-                 embedding_dict=None,
-                 column_embedding_configs=None,
-                 vector_db_persist_directory=None,
-                 vector_db_collection_name=None,
-                 embedding_pooling_strategy_type: PoolingStrategy = None,
-                 ner_data_file=None,
-                 ner_threshold=None,
-                 combinations=None,
-                 model=None,
-                 multi_modal = None ):
-        self.library = library
-        self.experiment_name = experiment_name
-        self.experiment_description = experiment_description
-        self.experiment_tags = experiment_tags
-        self.artifact_location = artifact_location
-        self.tracking_uri = tracking_uri
-        self.dataset_type = dataset_type
-        self.columns = columns
-        self.embedding_dict = embedding_dict
-        self.column_embedding_configs = column_embedding_configs
-        self.vector_db_persist_directory = vector_db_persist_directory
-        self.vector_db_collection_name = vector_db_collection_name
-        self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
-        self.ner_data_file = ner_data_file
-        self.ner_threshold = ner_threshold
-        self.combinations = combinations
-        self.model = model
-        self.multi_modal = multi_modal
+class Pipeline(BaseModel):
+    library: str = 'tensorflow'
+    experiment_name: str
+    experiment_description: str
+    experiment_tags: Optional[Dict[str, Any]] = None
+    artifact_location: Optional[str] = None
+    tracking_uri: Optional[str] = None
+    dataset_type: Type[BaseDataset]
+    columns: Optional[List[str]] = None
+    embedding_dict: Optional[Dict[str, Any]] = None
+    column_embedding_configs: Optional[Dict] = None
+    vector_db_persist_directory: Optional[str] = None
+    vector_db_collection_name: Optional[str] = None
+    embedding_pooling_strategy_type: Type[PoolingStrategy] | None = None
+    ner_data_file: Optional[str] = None
+    ner_threshold: Optional[dict] = None
+    combinations: Optional[List[str]] = None
+    model: Optional[Any] = None
+    multi_modal: Optional[Any] = None
+    use_mlflow: bool = True
+    _items: List = []
+    _train_idx_arr: List | None = []
+    _val_idx_arr: List | None = []
+
+    @property
+    def items(self) -> List:
+        return self._items
+
+    @property
+    def train_idx_arr(self) -> List | None:
+        return self._train_idx_arr
+
+    @property
+    def val_idx_arr(self) -> List | None:
+        return self._val_idx_arr
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    # class Pipeline:
+    #     def __init__(self,
+    #                  library='tensorflow',
+    #                  experiment_name=None,
+    #                  experiment_description=None,
+    #                  experiment_tags=None,
+    #                  artifact_location=None,
+    #                  tracking_uri=None,
+    #                  dataset_type: BaseDataset = None,
+    #                  columns=None,
+    #                  embedding_dict=None,
+    #                  column_embedding_configs=None,
+    #                  vector_db_persist_directory=None,
+    #                  vector_db_collection_name=None,
+    #                  embedding_pooling_strategy_type: PoolingStrategy = None,
+    #                  ner_data_file=None,
+    #                  ner_threshold=None,
+    #                  combinations=None,
+    #                  model=None,
+    #                  multi_modal = None ):
+    #         self.library = library
+    #         self.experiment_name = experiment_name
+    #         self.experiment_description = experiment_description
+    #         self.experiment_tags = experiment_tags
+    #         self.artifact_location = artifact_location
+    #         self.tracking_uri = tracking_uri
+    #         self.dataset_type = dataset_type
+    #         self.columns = columns
+    #         self.embedding_dict = embedding_dict
+    #         self.column_embedding_configs = column_embedding_configs
+    #         self.vector_db_persist_directory = vector_db_persist_directory
+    #         self.vector_db_collection_name = vector_db_collection_name
+    #         self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
+    #         self.ner_data_file = ner_data_file
+    #         self.ner_threshold = ner_threshold
+    #         self.combinations = combinations
+    #         self.model = model
+    #         self.multi_modal = multi_modal
 
     def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
         """
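The headline change above is that `Pipeline` stops being a hand-rolled `__init__` class and becomes a pydantic `BaseModel`: constructor arguments turn into typed, validated fields, runtime state moves into underscore-prefixed private attributes, and read-only properties expose that state. A minimal sketch of the same pattern, with illustrative names rather than the package's real API:

```python
# Minimal sketch of the pydantic-v2 pattern adopted above; class and
# field names are illustrative, not ddi_fw's actual API.
from typing import Any, Dict, List, Optional

from pydantic import BaseModel


class SketchPipeline(BaseModel):
    library: str = "tensorflow"               # field with a default
    experiment_name: str                      # required; checked at construction
    experiment_tags: Optional[Dict[str, Any]] = None
    _items: List = []                         # leading underscore -> private attribute

    class Config:                             # legacy config style, as in the diff
        arbitrary_types_allowed = True        # permit non-pydantic field types

    @property
    def items(self) -> List:
        # read-only view over internal state, mirroring Pipeline.items
        return self._items


p = SketchPipeline(experiment_name="demo")
print(p.library, p.items)  # -> tensorflow []
# SketchPipeline() with no experiment_name would raise a ValidationError.
```

The practical gain is fail-fast validation of the pipeline configuration; `arbitrary_types_allowed` is needed because fields such as `dataset_type` and `model` hold non-pydantic types.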
@@ -64,29 +108,50 @@
             vector_db = chromadb.PersistentClient(
                 path=vector_db_persist_directory)
             collection = vector_db.get_collection(vector_db_collection_name)
-
+            include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
+            dictionary: chromadb.GetResult
             # Fetch the embeddings and metadata
             if column == None:
                 dictionary = collection.get(
-                    include=
+                    include=include
+                    # include=['embeddings', 'metadatas']
+                )
                 print(
                     f"Embeddings are calculated from {vector_db_collection_name}")
             else:
-                dictionary = collection.get(
-
+                dictionary = collection.get(
+                    include=include,
+                    # include=['embeddings', 'metadatas'],
+                    where={
+                        "type": {"$eq": f"{column}"}})
                 print(
                     f"Embeddings of {column} are calculated from {vector_db_collection_name}")
+
+            # if metadatas == None or embeddings == None:
+            if 'embeddings' not in dictionary or 'metadatas' not in dictionary or not dictionary['embeddings'] or not dictionary['metadatas']:
+                raise ValueError(
+                    "The collection does not contain embeddings or metadatas.")
+
             # Populate the embedding dictionary with embeddings from the vector database
-
+            metadatas = dictionary["metadatas"]
+            embeddings = dictionary["embeddings"]
+
+            for metadata, embedding in zip(metadatas, embeddings):
                 embedding_dict[metadata["type"]
                                ][metadata["id"]].append(embedding)
 
-            # return dictionary['embeddings'].shape[1]
         else:
             raise ValueError(
                 "Persistent directory for the vector DB is not specified.")
 
     def build(self):
+        if self.embedding_pooling_strategy_type is not None and not isinstance(self.embedding_pooling_strategy_type, type):
+            raise TypeError(
+                "self.embedding_pooling_strategy_type must be a class, not an instance")
+        if not isinstance(self.dataset_type, type):
+            raise TypeError(
+                "self.dataset_type must be a class, not an instance")
+
         # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
         kwargs = {"columns": self.columns}
         if self.ner_threshold:
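The retrieval code above now asks Chroma explicitly for embeddings and metadata via `IncludeEnum`, filters on the `type` metadata field when a column is given, and fails fast if the collection comes back empty. A hedged sketch of that access pattern; the path, collection name, and `"smile"` filter value are placeholders:

```python
# Sketch of the ChromaDB access pattern used above; the path, collection
# name and the "smile" filter value are assumptions for illustration.
from collections import defaultdict

import chromadb
from chromadb.api.types import IncludeEnum

client = chromadb.PersistentClient(path="./chroma")           # placeholder path
collection = client.get_or_create_collection("embeddings")    # placeholder name

include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
result = collection.get(
    include=include,
    where={"type": {"$eq": "smile"}},  # same filter shape as the diff
)

# Chroma returns empty results rather than raising when nothing matches,
# hence the explicit guard the diff adds.
if not result["embeddings"] or not result["metadatas"]:
    raise ValueError("The collection does not contain embeddings or metadatas.")

# Group embeddings by (type, id), as __create_or_update_embeddings__ does.
embedding_dict = defaultdict(lambda: defaultdict(list))
for metadata, embedding in zip(result["metadatas"], result["embeddings"]):
    embedding_dict[metadata["type"]][metadata["id"]].append(embedding)
```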
@@ -103,90 +168,89 @@ class Pipeline:
                 col_db_collection = item["vector_db_collection_name"]
                 self.__create_or_update_embeddings__(
                     embedding_dict, col_db_dir, col_db_collection, col)
-
+
         elif self.vector_db_persist_directory:
             self.__create_or_update_embeddings__(
                 embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
-
+
         else:
             print(
                 f"There is no configuration of Embeddings")
 
-        # if self.embedding_dict == None:
-        #     if self.vector_db_persist_directory:
-        #         self.vector_db = chromadb.PersistentClient(
-        #             path=self.vector_db_persist_directory)
-        #         self.collection = self.vector_db.get_collection(
-        #             self.vector_db_collection_name)
-        #         dictionary = self.collection.get(
-        #             include=['embeddings', 'metadatas'])
-
-        #         embedding_dict = defaultdict(lambda: defaultdict(list))
-
-        #         for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
-        #             embedding_dict[metadata["type"]
-        #                            ][metadata["id"]].append(embedding)
-
-        #         embedding_size = dictionary['embeddings'].shape[1]
         else:
             embedding_dict = self.embedding_dict
             # TODO make generic
             # embedding_size = list(embedding_dict['all_text'].values())[
             # 0][0].shape
-        key, value = next(iter(embedding_dict.items()))
-        embedding_size = value[next(iter(value))][0].shape[0]
-        pooling_strategy = self.embedding_pooling_strategy_type()
 
-        self.ner_df = CTakesNER().load(
-
+        # self.ner_df = CTakesNER(df=None).load(
+        #     filename=self.ner_data_file) if self.ner_data_file else None
+
+        if issubclass(self.dataset_type, TextDatasetMixin):
+            key, value = next(iter(embedding_dict.items()))
+            embedding_size = value[next(iter(value))][0].shape[0]
+            pooling_strategy = self.embedding_pooling_strategy_type(
+            ) if self.embedding_pooling_strategy_type else None
+
+            dataset = self.dataset_type(
+                embedding_dict=embedding_dict,
+                embedding_size=embedding_size,
+                embeddings_pooling_strategy=pooling_strategy,
+                **kwargs)
+        else:
+            dataset = self.dataset_type(**kwargs)
+
+        X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = dataset.load()
 
-
-            embedding_dict=embedding_dict,
-            embedding_size=embedding_size,
-            embeddings_pooling_strategy=pooling_strategy,
-            ner_df=self.ner_df, **kwargs)
+        dataframe = dataset.dataframe
 
-
+        if dataframe is None:  # if the dataframe is None, it means that the dataset is not loaded
+            raise ValueError("The dataset is not loaded")
 
-        self.dataframe = self.dataset.dataframe
         # dataframe.dropna()
-
-
-
-
-        self.
-        self.
+        X_train = dataset.X_train
+        X_test = dataset.X_test
+        y_train = dataset.y_train
+        y_test = dataset.y_test
+        self._train_idx_arr = dataset.train_idx_arr
+        self._val_idx_arr = dataset.val_idx_arr
         # Logic to set up the experiment
         # column name, train data, train label, test data, test label
-        self.
+        self._items = dataset.produce_inputs()
 
-        unique_classes = pd.unique(
-        event_num = len(unique_classes)
+        # unique_classes = pd.unique(dataframe[dataset.class_column])
+        # event_num = len(unique_classes)
         # droprate = 0.3
-        vector_size = self.dataset.drugs_df.shape[0]
+        # vector_size = self.dataset.drugs_df.shape[0]
 
         print("Building the experiment with the following settings:")
         print(
-            f"Name: {self.experiment_name}, Dataset: {
+            f"Name: {self.experiment_name}, Dataset: {dataset}, Model: {self.model}")
         # Implement additional build logic as needed
         return self
 
     def run(self):
-
+        if self.use_mlflow:
+            if self.tracking_uri is None:
+                raise ValueError("Tracking uri should be specified")
+            mlflow.set_tracking_uri(self.tracking_uri)
 
-
-
-
-
-
+            if mlflow.get_experiment_by_name(self.experiment_name) == None:
+                mlflow.create_experiment(
+                    self.experiment_name, self.artifact_location)
+                if self.experiment_tags is not None:
+                    mlflow.set_experiment_tags(self.experiment_tags)
+            mlflow.set_experiment(self.experiment_name)
 
         y_test_label = self.items[0][4]
-        multi_modal_runner = MultiModalRunner(
+        multi_modal_runner = MultiModalRunner(
+            library=self.library, multi_modal=self.multi_modal)
         # multi_modal_runner = MultiModalRunner(
         #     library=self.library, model_func=model_func, batch_size=batch_size, epochs=epochs)
         # multi_modal = TFMultiModal(
         #     model_func=model_func, batch_size=batch_size, epochs=epochs) # 100
         multi_modal_runner.set_data(
             self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
-
+        combinations = self.combinations if self.combinations is not None else []
+        result = multi_modal_runner.predict(combinations)
         return result
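Two behavioural changes stand out in this hunk: `build()` now dispatches on the dataset class itself (embedding-related arguments are only passed when `dataset_type` subclasses `TextDatasetMixin`), and `run()` gates all tracking behind the new `use_mlflow` flag, bootstrapping the experiment idempotently. The MLflow bootstrap in isolation, with placeholder URI and names and a reachable tracking server assumed:

```python
# Standalone sketch of the MLflow bootstrap in run(); the URI and
# experiment name are placeholders, and a tracking server is assumed.
import mlflow

tracking_uri = "http://localhost:5000"
experiment_name = "ddi-fw-demo"

mlflow.set_tracking_uri(tracking_uri)

# Create-if-missing, then activate: safe to call on every run.
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)

with mlflow.start_run():
    mlflow.log_metric("accuracy", 0.9)  # placeholder metric
```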
ddi_fw/test/compress_json_test.py → ddi_fw/utils/json_helper.py
RENAMED
@@ -1,4 +1,3 @@
-from compress_json import compress, decompress
 import json
 import sys
 
@@ -9,17 +8,4 @@ def minify(folder, file_name):
     json_string = json.dumps(json_data, separators=(',', ":")) # Compact JSON structure
     file_name = str(file_name).replace(".json", "") # remove .json from end of file_name string
     new_file_name = folder+"/{0}_minify.json".format(file_name)
-    open(new_file_name, "w+", 1).write(json_string) # open and write json_string to file
-
-json_file = f'C:\\Users\\kivanc\\Downloads\\metrics.json'
-data_file = f'C:\\Users\\kivanc\\Downloads\\data.json'
-
-minify('C:\\Users\\kivanc\\Downloads','metrics.json')
-
-# with open(json_file, 'r', encoding="utf8") as f:
-#     data = json.load(f)
-
-# compressed = compress(data) # the result is a list (array)
-
-# with open(data_file, "w") as fd:
-#     fd.write(json.dumps(compressed)) # convert into string if needed
+    open(new_file_name, "w+", 1).write(json_string) # open and write json_string to file
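After the rename, only the reusable `minify(folder, file_name)` helper survives; the hard-coded `C:\Users\kivanc\Downloads` scratch calls at the bottom of the old test file are dropped. A usage sketch with illustrative paths:

```python
# Usage sketch for the relocated helper; the ./data paths are illustrative.
from ddi_fw.utils.json_helper import minify

# Reads ./data/metrics.json and writes ./data/metrics_minify.json using
# compact separators (',', ':'), i.e. no insignificant whitespace.
minify("./data", "metrics.json")
```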
{ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.149
+Version: 0.0.151
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -22,6 +22,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
+Requires-Dist: pydantic==2.10.6
 Requires-Dist: importlib-resources==6.4.5
 Requires-Dist: python-stopwatch==1.1.11
 Requires-Dist: lxml==5.3.0
@@ -35,7 +36,7 @@ Requires-Dist: rdkit==2023.3.3
 Requires-Dist: scikit-learn==1.5.2
 Requires-Dist: scipy==1.13.1
 Requires-Dist: accelerate>=0.33.0
-Requires-Dist: sentence-transformers
+Requires-Dist: sentence-transformers<=3.3.1,>=3.0.1
 Requires-Dist: transformers>=4.42.4
 Requires-Dist: stanza==1.9.2
 Requires-Dist: tokenizers>=0.19.1
@@ -49,3 +50,5 @@ Requires-Dist: chromadb>=0.5.15
 Requires-Dist: langchain_community==0.3.3
 Requires-Dist: datasets==3.0.2
 Requires-Dist: unstructured==0.16.3
+Requires-Dist: tensorflow<2.18.0,>=2.17.0
+Requires-Dist: tf-keras==2.17.0
{ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/RECORD
CHANGED
@@ -1,11 +1,13 @@
-ddi_fw/datasets/__init__.py,sha256=
-ddi_fw/datasets/core.py,sha256=
+ddi_fw/datasets/__init__.py,sha256=yDsRQD_9Ijpm_Rl2wSDwdutG5Q_wca_UBPEvm7nBx04,444
+ddi_fw/datasets/core.py,sha256=JA6WJz3VCUfxI85rYE7ZBqC4pnn7L8NSS9-EgjLw710,7968
+ddi_fw/datasets/dataset_splitter.py,sha256=lLIelXv-8rCK0tbwLNgHBHYUO_65HT-_kErAlZhRQVE,1662
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
-ddi_fw/datasets/embedding_generator.py,sha256=
+ddi_fw/datasets/embedding_generator.py,sha256=jiDKwLaPMaQkloxQkuCrhl-A-2OdvocmkSzjWtUnk4g,2255
 ddi_fw/datasets/feature_vector_generation.py,sha256=gvjpEzkgVV8dp4V8NMMv59u0v-1tNAJ7v83R-keWGoA,4748
 ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
-ddi_fw/datasets/ddi_mdl/base.py,sha256=
+ddi_fw/datasets/ddi_mdl/base.py,sha256=usw3AhBCjdYwZx9MMnyNaUYTEyYXoRSO4fNJJHxnPuk,9312
+ddi_fw/datasets/ddi_mdl/debug.log,sha256=eWz05j8RFqZuHFDTCF7Rck5w4rvtTanFN21iZsgxO7Y,115
 ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
 ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
 ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt,sha256=XVlDqYATckrQwNSXqMSKVBqyoN_Hg8SK6CL-XMdLADY,102176
@@ -70,43 +72,32 @@ ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9P
 ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
 ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
-ddi_fw/langchain/embeddings.py,sha256=
+ddi_fw/langchain/embeddings.py,sha256=XzIYgmqnAO93pnavKRDhYDoz0RhDn-RoC7CDc0yAvbM,7572
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
 ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
-ddi_fw/ml/evaluation_helper.py,sha256=
-ddi_fw/ml/ml_helper.py,sha256=
-ddi_fw/ml/model_wrapper.py,sha256=
-ddi_fw/ml/pytorch_wrapper.py,sha256=
-ddi_fw/ml/tensorflow_wrapper.py,sha256=
+ddi_fw/ml/evaluation_helper.py,sha256=JFATMquaQVa2gckxmEivCztZmivWBAAP7EpJ8PVeI3c,7626
+ddi_fw/ml/ml_helper.py,sha256=E6ef7f1UnQl6JBUdGDbbbI4FIS-904VGypT7tI0a598,8545
+ddi_fw/ml/model_wrapper.py,sha256=kabPXuo7S8tGkp9a00V04n4rXDmv7dD8wYGMjotISRc,1050
+ddi_fw/ml/pytorch_wrapper.py,sha256=pe6UsjP2XeTgLxDnIUiodoyhJTGCxV27wD4Cjxysu2Q,8553
+ddi_fw/ml/tensorflow_wrapper.py,sha256=jt6h9Q-wF0mkbnvV6yCCl1SpUd2paHK70Bu6EFrkmd0,10112
 ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
 ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6859
-ddi_fw/ner/ner.py,sha256=
+ddi_fw/ner/ner.py,sha256=FHyyX53Xwpdw8Hec261dyN88yD7Z9LmJua2mIrQLguI,17967
 ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
-ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=
-ddi_fw/pipeline/multi_pipeline.py,sha256=
-ddi_fw/pipeline/ner_pipeline.py,sha256=
-ddi_fw/pipeline/pipeline.py,sha256
-ddi_fw/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ddi_fw/test/basic_test.py,sha256=fEOGcZm1ObnsDvMiXNmdmz6YCeUrGc8V0DwlSwGhsq8,376
-ddi_fw/test/combination_test.py,sha256=TWNE8sf-DSh1Q9-yRaRBc774Sn1kSMGXLwQhd2_Qynk,324
-ddi_fw/test/compress_json_test.py,sha256=BGny56YqiG-pzhMoDzLKQBQI1E7o3jU0S7VYWtclAx4,1045
-ddi_fw/test/date_test.py,sha256=QmJ97ennS9LxLl8mGBkM2ob8_KWEFmiLakZTI9zQxxo,532
-ddi_fw/test/idf_score.py,sha256=YsAur-F1T3eFxn9KrcK3VXCvrsV_LXrpHxPjMKZeQZ8,1523
-ddi_fw/test/jaccard_similarity.py,sha256=pf6SNI52RCUZ0otx_1cz7A0p7kyfoCZv13Tbc_rxfuw,2382
-ddi_fw/test/mlfow_test.py,sha256=L2hJAeIU5PDSxsyWTtV6PY0bfaWerWUJ1buni9BTjXo,4853
-ddi_fw/test/sklearn-tfidf.py,sha256=cjtg27vLskEMXgrsqUR_EapRGVd4xgwOQ9zYsu72zjs,657
-ddi_fw/test/test.py,sha256=zJh9ZBcZl-vZIFDvuftcRrRV8WAwtiFVhPPd6Et4OU4,2997
-ddi_fw/test/torch_cuda_test.py,sha256=R-4VGVErl_Ufk54DoZbgL_YXWoCYFyanIVWd6P39IEk,312
-ddi_fw/test/type_guarding_test.py,sha256=KxjyBxohDu7lwpejalCj-REjtJ-k1S1wQbOB6TGY0O8,766
+ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
+ddi_fw/pipeline/multi_pipeline.py,sha256=D_BZ3ciHbVGuuB7m7cEmVQHESruh1gqhA-vxCMfNKj0,5407
+ddi_fw/pipeline/ner_pipeline.py,sha256=q1aKjb54Ra1HzZ7dARvBw6lB37je9R-POEf2h6QT_nU,6018
+ddi_fw/pipeline/pipeline.py,sha256=NPew1lESAiuXUKR4Ob9R4LwRh2Xe1qfnqZDfmuMuC7k,11253
 ddi_fw/utils/__init__.py,sha256=77563ikqAtdzjjgRlLp5OAsJBbpLA1Cao8iecGaVUXQ,354
 ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
+ddi_fw/utils/json_helper.py,sha256=BVU6wmJgdXPxyqLPu3Ck_9Es5RrP1PDanKvE-OSj1D4,571
 ddi_fw/utils/kaggle.py,sha256=wKRJ18KpQ6P-CubpZklEgsDtyFpR9RUL1_HyyF6ttEE,2425
 ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5I,1056
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
+ddi_fw-0.0.151.dist-info/METADATA,sha256=cTz-LpUrPhCU0uKQ2A9oE3lm5uaI3ra3nFHufSoi8hA,2082
+ddi_fw-0.0.151.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+ddi_fw-0.0.151.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.151.dist-info/RECORD,,
ddi_fw/test/__init__.py
DELETED
File without changes
ddi_fw/test/basic_test.py
DELETED
@@ -1,15 +0,0 @@
-import json
-
-
-class Metrics():
-    def __init__(self, precision, recall, roc_aupr, roc_auc):
-        self.precision = precision
-        self.recall = recall
-        self.roc_aupr = roc_aupr
-        self.roc_auc = roc_auc
-
-
-m = Metrics( 0.96, 0.96, {"micro": 0.99, "macro": 0.88}, {"micro": 0.99, "macro": 0.88})
-
-as_json = json.dumps(m.__dict__)
-print(as_json)
ddi_fw/test/combination_test.py
DELETED
@@ -1,12 +0,0 @@
-import itertools
-
-l = ['e1','e2','e3','e4','e5']
-all_combinations = []
-for i in range(2, len(l) + 1):
-    all_combinations.extend(list(itertools.combinations(l, i)))
-
-print(all_combinations)
-
-for combination in all_combinations:
-    combination_descriptor = '-'.join(combination)
-    print(combination_descriptor)
ddi_fw/test/date_test.py
DELETED
@@ -1,15 +0,0 @@
-from datetime import datetime, timezone
-
-local_datetime = datetime.now()
-utc_datetime = datetime.now(timezone.utc)
-
-local_iso_str = datetime.strftime(local_datetime, "%Y-%m-%dT%H:%M:%S.%f")[:-3]
-utc_iso_str = datetime.strftime(utc_datetime, "%Y-%m-%dT%H:%M:%S.%f")[:-3]
-
-print(f"local dt: {local_iso_str}, tzname: {local_datetime.tzname()}")
-print(f" utc dt: {utc_iso_str}, tzname: {utc_datetime.tzname()}")
-
-print("\n")
-
-print(f"local dt: {local_datetime.isoformat()}")
-print(f" utc dt: {utc_datetime.isoformat()}")
ddi_fw/test/idf_score.py
DELETED
@@ -1,54 +0,0 @@
-import pandas as pd
-import numpy as np
-
-# from ddi_fw.datasets.feature_vector_generation import find_distinct_elements
-def find_distinct_elements(frame):
-    y = set()
-    for x in frame:
-        if x is not None:
-            for k in x:
-                y.add(k)
-    return y
-
-def calculate_idf(series):
-    idf_scores = {}
-    distinct_items = find_distinct_elements(series)
-    sorted_distinct_items = sorted(distinct_items)
-    total_document_number = len(all_data)
-    for item in sorted_distinct_items:
-        document_freq = series.map(set([item]).issubset).sum()
-        idf = np.log(total_document_number/document_freq)
-        idf_scores[item] = idf
-    return idf_scores
-
-
-item1 = 'T001|T002|T001|T001'
-item2 = 'T002|T003'
-item3 = 'T004|T005'
-
-
-all_data = [item1, item2, item3]
-
-df = pd.DataFrame(all_data, columns=['tui_description'])
-
-df['tui_description'] = df['tui_description'].apply(
-    lambda x: x.split('|') if x is not None else [])
-
-print(df.head())
-
-idf_scores = calculate_idf(df['tui_description'])
-idf_scores_sorted_desc = sorted(idf_scores.items(), key=lambda x:x[1], reverse=True)
-threshold = 1
-keys_over_threshold = [k for k,v in idf_scores.items() if v > threshold]
-
-print(idf_scores_sorted_desc)
-print(keys_over_threshold)
-
-
-def remove_items_by_idf_score(items):
-    return [item for item in items if item in keys_over_threshold]
-
-df['tui_description'] = df['tui_description'].apply(
-    remove_items_by_idf_score)
-
-print(df)