ddi-fw 0.0.149__py3-none-any.whl → 0.0.150__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- ddi_fw/datasets/__init__.py +1 -1
- ddi_fw/datasets/core.py +147 -341
- ddi_fw/datasets/dataset_splitter.py +39 -0
- ddi_fw/datasets/ddi_mdl/base.py +194 -130
- ddi_fw/datasets/ddi_mdl/debug.log +1 -0
- ddi_fw/datasets/embedding_generator.py +2 -1
- ddi_fw/langchain/embeddings.py +1 -0
- ddi_fw/ml/evaluation_helper.py +47 -178
- ddi_fw/ml/ml_helper.py +125 -81
- ddi_fw/ml/model_wrapper.py +2 -2
- ddi_fw/ml/pytorch_wrapper.py +175 -72
- ddi_fw/ml/tensorflow_wrapper.py +131 -39
- ddi_fw/ner/ner.py +93 -39
- ddi_fw/pipeline/multi_modal_combination_strategy.py +4 -2
- ddi_fw/pipeline/multi_pipeline.py +2 -15
- ddi_fw/pipeline/ner_pipeline.py +15 -6
- ddi_fw/pipeline/pipeline.py +152 -94
- ddi_fw/{test/compress_json_test.py → utils/json_helper.py} +1 -15
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.150.dist-info}/METADATA +6 -3
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.150.dist-info}/RECORD +22 -31
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.150.dist-info}/WHEEL +1 -1
- ddi_fw/test/__init__.py +0 -0
- ddi_fw/test/basic_test.py +0 -15
- ddi_fw/test/combination_test.py +0 -12
- ddi_fw/test/date_test.py +0 -15
- ddi_fw/test/idf_score.py +0 -54
- ddi_fw/test/jaccard_similarity.py +0 -85
- ddi_fw/test/mlfow_test.py +0 -165
- ddi_fw/test/sklearn-tfidf.py +0 -16
- ddi_fw/test/test.py +0 -93
- ddi_fw/test/torch_cuda_test.py +0 -9
- ddi_fw/test/type_guarding_test.py +0 -18
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.150.dist-info}/top_level.txt +0 -0
ddi_fw/pipeline/pipeline.py
CHANGED
@@ -1,7 +1,12 @@
+from typing import Any, Dict, List, Optional, Type, Union
 import numpy as np
 import pandas as pd
 import chromadb
 from collections import defaultdict
+from chromadb.api.types import IncludeEnum
+
+from pydantic import BaseModel
+from ddi_fw.datasets.core import TextDatasetMixin
 from ddi_fw.ner.ner import CTakesNER
 from ddi_fw.langchain.embeddings import PoolingStrategy
 from ddi_fw.datasets import BaseDataset, DDIMDLDataset
@@ -10,44 +15,81 @@ import mlflow
 from ddi_fw.ml import MultiModalRunner
 
 
-class Pipeline:
-    def __init__(self,
-                 library='tensorflow',
-                 experiment_name=None,
-                 experiment_description=None,
-                 experiment_tags=None,
-                 artifact_location=None,
-                 tracking_uri=None,
-                 dataset_type: BaseDataset = None,
-                 columns=None,
-                 embedding_dict=None,
-                 column_embedding_configs=None,
-                 vector_db_persist_directory=None,
-                 vector_db_collection_name=None,
-                 embedding_pooling_strategy_type: PoolingStrategy = None,
-                 ner_data_file=None,
-                 ner_threshold=None,
-                 combinations=None,
-                 model=None,
-                 multi_modal = None ):
-        self.library = library
-        self.experiment_name = experiment_name
-        self.experiment_description = experiment_description
-        self.experiment_tags = experiment_tags
-        self.artifact_location = artifact_location
-        self.tracking_uri = tracking_uri
-        self.dataset_type = dataset_type
-        self.columns = columns
-        self.embedding_dict = embedding_dict
-        self.column_embedding_configs = column_embedding_configs
-        self.vector_db_persist_directory = vector_db_persist_directory
-        self.vector_db_collection_name = vector_db_collection_name
-        self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
-        self.ner_data_file = ner_data_file
-        self.ner_threshold = ner_threshold
-        self.combinations = combinations
-        self.model = model
-        self.multi_modal = multi_modal
+class Pipeline(BaseModel):
+    library: str = 'tensorflow'
+    experiment_name: str
+    experiment_description: str
+    experiment_tags: Optional[Dict[str, Any]] = None
+    artifact_location: Optional[str] = None
+    tracking_uri: Optional[str] = None
+    dataset_type: Type[BaseDataset]
+    columns: Optional[List[str]] = None
+    embedding_dict: Optional[Dict[str, Any]] = None
+    column_embedding_configs: Optional[Dict] = None
+    vector_db_persist_directory: Optional[str] = None
+    vector_db_collection_name: Optional[str] = None
+    embedding_pooling_strategy_type: Type[PoolingStrategy] | None = None
+    ner_data_file: Optional[str] = None
+    ner_threshold: Optional[dict] = None
+    combinations: Optional[List[str]] = None
+    model: Optional[Any] = None
+    multi_modal: Optional[Any] = None
+    use_mlflow: bool = True
+    _items: List = []
+    _train_idx_arr: List | None = []
+    _val_idx_arr: List | None = []
+
+    @property
+    def items(self) -> List:
+        return self._items
+    @property
+    def train_idx_arr(self) -> List | None:
+        return self._train_idx_arr
+    @property
+    def val_idx_arr(self) -> List | None:
+        return self._val_idx_arr
+
+    class Config:
+        arbitrary_types_allowed = True
+
+# class Pipeline:
+#     def __init__(self,
+#                  library='tensorflow',
+#                  experiment_name=None,
+#                  experiment_description=None,
+#                  experiment_tags=None,
+#                  artifact_location=None,
+#                  tracking_uri=None,
+#                  dataset_type: BaseDataset = None,
+#                  columns=None,
+#                  embedding_dict=None,
+#                  column_embedding_configs=None,
+#                  vector_db_persist_directory=None,
+#                  vector_db_collection_name=None,
+#                  embedding_pooling_strategy_type: PoolingStrategy = None,
+#                  ner_data_file=None,
+#                  ner_threshold=None,
+#                  combinations=None,
+#                  model=None,
+#                  multi_modal = None ):
+#         self.library = library
+#         self.experiment_name = experiment_name
+#         self.experiment_description = experiment_description
+#         self.experiment_tags = experiment_tags
+#         self.artifact_location = artifact_location
+#         self.tracking_uri = tracking_uri
+#         self.dataset_type = dataset_type
+#         self.columns = columns
+#         self.embedding_dict = embedding_dict
+#         self.column_embedding_configs = column_embedding_configs
+#         self.vector_db_persist_directory = vector_db_persist_directory
+#         self.vector_db_collection_name = vector_db_collection_name
+#         self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
+#         self.ner_data_file = ner_data_file
+#         self.ner_threshold = ner_threshold
+#         self.combinations = combinations
+#         self.model = model
+#         self.multi_modal = multi_modal
 
     def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
         """
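Note on the pattern above: in pydantic v2 (the metadata below pins pydantic==2.10.6), annotated attributes whose names start with an underscore, such as _items, become private attributes rather than validated fields, and arbitrary_types_allowed lets fields hold non-pydantic objects such as datasets or models. A minimal self-contained sketch of the same pattern, with illustrative names that are not part of ddi_fw:

    from typing import Any, List, Optional
    from pydantic import BaseModel

    class Job(BaseModel):
        name: str
        model: Optional[Any] = None   # may hold any object, e.g. a Keras model
        _results: List = []           # private attribute: not validated, not serialized

        @property
        def results(self) -> List:    # read-only accessor, mirroring Pipeline.items
            return self._results

        class Config:
            arbitrary_types_allowed = True  # mirrors the diff; needed for non-pydantic field types

    job = Job(name="demo")       # _results is not a constructor field
    job._results.append(0.93)    # populated internally, e.g. by a build() step
    print(job.results)           # [0.93]
    print(job.model_dump())      # {'name': 'demo', 'model': None}, private attr excluded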
@@ -64,29 +106,47 @@ class Pipeline:
             vector_db = chromadb.PersistentClient(
                 path=vector_db_persist_directory)
             collection = vector_db.get_collection(vector_db_collection_name)
-
+            include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
+            dictionary: chromadb.GetResult
             # Fetch the embeddings and metadata
             if column == None:
                 dictionary = collection.get(
-                    include=['embeddings', 'metadatas'])
+                    include=include
+                    # include=['embeddings', 'metadatas']
+                )
                 print(
                     f"Embeddings are calculated from {vector_db_collection_name}")
             else:
-                dictionary = collection.get(
-                    include=['embeddings', 'metadatas'], where={"type": {"$eq": f"{column}"}})
+                dictionary = collection.get(
+                    include=include,
+                    # include=['embeddings', 'metadatas'],
+                    where={
+                        "type": {"$eq": f"{column}"}})
                 print(
                     f"Embeddings of {column} are calculated from {vector_db_collection_name}")
+
             # Populate the embedding dictionary with embeddings from the vector database
-            for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
+            metadatas = dictionary["metadatas"]
+            embeddings = dictionary["embeddings"]
+            if metadatas == None or embeddings == None:
+                raise ValueError(
+                    "The collection does not contain embeddings or metadatas.")
+            for metadata, embedding in zip(metadatas, embeddings):
                 embedding_dict[metadata["type"]
                                ][metadata["id"]].append(embedding)
 
-            # return dictionary['embeddings'].shape[1]
         else:
             raise ValueError(
                 "Persistent directory for the vector DB is not specified.")
 
     def build(self):
+        if self.embedding_pooling_strategy_type is not None and not isinstance(self.embedding_pooling_strategy_type, type):
+            raise TypeError(
+                "self.embedding_pooling_strategy_type must be a class, not an instance")
+        if not isinstance(self.dataset_type, type):
+            raise TypeError(
+                "self.dataset_type must be a class, not an instance")
+
         # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
         kwargs = {"columns": self.columns}
         if self.ner_threshold:
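Note: the hunk above replaces the raw include=['embeddings', 'metadatas'] strings with the typed IncludeEnum and guards against collections that return no embeddings or metadata. A hedged sketch of the same chromadb access pattern, with a collection name and metadata values that are illustrative rather than taken from ddi_fw:

    import chromadb
    from chromadb.api.types import IncludeEnum

    client = chromadb.PersistentClient(path="./chroma_demo")  # assumed local path
    collection = client.get_or_create_collection("demo")
    collection.add(
        ids=["d1", "d2"],
        embeddings=[[0.1, 0.2], [0.3, 0.4]],
        metadatas=[{"type": "target", "id": "d1"}, {"type": "enzyme", "id": "d2"}],
    )

    # Typed include list; `where` filters on metadata, as in the column branch above.
    result = collection.get(
        include=[IncludeEnum.embeddings, IncludeEnum.metadatas],
        where={"type": {"$eq": "target"}},
    )
    for metadata, embedding in zip(result["metadatas"], result["embeddings"]):
        print(metadata["id"], embedding)  # e.g. d1 [0.1 0.2]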
@@ -103,90 +163,88 @@ class Pipeline:
                     col_db_collection = item["vector_db_collection_name"]
                     self.__create_or_update_embeddings__(
                         embedding_dict, col_db_dir, col_db_collection, col)
-
+
             elif self.vector_db_persist_directory:
                 self.__create_or_update_embeddings__(
                     embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
-
+
             else:
                 print(
                     f"There is no configuration of Embeddings")
 
-            # if self.embedding_dict == None:
-            #     if self.vector_db_persist_directory:
-            #         self.vector_db = chromadb.PersistentClient(
-            #             path=self.vector_db_persist_directory)
-            #         self.collection = self.vector_db.get_collection(
-            #             self.vector_db_collection_name)
-            #         dictionary = self.collection.get(
-            #             include=['embeddings', 'metadatas'])
-
-            #         embedding_dict = defaultdict(lambda: defaultdict(list))
-
-            #         for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
-            #             embedding_dict[metadata["type"]
-            #                            ][metadata["id"]].append(embedding)
-
-            #         embedding_size = dictionary['embeddings'].shape[1]
         else:
             embedding_dict = self.embedding_dict
             # TODO make generic
             # embedding_size = list(embedding_dict['all_text'].values())[
             # 0][0].shape
-        key, value = next(iter(embedding_dict.items()))
-        embedding_size = value[next(iter(value))][0].shape[0]
-        pooling_strategy = self.embedding_pooling_strategy_type()
 
-        self.ner_df = CTakesNER().load(
-            filename=self.ner_data_file) if self.ner_data_file else None
+        # self.ner_df = CTakesNER(df=None).load(
+        #     filename=self.ner_data_file) if self.ner_data_file else None
 
-        self.dataset = self.dataset_type(
-            embedding_dict=embedding_dict,
-            embedding_size=embedding_size,
-            embeddings_pooling_strategy=pooling_strategy,
-            **kwargs)
+        if issubclass(self.dataset_type, TextDatasetMixin):
+            key, value = next(iter(embedding_dict.items()))
+            embedding_size = value[next(iter(value))][0].shape[0]
+            pooling_strategy = self.embedding_pooling_strategy_type() if self.embedding_pooling_strategy_type else None
+
+            dataset = self.dataset_type(
+                embedding_dict=embedding_dict,
+                embedding_size=embedding_size,
+                embeddings_pooling_strategy=pooling_strategy,
+                **kwargs)
+        else:
+            dataset = self.dataset_type(**kwargs)
 
-        X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset.load()
+        X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = dataset.load()
+
+        dataframe = dataset.dataframe
+
+        if dataframe is None:  # if the dataframe is None, it means that the dataset is not loaded
+            raise ValueError("The dataset is not loaded")
 
-        self.dataframe = self.dataset.dataframe
         # dataframe.dropna()
-        self.X_train = self.dataset.X_train
-        self.X_test = self.dataset.X_test
-        self.y_train = self.dataset.y_train
-        self.y_test = self.dataset.y_test
-        self.train_idx_arr = self.dataset.train_idx_arr
-        self.val_idx_arr = self.dataset.val_idx_arr
+        X_train = dataset.X_train
+        X_test = dataset.X_test
+        y_train = dataset.y_train
+        y_test = dataset.y_test
+        self._train_idx_arr = dataset.train_idx_arr
+        self._val_idx_arr = dataset.val_idx_arr
         # Logic to set up the experiment
         # column name, train data, train label, test data, test label
-        self.items = self.dataset.produce_inputs()
+        self._items = dataset.produce_inputs()
 
-        unique_classes = pd.unique(
-        event_num = len(unique_classes)
+        # unique_classes = pd.unique(dataframe[dataset.class_column])
+        # event_num = len(unique_classes)
         # droprate = 0.3
-        vector_size = self.dataset.drugs_df.shape[0]
+        # vector_size = self.dataset.drugs_df.shape[0]
 
         print("Building the experiment with the following settings:")
         print(
-            f"Name: {self.experiment_name}, Dataset: {self.dataset}, Model: {self.model}")
+            f"Name: {self.experiment_name}, Dataset: {dataset}, Model: {self.model}")
         # Implement additional build logic as needed
         return self
 
     def run(self):
-        mlflow.set_tracking_uri(self.tracking_uri)
-
-        if mlflow.get_experiment_by_name(self.experiment_name) == None:
-            mlflow.create_experiment(
-                self.experiment_name, self.artifact_location)
-            mlflow.set_experiment_tags(self.experiment_tags)
-        mlflow.set_experiment(self.experiment_name)
+        if self.use_mlflow:
+            if self.tracking_uri is None:
+                raise ValueError("Tracking uri should be specified")
+            mlflow.set_tracking_uri(self.tracking_uri)
+
+            if mlflow.get_experiment_by_name(self.experiment_name) == None:
+                mlflow.create_experiment(
+                    self.experiment_name, self.artifact_location)
+            if self.experiment_tags is not None:
+                mlflow.set_experiment_tags(self.experiment_tags)
+            mlflow.set_experiment(self.experiment_name)
 
         y_test_label = self.items[0][4]
-        multi_modal_runner = MultiModalRunner(library=self.library, multi_modal=self.multi_modal)
+        multi_modal_runner = MultiModalRunner(
+            library=self.library, multi_modal=self.multi_modal)
         # multi_modal_runner = MultiModalRunner(
         #     library=self.library, model_func=model_func, batch_size=batch_size, epochs=epochs)
         # multi_modal = TFMultiModal(
         #     model_func=model_func, batch_size=batch_size, epochs=epochs) # 100
         multi_modal_runner.set_data(
             self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
-        result = multi_modal_runner.predict(self.combinations)
+        combinations = self.combinations if self.combinations is not None else []
+        result = multi_modal_runner.predict(combinations)
         return result
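Note: taken together, the refactor turns Pipeline into a declaratively configured model driven through build() and then run(). A sketch of how it might be instantiated, inferred only from the fields and methods visible in this diff; the import path and argument values are placeholders, and a real run would likely also need an embedding configuration (embedding_dict or a vector DB):

    from ddi_fw.datasets import DDIMDLDataset
    from ddi_fw.pipeline import Pipeline

    pipeline = Pipeline(
        experiment_name="ddi-demo",
        experiment_description="smoke test",
        dataset_type=DDIMDLDataset,  # must be a class; build() raises TypeError for instances
        columns=["smile", "target"],
        use_mlflow=False,            # run() only touches MLflow when this is True
    )
    result = pipeline.build().run()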
ddi_fw/{test/compress_json_test.py → utils/json_helper.py}
RENAMED
@@ -1,4 +1,3 @@
-from compress_json import compress, decompress
 import json
 import sys
 
@@ -9,17 +8,4 @@ def minify(folder, file_name):
     json_string = json.dumps(json_data, separators=(',', ":"))  # Compact JSON structure
     file_name = str(file_name).replace(".json", "")  # remove .json from end of file_name string
     new_file_name = folder+"/{0}_minify.json".format(file_name)
-    open(new_file_name, "w+", 1).write(json_string)  # open and write json_string to file
-
-json_file = f'C:\\Users\\kivanc\\Downloads\\metrics.json'
-data_file = f'C:\\Users\\kivanc\\Downloads\\data.json'
-
-minify('C:\\Users\\kivanc\\Downloads','metrics.json')
-
-# with open(json_file, 'r', encoding="utf8") as f:
-#     data = json.load(f)
-
-# compressed = compress(data)  # the result is a list (array)
-
-# with open(data_file, "w") as fd:
-#     fd.write(json.dumps(compressed))  # convert into string if needed
+    open(new_file_name, "w+", 1).write(json_string)  # open and write json_string to file
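Note: the move drops the module-level scratch code (hard-coded C:\Users\... paths and the commented compress_json experiment), leaving only the minify helper. Usage sketch with an illustrative path:

    from ddi_fw.utils.json_helper import minify

    minify("/tmp", "metrics.json")  # writes /tmp/metrics_minify.json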
{ddi_fw-0.0.149.dist-info → ddi_fw-0.0.150.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.149
+Version: 0.0.150
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -22,6 +22,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
+Requires-Dist: pydantic==2.10.6
 Requires-Dist: importlib-resources==6.4.5
 Requires-Dist: python-stopwatch==1.1.11
 Requires-Dist: lxml==5.3.0
@@ -35,7 +36,7 @@ Requires-Dist: rdkit==2023.3.3
 Requires-Dist: scikit-learn==1.5.2
 Requires-Dist: scipy==1.13.1
 Requires-Dist: accelerate>=0.33.0
-Requires-Dist: sentence-transformers
+Requires-Dist: sentence-transformers<=3.3.1,>=3.0.1
 Requires-Dist: transformers>=4.42.4
 Requires-Dist: stanza==1.9.2
 Requires-Dist: tokenizers>=0.19.1
@@ -49,3 +50,5 @@ Requires-Dist: chromadb>=0.5.15
 Requires-Dist: langchain_community==0.3.3
 Requires-Dist: datasets==3.0.2
 Requires-Dist: unstructured==0.16.3
+Requires-Dist: tensorflow<2.18.0,>=2.17.0
+Requires-Dist: tf-keras==2.17.0
{ddi_fw-0.0.149.dist-info → ddi_fw-0.0.150.dist-info}/RECORD
CHANGED
@@ -1,11 +1,13 @@
-ddi_fw/datasets/__init__.py,sha256=
-ddi_fw/datasets/core.py,sha256=
+ddi_fw/datasets/__init__.py,sha256=yDsRQD_9Ijpm_Rl2wSDwdutG5Q_wca_UBPEvm7nBx04,444
+ddi_fw/datasets/core.py,sha256=JA6WJz3VCUfxI85rYE7ZBqC4pnn7L8NSS9-EgjLw710,7968
+ddi_fw/datasets/dataset_splitter.py,sha256=lLIelXv-8rCK0tbwLNgHBHYUO_65HT-_kErAlZhRQVE,1662
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
-ddi_fw/datasets/embedding_generator.py,sha256=
+ddi_fw/datasets/embedding_generator.py,sha256=jiDKwLaPMaQkloxQkuCrhl-A-2OdvocmkSzjWtUnk4g,2255
 ddi_fw/datasets/feature_vector_generation.py,sha256=gvjpEzkgVV8dp4V8NMMv59u0v-1tNAJ7v83R-keWGoA,4748
 ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
-ddi_fw/datasets/ddi_mdl/base.py,sha256=
+ddi_fw/datasets/ddi_mdl/base.py,sha256=usw3AhBCjdYwZx9MMnyNaUYTEyYXoRSO4fNJJHxnPuk,9312
+ddi_fw/datasets/ddi_mdl/debug.log,sha256=eWz05j8RFqZuHFDTCF7Rck5w4rvtTanFN21iZsgxO7Y,115
 ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
 ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
 ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt,sha256=XVlDqYATckrQwNSXqMSKVBqyoN_Hg8SK6CL-XMdLADY,102176
@@ -70,43 +72,32 @@ ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9P
 ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
 ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
-ddi_fw/langchain/embeddings.py,sha256=
+ddi_fw/langchain/embeddings.py,sha256=XzIYgmqnAO93pnavKRDhYDoz0RhDn-RoC7CDc0yAvbM,7572
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
 ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
-ddi_fw/ml/evaluation_helper.py,sha256=
-ddi_fw/ml/ml_helper.py,sha256=
-ddi_fw/ml/model_wrapper.py,sha256=
-ddi_fw/ml/pytorch_wrapper.py,sha256=
-ddi_fw/ml/tensorflow_wrapper.py,sha256=
+ddi_fw/ml/evaluation_helper.py,sha256=JFATMquaQVa2gckxmEivCztZmivWBAAP7EpJ8PVeI3c,7626
+ddi_fw/ml/ml_helper.py,sha256=E6ef7f1UnQl6JBUdGDbbbI4FIS-904VGypT7tI0a598,8545
+ddi_fw/ml/model_wrapper.py,sha256=kabPXuo7S8tGkp9a00V04n4rXDmv7dD8wYGMjotISRc,1050
+ddi_fw/ml/pytorch_wrapper.py,sha256=pe6UsjP2XeTgLxDnIUiodoyhJTGCxV27wD4Cjxysu2Q,8553
+ddi_fw/ml/tensorflow_wrapper.py,sha256=jt6h9Q-wF0mkbnvV6yCCl1SpUd2paHK70Bu6EFrkmd0,10112
 ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
 ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6859
-ddi_fw/ner/ner.py,sha256=
+ddi_fw/ner/ner.py,sha256=FHyyX53Xwpdw8Hec261dyN88yD7Z9LmJua2mIrQLguI,17967
 ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
-ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=
-ddi_fw/pipeline/multi_pipeline.py,sha256=
-ddi_fw/pipeline/ner_pipeline.py,sha256=
-ddi_fw/pipeline/pipeline.py,sha256
-ddi_fw/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ddi_fw/test/basic_test.py,sha256=fEOGcZm1ObnsDvMiXNmdmz6YCeUrGc8V0DwlSwGhsq8,376
-ddi_fw/test/combination_test.py,sha256=TWNE8sf-DSh1Q9-yRaRBc774Sn1kSMGXLwQhd2_Qynk,324
-ddi_fw/test/compress_json_test.py,sha256=BGny56YqiG-pzhMoDzLKQBQI1E7o3jU0S7VYWtclAx4,1045
-ddi_fw/test/date_test.py,sha256=QmJ97ennS9LxLl8mGBkM2ob8_KWEFmiLakZTI9zQxxo,532
-ddi_fw/test/idf_score.py,sha256=YsAur-F1T3eFxn9KrcK3VXCvrsV_LXrpHxPjMKZeQZ8,1523
-ddi_fw/test/jaccard_similarity.py,sha256=pf6SNI52RCUZ0otx_1cz7A0p7kyfoCZv13Tbc_rxfuw,2382
-ddi_fw/test/mlfow_test.py,sha256=L2hJAeIU5PDSxsyWTtV6PY0bfaWerWUJ1buni9BTjXo,4853
-ddi_fw/test/sklearn-tfidf.py,sha256=cjtg27vLskEMXgrsqUR_EapRGVd4xgwOQ9zYsu72zjs,657
-ddi_fw/test/test.py,sha256=zJh9ZBcZl-vZIFDvuftcRrRV8WAwtiFVhPPd6Et4OU4,2997
-ddi_fw/test/torch_cuda_test.py,sha256=R-4VGVErl_Ufk54DoZbgL_YXWoCYFyanIVWd6P39IEk,312
-ddi_fw/test/type_guarding_test.py,sha256=KxjyBxohDu7lwpejalCj-REjtJ-k1S1wQbOB6TGY0O8,766
+ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
+ddi_fw/pipeline/multi_pipeline.py,sha256=D_BZ3ciHbVGuuB7m7cEmVQHESruh1gqhA-vxCMfNKj0,5407
+ddi_fw/pipeline/ner_pipeline.py,sha256=q1aKjb54Ra1HzZ7dARvBw6lB37je9R-POEf2h6QT_nU,6018
+ddi_fw/pipeline/pipeline.py,sha256=Xp5_cPj0SZ6b1lRWepwKCHoCbhEnzSZexm56CtvO_4Y,11073
 ddi_fw/utils/__init__.py,sha256=77563ikqAtdzjjgRlLp5OAsJBbpLA1Cao8iecGaVUXQ,354
 ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
+ddi_fw/utils/json_helper.py,sha256=BVU6wmJgdXPxyqLPu3Ck_9Es5RrP1PDanKvE-OSj1D4,571
 ddi_fw/utils/kaggle.py,sha256=wKRJ18KpQ6P-CubpZklEgsDtyFpR9RUL1_HyyF6ttEE,2425
 ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5I,1056
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.149.dist-info/METADATA,sha256=
-ddi_fw-0.0.149.dist-info/WHEEL,sha256=
-ddi_fw-0.0.149.dist-info/top_level.txt,sha256=
-ddi_fw-0.0.149.dist-info/RECORD,,
+ddi_fw-0.0.150.dist-info/METADATA,sha256=QGoZpcrDypCUbyMgSXEe2vdWBeYmLG5gSw6qnyWKQLc,2082
+ddi_fw-0.0.150.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+ddi_fw-0.0.150.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.150.dist-info/RECORD,,
ddi_fw/test/__init__.py
DELETED
File without changes
ddi_fw/test/basic_test.py
DELETED
@@ -1,15 +0,0 @@
-import json
-
-
-class Metrics():
-    def __init__(self, precision, recall, roc_aupr, roc_auc):
-        self.precision = precision
-        self.recall = recall
-        self.roc_aupr = roc_aupr
-        self.roc_auc = roc_auc
-
-
-m = Metrics( 0.96, 0.96, {"micro": 0.99, "macro": 0.88}, {"micro": 0.99, "macro": 0.88})
-
-as_json = json.dumps(m.__dict__)
-print(as_json)
ddi_fw/test/combination_test.py
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
import itertools
|
2
|
-
|
3
|
-
l = ['e1','e2','e3','e4','e5']
|
4
|
-
all_combinations = []
|
5
|
-
for i in range(2, len(l) + 1):
|
6
|
-
all_combinations.extend(list(itertools.combinations(l, i)))
|
7
|
-
|
8
|
-
print(all_combinations)
|
9
|
-
|
10
|
-
for combination in all_combinations:
|
11
|
-
combination_descriptor = '-'.join(combination)
|
12
|
-
print(combination_descriptor)
|
ddi_fw/test/date_test.py
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
from datetime import datetime, timezone
|
2
|
-
|
3
|
-
local_datetime = datetime.now()
|
4
|
-
utc_datetime = datetime.now(timezone.utc)
|
5
|
-
|
6
|
-
local_iso_str = datetime.strftime(local_datetime, "%Y-%m-%dT%H:%M:%S.%f")[:-3]
|
7
|
-
utc_iso_str = datetime.strftime(utc_datetime, "%Y-%m-%dT%H:%M:%S.%f")[:-3]
|
8
|
-
|
9
|
-
print(f"local dt: {local_iso_str}, tzname: {local_datetime.tzname()}")
|
10
|
-
print(f" utc dt: {utc_iso_str}, tzname: {utc_datetime.tzname()}")
|
11
|
-
|
12
|
-
print("\n")
|
13
|
-
|
14
|
-
print(f"local dt: {local_datetime.isoformat()}")
|
15
|
-
print(f" utc dt: {utc_datetime.isoformat()}")
|
ddi_fw/test/idf_score.py
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
import pandas as pd
|
2
|
-
import numpy as np
|
3
|
-
|
4
|
-
# from ddi_fw.datasets.feature_vector_generation import find_distinct_elements
|
5
|
-
def find_distinct_elements(frame):
|
6
|
-
y = set()
|
7
|
-
for x in frame:
|
8
|
-
if x is not None:
|
9
|
-
for k in x:
|
10
|
-
y.add(k)
|
11
|
-
return y
|
12
|
-
|
13
|
-
def calculate_idf(series):
|
14
|
-
idf_scores = {}
|
15
|
-
distinct_items = find_distinct_elements(series)
|
16
|
-
sorted_distinct_items = sorted(distinct_items)
|
17
|
-
total_document_number = len(all_data)
|
18
|
-
for item in sorted_distinct_items:
|
19
|
-
document_freq = series.map(set([item]).issubset).sum()
|
20
|
-
idf = np.log(total_document_number/document_freq)
|
21
|
-
idf_scores[item] = idf
|
22
|
-
return idf_scores
|
23
|
-
|
24
|
-
|
25
|
-
item1 = 'T001|T002|T001|T001'
|
26
|
-
item2 = 'T002|T003'
|
27
|
-
item3 = 'T004|T005'
|
28
|
-
|
29
|
-
|
30
|
-
all_data = [item1, item2, item3]
|
31
|
-
|
32
|
-
df = pd.DataFrame(all_data, columns=['tui_description'])
|
33
|
-
|
34
|
-
df['tui_description'] = df['tui_description'].apply(
|
35
|
-
lambda x: x.split('|') if x is not None else [])
|
36
|
-
|
37
|
-
print(df.head())
|
38
|
-
|
39
|
-
idf_scores = calculate_idf(df['tui_description'])
|
40
|
-
idf_scores_sorted_desc = sorted(idf_scores.items(), key=lambda x:x[1], reverse=True)
|
41
|
-
threshold = 1
|
42
|
-
keys_over_threshold = [k for k,v in idf_scores.items() if v > threshold]
|
43
|
-
|
44
|
-
print(idf_scores_sorted_desc)
|
45
|
-
print(keys_over_threshold)
|
46
|
-
|
47
|
-
|
48
|
-
def remove_items_by_idf_score(items):
|
49
|
-
return [item for item in items if item in keys_over_threshold]
|
50
|
-
|
51
|
-
df['tui_description'] = df['tui_description'].apply(
|
52
|
-
remove_items_by_idf_score)
|
53
|
-
|
54
|
-
print(df)
|
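Note: for reference, the deleted script scores idf(t) = ln(N / df(t)) over N = 3 sample documents. T002 occurs in two of them, so idf(T002) = ln(3/2) ≈ 0.41, below the threshold of 1 and therefore filtered out; T001, T003, T004 and T005 each occur in one document, score ln(3) ≈ 1.10, and survive.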
@@ -1,85 +0,0 @@
|
|
1
|
-
import pandas as pd
|
2
|
-
|
3
|
-
# data = {'A': [1, 1, 1, 0, 0],
|
4
|
-
# 'B': [0, 1, 1, 1, 0],
|
5
|
-
# 'C': [0, 0, 1, 1, 1]}
|
6
|
-
|
7
|
-
# df = pd.DataFrame(data)
|
8
|
-
|
9
|
-
|
10
|
-
# from scipy.spatial.distance import pdist, squareform
|
11
|
-
|
12
|
-
# jaccard_dist = pdist(df.values, metric='jaccard')
|
13
|
-
# jaccard_dist_matrix = squareform(jaccard_dist)
|
14
|
-
|
15
|
-
# print(jaccard_dist_matrix)
|
16
|
-
|
17
|
-
|
18
|
-
# import pandas as pd
|
19
|
-
# from scipy.spatial.distance import euclidean, pdist, squareform
|
20
|
-
|
21
|
-
|
22
|
-
# def similarity_func(u, v):
|
23
|
-
# return 1/(1+euclidean(u,v))
|
24
|
-
|
25
|
-
# DF_var = pd.DataFrame.from_dict({"s1":[1.2,3.4,10.2],"s2":[1.4,3.1,10.7],"s3":[2.1,3.7,11.3],"s4":[1.5,3.2,10.9]})
|
26
|
-
# DF_var.index = ["g1","g2","g3"]
|
27
|
-
|
28
|
-
# dists = pdist(DF_var, similarity_func)
|
29
|
-
# DF_euclid = pd.DataFrame(squareform(dists), columns=DF_var.index, index=DF_var.index)
|
30
|
-
|
31
|
-
# print(DF_euclid)
|
32
|
-
|
33
|
-
|
34
|
-
from sklearn.metrics import jaccard_score
|
35
|
-
import seaborn as sns
|
36
|
-
import matplotlib.pyplot as plt
|
37
|
-
|
38
|
-
data = [[0, 1, 0], [0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1]]
|
39
|
-
|
40
|
-
similarity_matrix = []
|
41
|
-
for i in range(len(data)):
|
42
|
-
row = []
|
43
|
-
for j in range(len(data)):
|
44
|
-
row.append(jaccard_score(data[i], data[j]))
|
45
|
-
similarity_matrix.append(row)
|
46
|
-
|
47
|
-
sns.heatmap(pd.DataFrame(similarity_matrix), annot=True, cmap="YlGnBu")
|
48
|
-
plt.show()
|
49
|
-
|
50
|
-
|
51
|
-
# https://stackoverflow.com/questions/35639571/python-pandas-distance-matrix-using-jaccard-similarity
|
52
|
-
import pandas as pd
|
53
|
-
entries = [
|
54
|
-
{'id':'1', 'category1':'100', 'category2': '0', 'category3':'100'},
|
55
|
-
{'id':'2', 'category1':'100', 'category2': '0', 'category3':'100'},
|
56
|
-
{'id':'3', 'category1':'0', 'category2': '100', 'category3':'100'},
|
57
|
-
{'id':'4', 'category1':'100', 'category2': '100', 'category3':'100'},
|
58
|
-
{'id':'5', 'category1':'100', 'category2': '0', 'category3':'100'}
|
59
|
-
]
|
60
|
-
df = pd.DataFrame(entries)
|
61
|
-
|
62
|
-
from scipy.spatial.distance import squareform
|
63
|
-
from scipy.spatial.distance import pdist, jaccard
|
64
|
-
|
65
|
-
res = 1 - pdist(df[['category1','category2','category3']], 'jaccard')
|
66
|
-
# squareform(res)
|
67
|
-
distance = pd.DataFrame(squareform(res), index=df.index, columns= df.index)
|
68
|
-
print(distance)
|
69
|
-
|
70
|
-
entries2 = [
|
71
|
-
{'id':'1', 'cat':['p1','p2','p3']},
|
72
|
-
{'id':'2', 'cat':['p3','p4','p5']},
|
73
|
-
{'id':'3', 'cat':['p5','p6','p7']},
|
74
|
-
]
|
75
|
-
df2 = pd.DataFrame(entries2)
|
76
|
-
|
77
|
-
c = df2['cat']
|
78
|
-
|
79
|
-
y = set()
|
80
|
-
|
81
|
-
for x in c:
|
82
|
-
for k in x:
|
83
|
-
y.add(k)
|
84
|
-
|
85
|
-
print(y)
|
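Note: sklearn's jaccard_score on binary vectors computes |A ∩ B| / |A ∪ B| over the positive positions. For the first two rows of data, [0, 1, 0] and [0, 1, 1], the intersection holds one active position and the union two, so the heatmap shows 0.5 for that pair.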