ddi-fw 0.0.54__py3-none-any.whl → 0.0.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/experiments/__init__.py +2 -1
- ddi_fw/experiments/pipeline.py +11 -11
- ddi_fw/experiments/pipeline_ner.py +109 -0
- {ddi_fw-0.0.54.dist-info → ddi_fw-0.0.55.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.54.dist-info → ddi_fw-0.0.55.dist-info}/RECORD +7 -6
- {ddi_fw-0.0.54.dist-info → ddi_fw-0.0.55.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.54.dist-info → ddi_fw-0.0.55.dist-info}/top_level.txt +0 -0
ddi_fw/experiments/__init__.py
CHANGED
ddi_fw/experiments/pipeline.py
CHANGED
@@ -61,21 +61,21 @@ class Experiment:
|
|
61
61
|
kwargs = {"columns": self.columns}
|
62
62
|
for k, v in self.ner_threshold.items():
|
63
63
|
kwargs[k] = v
|
64
|
+
if self.vector_db_persist_directory:
|
65
|
+
self.vector_db = chromadb.PersistentClient(
|
66
|
+
path=self.vector_db_persist_directory)
|
67
|
+
self.collection = self.vector_db.get_collection(
|
68
|
+
self.vector_db_collection_name)
|
69
|
+
dictionary = self.collection.get(include=['embeddings', 'metadatas'])
|
64
70
|
|
65
|
-
|
66
|
-
path=self.vector_db_persist_directory)
|
67
|
-
self.collection = self.vector_db.get_collection(
|
68
|
-
self.vector_db_collection_name)
|
69
|
-
dictionary = self.collection.get(include=['embeddings', 'metadatas'])
|
71
|
+
embedding_dict = defaultdict(lambda: defaultdict(list))
|
70
72
|
|
71
|
-
|
73
|
+
for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
|
74
|
+
embedding_dict[metadata["type"]][metadata["id"]].append(embedding)
|
72
75
|
|
73
|
-
|
74
|
-
embedding_dict[metadata["type"]][metadata["id"]].append(embedding)
|
76
|
+
embedding_size = dictionary['embeddings'].shape[1]
|
75
77
|
|
76
|
-
|
77
|
-
|
78
|
-
pooling_strategy = self.embedding_pooling_strategy_type()
|
78
|
+
pooling_strategy = self.embedding_pooling_strategy_type()
|
79
79
|
|
80
80
|
self.ner_df = CTakesNER().load(filename=self.ner_data_file) if self.ner_data_file else None
|
81
81
|
|
@@ -0,0 +1,109 @@
|
|
1
|
+
from collections import defaultdict
|
2
|
+
from enum import Enum
|
3
|
+
import numpy as np
|
4
|
+
import pandas as pd
|
5
|
+
from ddi_fw.datasets.core import BaseDataset
|
6
|
+
from ddi_fw.experiments.tensorflow_helper import TFMultiModal
|
7
|
+
from ddi_fw.experiments.pipeline import Experiment
|
8
|
+
from typing import Dict, List
|
9
|
+
from itertools import product
|
10
|
+
|
11
|
+
from ddi_fw.utils.enums import DrugBankTextDataTypes, UMLSCodeTypes
|
12
|
+
import mlflow
|
13
|
+
from ddi_fw.ner.ner import CTakesNER
|
14
|
+
|
15
|
+
def stack(df_column):
    """Stack the elements of a column into a single NumPy array.

    Reads ``df_column.values`` and hands it to ``np.stack``, which joins the
    elements along a new leading axis.
    """
    row_arrays = df_column.values
    return np.stack(row_arrays)
|
17
|
+
|
18
|
+
|
19
|
+
class NerParameterSearch:
    """Sweep NER thresholds column by column and train on the resulting inputs.

    ``build()`` instantiates ``dataset_type`` once per (column, threshold)
    pair and collects the inputs each dataset produces; ``run()`` configures
    MLflow and hands the collected inputs to ``TFMultiModal``.
    """

    def __init__(self,
                 experiment_name,
                 experiment_description,
                 experiment_tags,
                 tracking_uri,
                 dataset_type: "type[BaseDataset]",
                 umls_code_types: "List[UMLSCodeTypes]",
                 text_types: "List[DrugBankTextDataTypes]" = None,
                 min_threshold_dict: Dict[str, float] = None,
                 max_threshold_dict: Dict[str, float] = None,
                 increase_step=0.5,
                 ner_data_file=None,
                 combinations=None):
        """Store the search configuration.

        Args:
            experiment_name: MLflow experiment name used by ``run()``.
            experiment_description: Stored on the instance; not read by this
                class.
            experiment_tags: Tags set on the experiment when ``run()`` has to
                create it.
            tracking_uri: MLflow tracking URI.
            dataset_type: Dataset class; called with ``columns``, ``ner_df``
                and the threshold keyword arguments.
            umls_code_types: Enum members whose ``value[0]`` is a column
                prefix, or ``None``.
            text_types: Enum members whose ``value[0]`` is a column suffix,
                or ``None`` to use only the base columns.
            min_threshold_dict: Per-column sweep start. ``None`` gives a
                fresh ``defaultdict(float)`` for this instance.
            max_threshold_dict: Per-column sweep end (exclusive). ``None``
                gives a fresh ``defaultdict(float)`` for this instance.
            increase_step: Step between consecutive thresholds.
            ner_data_file: Optional file loaded through ``CTakesNER`` in
                ``build()``; when falsy no NER frame is loaded.
            combinations: Passed to ``TFMultiModal.predict`` in ``run()``;
                ``None`` is stored as an empty list.
        """
        self.experiment_name = experiment_name
        self.experiment_description = experiment_description
        self.experiment_tags = experiment_tags
        self.tracking_uri = tracking_uri

        self.dataset_type = dataset_type
        self.umls_code_types = umls_code_types
        self.text_types = text_types
        # Default-argument objects are created once per function definition,
        # so the fallback dictionaries are built here, one pair per instance.
        self.min_threshold_dict = (
            defaultdict(float) if min_threshold_dict is None
            else min_threshold_dict)
        self.max_threshold_dict = (
            defaultdict(float) if max_threshold_dict is None
            else max_threshold_dict)
        self.increase_step = increase_step

        # build() and run() read these two attributes; they were previously
        # never assigned, which raised AttributeError.
        self.ner_data_file = ner_data_file
        self.combinations = [] if combinations is None else combinations

        # Populated by build().
        self.datasets = {}
        self.items = []
        self.ner_df = None
        self.y_test_label = None
        self.train_idx_arr = None
        self.val_idx_arr = None

    def build(self):
        """Create one dataset per (column, threshold) and collect its inputs.

        Fills ``self.datasets`` (item name -> ``dataset.ddis_df``),
        ``self.items``, ``self.y_test_label``, ``self.train_idx_arr`` and
        ``self.val_idx_arr``.

        Raises:
            ValueError: If no inputs were produced, e.g. because every
                ``np.arange(min, max, step)`` range was empty.
        """
        self.datasets = {}
        self.items = []
        columns = ['tui', 'cui', 'entities']
        if self.umls_code_types is not None and self.text_types is not None:
            # add checking statements
            _umls_codes = [t.value[0] for t in self.umls_code_types]
            _text_types = [t.value[0] for t in self.text_types]
            columns.extend(
                f'{code}_{text}'
                for code, text in product(_umls_codes, _text_types))
        print(f'Columns: {columns}')
        self.ner_df = (CTakesNER().load(filename=self.ner_data_file)
                       if self.ner_data_file else None)

        train_idx_arr = None
        val_idx_arr = None
        for column in columns:
            min_threshold = self.min_threshold_dict[column]
            max_threshold = self.max_threshold_dict[column]
            # Only the threshold matching this column's prefix is varied;
            # the other two stay at 0.
            kwargs = {
                'threshold_method': 'idf',
                'tui_threshold': 0,
                'cui_threshold': 0,
                'entities_threshold': 0,
            }

            for threshold in np.arange(min_threshold, max_threshold,
                                       self.increase_step):
                print(threshold)
                if column.startswith('tui'):
                    kwargs['tui_threshold'] = threshold
                elif column.startswith('cui'):
                    kwargs['cui_threshold'] = threshold
                elif column.startswith('entities'):
                    kwargs['entities_threshold'] = threshold
                dataset = self.dataset_type(
                    columns=[column],
                    ner_df=self.ner_df,
                    **kwargs)

                # Computing train_idx_arr / val_idx_arr once would actually
                # be enough; they are re-read on every iteration here.
                # NOTE(review): the unpacking targets also assign the 5th
                # and 6th returned values to X_train.index / X_test.index.
                # Kept as in the original -- confirm it is intended.
                X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = dataset.load()
                group_items = dataset.produce_inputs()
                for item in group_items:
                    item[0] = f'threshold_{item[0]}_{threshold}'
                    self.datasets[item[0]] = dataset.ddis_df

                self.items.extend(group_items)

        if not self.items:
            # Previously this case surfaced as a NameError on train_idx_arr.
            raise ValueError(
                'No inputs were produced: check that min_threshold_dict and '
                'max_threshold_dict define a non-empty range for at least '
                'one column.')

        self.y_test_label = self.items[0][4]
        self.train_idx_arr = train_idx_arr
        self.val_idx_arr = val_idx_arr

    def run(self, model_func, batch_size=128, epochs=100):
        """Configure MLflow, then train and predict on the built inputs.

        Args:
            model_func: Forwarded to ``TFMultiModal``.
            batch_size: Forwarded to ``TFMultiModal``.
            epochs: Forwarded to ``TFMultiModal``.

        Returns:
            Whatever ``TFMultiModal.predict(self.combinations)`` returns.

        Raises:
            ValueError: If ``build()`` has not produced any inputs yet.
        """
        if not self.items:
            raise ValueError('No inputs available: call build() before run().')

        mlflow.set_tracking_uri(self.tracking_uri)

        created = mlflow.get_experiment_by_name(self.experiment_name) is None
        if created:
            mlflow.create_experiment(self.experiment_name)
        # Activate the experiment first: set_experiment_tags() tags the
        # active experiment, so calling it earlier tagged the wrong one.
        mlflow.set_experiment(self.experiment_name)
        if created:
            mlflow.set_experiment_tags(self.experiment_tags)

        y_test_label = self.items[0][4]
        multi_modal = TFMultiModal(
            model_func=model_func, batch_size=batch_size, epochs=epochs)  # 100
        multi_modal.set_data(
            self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
        result = multi_modal.predict(self.combinations)
        return result
|
@@ -56,11 +56,12 @@ ddi_fw/drugbank/drugbank_parser.py,sha256=lxUuhB0s8ef_aPNDs0V8ClKF7-KIWugNIV9gVs
|
|
56
56
|
ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9PVncGTXtk,18127
|
57
57
|
ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
|
58
58
|
ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
|
59
|
-
ddi_fw/experiments/__init__.py,sha256=
|
59
|
+
ddi_fw/experiments/__init__.py,sha256=5L2xSolpFycNnflqOMdvJSiqRB16ExA5bbVGORKFX04,195
|
60
60
|
ddi_fw/experiments/custom_torch_model.py,sha256=iQ_R_EApzD2JCcASN8cie6D21oh7VCxaOQ45_dkiGwc,2576
|
61
61
|
ddi_fw/experiments/evaluation_helper.py,sha256=pY69cezV3WzrXw1bduIwRJfah1w3wXJ2YyTNim1J7ko,9349
|
62
|
-
ddi_fw/experiments/pipeline.py,sha256=
|
62
|
+
ddi_fw/experiments/pipeline.py,sha256=wttkvdzGP9d3jC9nx2iZul4hbogXkRho6eDns0yfLiE,5380
|
63
63
|
ddi_fw/experiments/pipeline_builder_pattern.py,sha256=q1PNEQFoO5U3UidEoGB8rgLA7KXr4FsJTXEug5c5UJg,5466
|
64
|
+
ddi_fw/experiments/pipeline_ner.py,sha256=JERKAaPdgKt2wjfVauOd3HXOGbLLoYLNxNCAv9CO_vg,4757
|
64
65
|
ddi_fw/experiments/tensorflow_helper.py,sha256=Y-gD9qyqFFPl6HAvM_tIa5Y6em2YmafPCL1KMrK6eb8,11768
|
65
66
|
ddi_fw/experiments/test.py,sha256=z1TfBpK75zGKpp2ZU8f6APjZlgBFthaCBN61YB9ma4o,2049
|
66
67
|
ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
|
@@ -82,7 +83,7 @@ ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
|
|
82
83
|
ddi_fw/utils/py7zr_helper.py,sha256=dgfHqXDBWys1hmd1JlHhYyZGxrzYWi6siYiUq3bnLuI,4698
|
83
84
|
ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
|
84
85
|
ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
|
85
|
-
ddi_fw-0.0.
|
86
|
-
ddi_fw-0.0.
|
87
|
-
ddi_fw-0.0.
|
88
|
-
ddi_fw-0.0.
|
86
|
+
ddi_fw-0.0.55.dist-info/METADATA,sha256=736seAJsPdjZQPhFly5pkPPGi7SMWr6XqNgUKKRhC2I,1565
|
87
|
+
ddi_fw-0.0.55.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
88
|
+
ddi_fw-0.0.55.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
|
89
|
+
ddi_fw-0.0.55.dist-info/RECORD,,
|
File without changes
|
File without changes
|