ddi-fw 0.0.53__py3-none-any.whl → 0.0.55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/datasets/core.py CHANGED
@@ -298,11 +298,13 @@ class BaseDataset(ABC):
298
298
  combined_df = filtered_df.copy()
299
299
  # TODO: eğer kullanılan veri setinde tui, cui veya entity bilgileri yoksa o veri setine bu sütunları eklemek için aşağısı gerekli
300
300
 
301
- idf_calc = IDF(filtered_ner_df, [f for f in filtered_ner_df.keys()])
301
+ # idf_calc = IDF(filtered_ner_df, [f for f in filtered_ner_df.keys()])
302
+ idf_calc = IDF(filtered_ner_df, self.ner_columns)
302
303
  idf_calc.calculate()
303
304
  idf_scores_df = idf_calc.to_dataframe()
304
305
 
305
- for key in filtered_ner_df.keys():
306
+ # for key in filtered_ner_df.keys():
307
+ for key in self.ner_columns:
306
308
  threshold = 0
307
309
  if key.startswith('tui'):
308
310
  threshold = self.tui_threshold
@@ -1,3 +1,4 @@
1
1
  from .tensorflow_helper import TFMultiModal, TFSingleModal,Result
2
2
  from .evaluation_helper import evaluate, Metrics
3
- from .pipeline import Experiment
3
+ from .pipeline import Experiment
4
+ from .pipeline_ner import NerParameterSearch
@@ -61,21 +61,21 @@ class Experiment:
61
61
  kwargs = {"columns": self.columns}
62
62
  for k, v in self.ner_threshold.items():
63
63
  kwargs[k] = v
64
+ if self.vector_db_persist_directory:
65
+ self.vector_db = chromadb.PersistentClient(
66
+ path=self.vector_db_persist_directory)
67
+ self.collection = self.vector_db.get_collection(
68
+ self.vector_db_collection_name)
69
+ dictionary = self.collection.get(include=['embeddings', 'metadatas'])
64
70
 
65
- self.vector_db = chromadb.PersistentClient(
66
- path=self.vector_db_persist_directory)
67
- self.collection = self.vector_db.get_collection(
68
- self.vector_db_collection_name)
69
- dictionary = self.collection.get(include=['embeddings', 'metadatas'])
71
+ embedding_dict = defaultdict(lambda: defaultdict(list))
70
72
 
71
- embedding_dict = defaultdict(lambda: defaultdict(list))
73
+ for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
74
+ embedding_dict[metadata["type"]][metadata["id"]].append(embedding)
72
75
 
73
- for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
74
- embedding_dict[metadata["type"]][metadata["id"]].append(embedding)
76
+ embedding_size = dictionary['embeddings'].shape[1]
75
77
 
76
- embedding_size = dictionary['embeddings'].shape[1]
77
-
78
- pooling_strategy = self.embedding_pooling_strategy_type()
78
+ pooling_strategy = self.embedding_pooling_strategy_type()
79
79
 
80
80
  self.ner_df = CTakesNER().load(filename=self.ner_data_file) if self.ner_data_file else None
81
81
 
@@ -0,0 +1,109 @@
1
+ from collections import defaultdict
2
+ from enum import Enum
3
+ import numpy as np
4
+ import pandas as pd
5
+ from ddi_fw.datasets.core import BaseDataset
6
+ from ddi_fw.experiments.tensorflow_helper import TFMultiModal
7
+ from ddi_fw.experiments.pipeline import Experiment
8
+ from typing import Dict, List
9
+ from itertools import product
10
+
11
+ from ddi_fw.utils.enums import DrugBankTextDataTypes, UMLSCodeTypes
12
+ import mlflow
13
+ from ddi_fw.ner.ner import CTakesNER
14
+
15
def stack(df_column):
    """Join the arrays stored in ``df_column`` into a single ndarray.

    Args:
        df_column: Object exposing ``.values`` (presumably a pandas Series
            whose cells are equally shaped arrays -- confirm with callers).

    Returns:
        The result of ``np.stack`` over ``df_column.values``.
    """
    row_arrays = df_column.values
    return np.stack(row_arrays)
17
+
18
+
19
class NerParameterSearch:
    """Sweep NER thresholds column by column and feed the results to a model.

    ``build`` instantiates ``dataset_type`` once per (column, threshold) pair
    and gathers the inputs every dataset produces. ``run`` hands the gathered
    inputs to ``TFMultiModal`` under an MLflow experiment.
    """

    def __init__(self,
                 experiment_name,
                 experiment_description,
                 experiment_tags,
                 tracking_uri,
                 dataset_type: "BaseDataset",
                 umls_code_types: "List[UMLSCodeTypes]",
                 text_types: "List[DrugBankTextDataTypes]" = None,
                 min_threshold_dict: Dict[str, float] = None,
                 max_threshold_dict: Dict[str, float] = None,
                 increase_step=0.5,
                 ner_data_file=None,
                 combinations=None):
        """Store the search configuration.

        Args:
            experiment_name: MLflow experiment name used by ``run``.
            experiment_description: Stored as-is; not read by this class.
            experiment_tags: Tags applied when ``run`` creates the experiment.
            tracking_uri: MLflow tracking URI.
            dataset_type: Dataset *class*; ``build`` calls it with
                ``columns``, ``ner_df`` and the threshold keyword arguments.
            umls_code_types: Enum members whose ``value[0]`` is a column
                prefix, or ``None``.
            text_types: Enum members whose ``value[0]`` is a column suffix,
                or ``None`` (default) to sweep only the base columns.
            min_threshold_dict: Column name -> first threshold of the sweep.
                Columns without an entry start at 0.0.
            max_threshold_dict: Column name -> exclusive upper bound of the
                sweep. Columns without an entry stop at 0.0.
            increase_step: Distance between consecutive thresholds.
            ner_data_file: Optional file passed to ``CTakesNER().load``;
                when falsy no NER frame is loaded.
            combinations: Optional value forwarded to
                ``TFMultiModal.predict``; defaults to an empty list.
                NOTE(review): assumes ``predict`` accepts a list -- confirm.
        """
        self.experiment_name = experiment_name
        self.experiment_description = experiment_description
        self.experiment_tags = experiment_tags
        self.tracking_uri = tracking_uri

        self.dataset_type = dataset_type
        self.umls_code_types = umls_code_types
        self.text_types = text_types
        # A fresh mapping per instance: a default created in the signature
        # would be shared (and mutated) by every search object.
        self.min_threshold_dict = (
            defaultdict(float) if min_threshold_dict is None
            else min_threshold_dict)
        self.max_threshold_dict = (
            defaultdict(float) if max_threshold_dict is None
            else max_threshold_dict)
        self.increase_step = increase_step
        self.ner_data_file = ner_data_file
        self.combinations = [] if combinations is None else combinations

        # Filled in by build(); defined here so run() can detect a missing
        # or empty build instead of failing on an undefined attribute.
        self.datasets = {}
        self.items = []
        self.ner_df = None
        self.y_test_label = None
        self.train_idx_arr = None
        self.val_idx_arr = None

    @staticmethod
    def _threshold_for(thresholds, column):
        """Return ``thresholds[column]``, or 0.0 when the mapping has no entry."""
        try:
            return thresholds[column]
        except KeyError:
            return 0.0

    def build(self):
        """Create one dataset per (column, threshold) pair and collect inputs.

        Populates ``self.items``, ``self.datasets`` (keyed by the renamed
        item name), ``self.y_test_label``, ``self.train_idx_arr`` and
        ``self.val_idx_arr``.

        Raises:
            ValueError: If the configured ranges yield no input at all.
        """
        self.datasets = {}
        self.items = []
        columns = ['tui', 'cui', 'entities']
        if self.umls_code_types is not None and self.text_types is not None:
            # TODO: validate the enum members before building column names.
            umls_codes = [t.value[0] for t in self.umls_code_types]
            text_types = [t.value[0] for t in self.text_types]
            columns.extend(
                f'{code}_{text}'
                for code, text in product(umls_codes, text_types))
        print(f'Columns: {columns}')

        self.ner_df = (CTakesNER().load(filename=self.ner_data_file)
                       if self.ner_data_file else None)

        train_idx_arr = val_idx_arr = None
        for column in columns:
            min_threshold = self._threshold_for(
                self.min_threshold_dict, column)
            max_threshold = self._threshold_for(
                self.max_threshold_dict, column)
            kwargs = {
                'threshold_method': 'idf',
                'tui_threshold': 0,
                'cui_threshold': 0,
                'entities_threshold': 0,
            }

            # np.arange is half-open: max_threshold itself is never tried.
            for threshold in np.arange(min_threshold, max_threshold,
                                       self.increase_step):
                print(threshold)
                if column.startswith('tui'):
                    kwargs['tui_threshold'] = threshold
                elif column.startswith('cui'):
                    kwargs['cui_threshold'] = threshold
                elif column.startswith('entities'):
                    kwargs['entities_threshold'] = threshold

                dataset = self.dataset_type(
                    columns=[column],
                    ner_df=self.ner_df,
                    **kwargs)

                # Computing train_idx_arr / val_idx_arr once would actually
                # be enough; they are re-read on every iteration here.
                # NOTE(review): the 5th and 6th returned values are assigned
                # onto X_train.index / X_test.index, as in the original.
                (X_train, X_test, _y_train, _y_test,
                 X_train.index, X_test.index,
                 train_idx_arr, val_idx_arr) = dataset.load()

                group_items = dataset.produce_inputs()
                for item in group_items:
                    # Make the item name unique per threshold.
                    item[0] = f'threshold_{item[0]}_{threshold}'
                    self.datasets[item[0]] = dataset.ddis_df

                self.items.extend(group_items)

        if not self.items:
            raise ValueError(
                'No inputs were produced: every threshold range is empty or '
                'the datasets returned no items; check min_threshold_dict, '
                'max_threshold_dict and increase_step.')

        self.y_test_label = self.items[0][4]
        self.train_idx_arr = train_idx_arr
        self.val_idx_arr = val_idx_arr

    def run(self, model_func, batch_size=128, epochs=100):
        """Train/evaluate on the built inputs and return the result.

        Args:
            model_func: Forwarded to ``TFMultiModal``.
            batch_size: Forwarded to ``TFMultiModal``.
            epochs: Forwarded to ``TFMultiModal``.

        Returns:
            Whatever ``TFMultiModal.predict`` returns.

        Raises:
            ValueError: If ``build`` has not produced any input yet.
        """
        # Fail before contacting the tracking server when there is no data.
        if not self.items:
            raise ValueError('No inputs available; call build() first.')

        mlflow.set_tracking_uri(self.tracking_uri)

        is_new_experiment = (
            mlflow.get_experiment_by_name(self.experiment_name) is None)
        if is_new_experiment:
            mlflow.create_experiment(self.experiment_name)
        mlflow.set_experiment(self.experiment_name)
        # set_experiment_tags targets the *active* experiment, so it has to
        # run after set_experiment; otherwise the tags end up on whichever
        # experiment was active before.
        if is_new_experiment and self.experiment_tags:
            mlflow.set_experiment_tags(self.experiment_tags)

        y_test_label = self.items[0][4]
        multi_modal = TFMultiModal(
            model_func=model_func, batch_size=batch_size, epochs=epochs)
        multi_modal.set_data(
            self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
        result = multi_modal.predict(self.combinations)
        return result
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ddi_fw
3
- Version: 0.0.53
3
+ Version: 0.0.55
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -1,5 +1,5 @@
1
1
  ddi_fw/datasets/__init__.py,sha256=CqDrx7Ov83pXRh-n0ylembBmzhlW_yFWiheBcISrKdg,510
2
- ddi_fw/datasets/core.py,sha256=CFmnK0_cUxQAlTHjC2LEuKX0DNVAhGjSyQaV4jURYdI,18932
2
+ ddi_fw/datasets/core.py,sha256=ffza6yX3zvZV8Lp7but5f49J0837gZQKCSQ3iMBT6BE,19033
3
3
  ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
4
4
  ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
5
5
  ddi_fw/datasets/embedding_generator_new.py,sha256=GOE-Io6-DBwiUJSkgmxw9ZM1exCYYVu9KyP2dH3gf1o,7506
@@ -56,11 +56,12 @@ ddi_fw/drugbank/drugbank_parser.py,sha256=lxUuhB0s8ef_aPNDs0V8ClKF7-KIWugNIV9gVs
56
56
  ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9PVncGTXtk,18127
57
57
  ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
58
58
  ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
59
- ddi_fw/experiments/__init__.py,sha256=5tOuHtrypRdmJgE5E78YcySbjCdNSGkbY1H5DY_I7gw,149
59
+ ddi_fw/experiments/__init__.py,sha256=5L2xSolpFycNnflqOMdvJSiqRB16ExA5bbVGORKFX04,195
60
60
  ddi_fw/experiments/custom_torch_model.py,sha256=iQ_R_EApzD2JCcASN8cie6D21oh7VCxaOQ45_dkiGwc,2576
61
61
  ddi_fw/experiments/evaluation_helper.py,sha256=pY69cezV3WzrXw1bduIwRJfah1w3wXJ2YyTNim1J7ko,9349
62
- ddi_fw/experiments/pipeline.py,sha256=wHovtPbky1mEJCkC0xJnRDWKmPx9THrgKKN4ZnYQU_U,5296
62
+ ddi_fw/experiments/pipeline.py,sha256=wttkvdzGP9d3jC9nx2iZul4hbogXkRho6eDns0yfLiE,5380
63
63
  ddi_fw/experiments/pipeline_builder_pattern.py,sha256=q1PNEQFoO5U3UidEoGB8rgLA7KXr4FsJTXEug5c5UJg,5466
64
+ ddi_fw/experiments/pipeline_ner.py,sha256=JERKAaPdgKt2wjfVauOd3HXOGbLLoYLNxNCAv9CO_vg,4757
64
65
  ddi_fw/experiments/tensorflow_helper.py,sha256=Y-gD9qyqFFPl6HAvM_tIa5Y6em2YmafPCL1KMrK6eb8,11768
65
66
  ddi_fw/experiments/test.py,sha256=z1TfBpK75zGKpp2ZU8f6APjZlgBFthaCBN61YB9ma4o,2049
66
67
  ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
@@ -82,7 +83,7 @@ ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
82
83
  ddi_fw/utils/py7zr_helper.py,sha256=dgfHqXDBWys1hmd1JlHhYyZGxrzYWi6siYiUq3bnLuI,4698
83
84
  ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
84
85
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
85
- ddi_fw-0.0.53.dist-info/METADATA,sha256=ZS4o-QhleM0ShJGKifYQ9oa9o1WfQ98MUhDUJhwlL5k,1565
86
- ddi_fw-0.0.53.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
87
- ddi_fw-0.0.53.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
88
- ddi_fw-0.0.53.dist-info/RECORD,,
86
+ ddi_fw-0.0.55.dist-info/METADATA,sha256=736seAJsPdjZQPhFly5pkPPGi7SMWr6XqNgUKKRhC2I,1565
87
+ ddi_fw-0.0.55.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
88
+ ddi_fw-0.0.55.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
89
+ ddi_fw-0.0.55.dist-info/RECORD,,