ddi-fw 0.0.84__py3-none-any.whl → 0.0.87__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/datasets/core.py CHANGED
@@ -24,11 +24,11 @@ def stack(df_column):
 class BaseDataset(ABC):
     def __init__(self,
                  embedding_size,
-                 embedding_dict,
-                 embeddings_pooling_strategy: PoolingStrategy,
-                 ner_df,
-                 chemical_property_columns,
-                 embedding_columns,
+                 embedding_dict,
+                 embeddings_pooling_strategy: PoolingStrategy,
+                 ner_df,
+                 chemical_property_columns,
+                 embedding_columns,
                  ner_columns,
                  **kwargs):
         self.embedding_size = embedding_size
@@ -409,6 +409,7 @@ class BaseDataset(ABC):
             x_fnc, args=(embeddings_after_pooling,), axis=1)

         self.dataframe = self.ddis_df.copy()
+        self.dataframe['class_as_txt'] = labels
         self.dataframe['class'] = list(classes)
         print(self.dataframe.shape)

@@ -436,3 +437,11 @@ class BaseDataset(ABC):
         self.val_idx_arr = val_idx_arr

         return self.X_train, self.X_test, self.y_train, self.y_test, self.X_train.index, self.X_test.index, train_idx_arr, val_idx_arr
+
+    def export_as_csv(self, output_file_path, not_change: list):
+        copy = self.dataframe.copy()
+        for col in copy.columns:
+            if col not in not_change:
+                copy[col] = [
+                    '[' + ','.join(f"{value:.3f}" for value in row) + ']' for row in copy[col]]
+        copy.to_csv(output_file_path, index=False)
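The new BaseDataset.export_as_csv flattens every vector-valued column not listed in not_change into a bracketed string of 3-decimal floats before writing the CSV. A minimal usage sketch; the column names here are illustrative, not taken from the package:

    # Hypothetical call: 'class' and 'class_as_txt' are written verbatim,
    # every other column is serialized as "[0.123,0.456,...]".
    # Non-numeric columns must therefore appear in not_change.
    dataset.export_as_csv('ddis_export.csv', not_change=['class', 'class_as_txt'])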
ddi_fw/datasets/ddi_mdl/base.py CHANGED
@@ -35,7 +35,7 @@ class DDIMDLDataset(BaseDataset):
                  ner_columns=[],
                  **kwargs):
         columns = kwargs['columns']
-        if columns is not None:
+        if columns:
             chemical_property_columns = []
             embedding_columns=[]
             ner_columns=[]
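The guard change is behavioral, not cosmetic: an empty list passed as columns is not None but is falsy, so under the old check it would still wipe the three column lists. A quick illustration:

    # [] passes `is not None` but fails a truthiness check, so an empty
    # columns list no longer clears the column defaults.
    for columns in (None, [], ['smile']):
        print(columns, columns is not None, bool(columns))
    # None      False  False
    # []        True   False   <- the old check reset the lists here; the new one does not
    # ['smile'] True   True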
ddi_fw/ml/ml_helper.py CHANGED
@@ -74,7 +74,8 @@ class MultiModalRunner:
                 self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
             logs, metrics, prediction = single_modal.predict()
             # self.result.add_log(item[0], logs)
-            # self.result.add_metric(item[0], metrics)
+            #Check
+            self.result.add_metric(item[0], metrics)
             single_results[item[0]] = prediction
             # sum = sum + prediction

ddi_fw/pipeline/__init__.py CHANGED
@@ -1,3 +1,4 @@
 from .pipeline import Pipeline
 from .multi_pipeline import MultiPipeline
-from .multi_modal_combination_strategy import CombinationStrategy,CustomCombinationStrategy
+from .multi_modal_combination_strategy import CombinationStrategy,CustomCombinationStrategy
+from .ner_pipeline import NerParameterSearch
ddi_fw/pipeline/multi_modal_combination_strategy.py CHANGED
@@ -9,8 +9,8 @@ class CombinationStrategy():
 class CustomCombinationStrategy(CombinationStrategy):
     def __init__(self, **kwargs_combination_params):
         # kwargs could also be passed to the function
-        self.group1 = kwargs_combination_params.get("group_1", None)
-        self.group2 = kwargs_combination_params.get("group_2", None)
+        self.group_1 = kwargs_combination_params.get("group_1", None)
+        self.group_2 = kwargs_combination_params.get("group_2", None)

     def generate(self):
         # Handle edge cases
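The rename makes the instance attributes match the group_1/group_2 keyword keys they are read from. A short sketch of the corrected surface; the group contents are illustrative:

    # Attributes now mirror the kwarg names, so callers and generate() agree.
    strategy = CustomCombinationStrategy(group_1=['smile'], group_2=['tui', 'cui'])
    print(strategy.group_1, strategy.group_2)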
ddi_fw/pipeline/multi_pipeline.py CHANGED
@@ -1,5 +1,6 @@
 import json
 from ddi_fw.pipeline import Pipeline
+from ddi_fw.pipeline import NerParameterSearch
 import importlib


@@ -46,41 +47,41 @@ class MultiPipeline():
         self.pipeline_resuts = dict()

     def __create_pipeline(self, config):
-        library = config["library"]
-        batch_size = config["batch_size"]
-        epochs = config["epochs"]
-
-        # dataset_module = config["dataset_module"]
-        # dataset_name = config["dataset_name"]
-
-        experiment_name = config["experiment_name"]
-        experiment_description = config["experiment_description"]
-        experiment_tags = config["experiment_tags"]
-        tracking_uri = config["tracking_uri"]
-        artifact_location = config["artifact_location"]
-        columns = config["columns"]
-        ner_data_file = config["ner_data_file"]
-        ner_threshold = config["ner_threshold"]
-        vector_db_persist_directory = config["vector_db_persist_directory"]
-        vector_db_collection_name = config["vector_db_collection_name"]
+        type = config.get("type")
+        library = config.get("library")
+        batch_size = config.get("batch_size")
+        epochs = config.get("epochs")
+
+        # dataset_module = config.get("dataset_module")
+        # dataset_name = config.get("dataset_name")
+
+        experiment_name = config.get("experiment_name")
+        experiment_description = config.get("experiment_description")
+        experiment_tags = config.get("experiment_tags")
+        tracking_uri = config.get("tracking_uri")
+        artifact_location = config.get("artifact_location")
+        columns = config.get("columns")
+        ner_data_file = config.get("ner_data_file")
+        ner_threshold = config.get("ner_threshold")
+        vector_db_persist_directory = config.get("vector_db_persist_directory")
+        vector_db_collection_name = config.get("vector_db_collection_name")
         embedding_pooling_strategy = get_import(
-            config["embedding_pooling_strategy_type"])
+            config.get("embedding_pooling_strategy_type"))
         # Dynamically import the model and dataset classes
-        model_type = get_import(config["model_type"])
-        dataset_type = get_import(config["dataset_type"])
-        combination_type = get_import(config["combination_strategy"]["type"])
-        kwargs_combination_params = config["combination_strategy"]["params"]
-
+        model_type = get_import(config.get("model_type"))
+        dataset_type = get_import(config.get("dataset_type"))
+        combination_type = get_import(config.get("combination_strategy").get("type"))
+        kwargs_combination_params = config.get("combination_strategy").get("params")
+        combinations = []
+        if combination_type is not None:
+            combinations = combination_type(**kwargs_combination_params).generate()
         # # Instantiate the classes
         # model_instance = model_class()
         # dataset_instance = dataset_class()
-        return {
-            "name": experiment_name,
-            "library": library,
-            "batch_size": batch_size,
-            "epochs": epochs,
-            "model_type": model_type,
-            "pipeline": Pipeline(
+
+        pipeline = None
+        if type == "general":
+            pipeline = Pipeline(
                 library=library,
                 experiment_name=experiment_name,
                 experiment_description=experiment_description,
@@ -94,7 +95,28 @@ class MultiPipeline():
                 embedding_pooling_strategy_type=embedding_pooling_strategy,
                 ner_data_file=ner_data_file,
                 ner_threshold=ner_threshold,
-                combinations=combination_type(**kwargs_combination_params).generate())}
+                combinations=combinations)
+        elif type== "ner_search":
+            pipeline = NerParameterSearch(
+                experiment_name=experiment_name,
+                experiment_description=experiment_description,
+                experiment_tags=experiment_tags,
+                tracking_uri=tracking_uri,
+                dataset_type=dataset_type,
+                umls_code_types = None,
+                text_types = None,
+                columns=['tui', 'cui', 'entities'],
+                ner_data_file=ner_data_file,
+            )
+
+
+        return {
+            "name": experiment_name,
+            "library": library,
+            "batch_size": batch_size,
+            "epochs": epochs,
+            "model_type": model_type,
+            "pipeline": pipeline}

     def build(self):
         for config in self.experiments_config['experiments']:
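For orientation, here is a hypothetical experiments-config entry shaped the way the refactored __create_pipeline reads it; the switch from config[...] to config.get(...) means any key left out is now read as None instead of raising KeyError. All values below are illustrative:

    config = {
        "type": "general",  # "ner_search" builds a NerParameterSearch instead
        "library": "tensorflow",
        "batch_size": 128,
        "epochs": 100,
        "experiment_name": "ddi-demo",
        "experiment_description": "demo run",
        "experiment_tags": {"stage": "dev"},
        "tracking_uri": "http://localhost:5000",
        "model_type": "some.module.Model",  # dotted paths resolved via get_import
        "dataset_type": "some.module.SomeDataset",
        "combination_strategy": {
            "type": "ddi_fw.pipeline.CustomCombinationStrategy",
            "params": {"group_1": ["smile"], "group_2": ["tui", "cui"]},
        },
    }

Note that combination_strategy is still read with chained .get calls, so that one key must be present; if it is missing, .get("type") is called on None and raises.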
ddi_fw/pipeline/ner_pipeline.py ADDED
@@ -0,0 +1,128 @@
+from collections import defaultdict
+import numpy as np
+from ddi_fw.datasets.core import BaseDataset
+from ddi_fw.datasets.idf_helper import IDF
+from typing import Dict, List
+from itertools import product
+
+from ddi_fw.ml.ml_helper import MultiModalRunner
+from ddi_fw.utils.enums import DrugBankTextDataTypes, UMLSCodeTypes
+import mlflow
+from ddi_fw.ner.ner import CTakesNER
+
+
+def stack(df_column):
+    return np.stack(df_column.values)
+
+
+class NerParameterSearch:
+    def __init__(self,
+                 experiment_name,
+                 experiment_description,
+                 experiment_tags,
+                 tracking_uri,
+                 dataset_type: BaseDataset,
+                 ner_data_file,
+                 columns: list,
+                 umls_code_types: List[UMLSCodeTypes],
+                 text_types=List[DrugBankTextDataTypes],
+                 min_threshold_dict: Dict[str, float] = defaultdict(float),
+                 max_threshold_dict: Dict[str, float] = defaultdict(float),
+                 increase_step=0.5):
+        self.experiment_name = experiment_name
+        self.experiment_description = experiment_description
+        self.experiment_tags = experiment_tags
+        self.tracking_uri = tracking_uri
+
+        self.dataset_type = dataset_type
+        self.ner_data_file = ner_data_file
+        self.columns = columns
+        self.umls_code_types = umls_code_types
+        self.text_types = text_types
+
+        self.min_threshold_dict = min_threshold_dict
+        self.max_threshold_dict = max_threshold_dict
+        self.increase_step = increase_step
+
+    def build(self):
+        self.datasets = {}
+        self.items = []
+        # columns = ['tui', 'cui', 'entities']
+        if self.umls_code_types is not None and self.text_types is not None:
+            # add checking statements
+            _umls_codes = [t.value[0] for t in self.umls_code_types]
+            _text_types = [t.value[0] for t in self.text_types]
+            _columns = [f'{item[0]}_{item[1]}' for item in product(
+                _umls_codes, _text_types)]
+            self.columns.extend(_columns)
+        print(f'Columns: {self.columns}')
+        self.ner_df = CTakesNER().load(
+            filename=self.ner_data_file) if self.ner_data_file else None
+
+        if not self.min_threshold_dict or not self.max_threshold_dict:
+            idf2 = IDF(self.ner_df, self.columns)
+            idf2.calculate()
+            # df = pd.DataFrame.from_dict(idf2.idf_scores)
+            df = idf2.to_dataframe()
+            import math
+            self.min_threshold_dict = {key: math.floor(
+                df.describe()[key]['min']) for key in df.describe().keys()}
+            self.max_threshold_dict = {key: math.ceil(
+                df.describe()[key]['max']) for key in df.describe().keys()}
+
+        for column in self.columns:
+            min_threshold = self.min_threshold_dict[column]
+            max_threshold = self.max_threshold_dict[column]
+            kwargs = {}
+            kwargs['threshold_method'] = 'idf'
+            kwargs['tui_threshold'] = 0
+            kwargs['cui_threshold'] = 0
+            kwargs['entities_threshold'] = 0
+
+            for threshold in np.arange(min_threshold, max_threshold, self.increase_step):
+                print(threshold)
+                if column.startswith('tui'):
+                    kwargs['tui_threshold'] = threshold
+                if column.startswith('cui'):
+                    kwargs['cui_threshold'] = threshold
+                if column.startswith('entities'):
+                    kwargs['entities_threshold'] = threshold
+                dataset = self.dataset_type(
+                    # chemical_property_columns=[],
+                    # embedding_columns=[],
+                    # ner_columns=[column],
+                    columns=[column],
+                    ner_df=self.ner_df,
+                    embedding_size=None,
+                    embedding_dict=None,
+                    embeddings_pooling_strategy=None,
+                    **kwargs)
+
+                # computing train_idx_arr and val_idx_arr once would actually be enough
+                X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = dataset.load()
+                group_items = dataset.produce_inputs()
+                for item in group_items:
+                    # item[0] = f'threshold_{threshold}_{item[0]}'
+                    item[0] = f'threshold_{item[0]}_{threshold}'
+                    self.datasets[item[0]] = dataset.ddis_df
+
+                self.items.extend(group_items)
+        self.y_test_label = self.items[0][4]
+        self.train_idx_arr = train_idx_arr
+        self.val_idx_arr = val_idx_arr
+
+    def run(self, model_func, batch_size=128, epochs=100):
+        mlflow.set_tracking_uri(self.tracking_uri)
+
+        if mlflow.get_experiment_by_name(self.experiment_name) == None:
+            mlflow.create_experiment(self.experiment_name)
+            mlflow.set_experiment_tags(self.experiment_tags)
+        mlflow.set_experiment(self.experiment_name)
+
+        y_test_label = self.items[0][4]
+        multi_modal_runner = MultiModalRunner(
+            library=self.library, model_func=model_func, batch_size=batch_size, epochs=epochs)
+        multi_modal_runner.set_data(
+            self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
+        result = multi_modal_runner.predict()
+        return result
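A minimal sketch of driving the new class directly; the dataset class, file path, and model factory are placeholders, and run() reads self.library, which the constructor shown above never sets, so the sketch assigns it by hand:

    # Hypothetical driver for NerParameterSearch; names below are assumptions.
    from ddi_fw.pipeline import NerParameterSearch

    def model_func(*args, **kwargs):  # user-supplied model factory (stub)
        raise NotImplementedError

    search = NerParameterSearch(
        experiment_name='ner-threshold-sweep',
        experiment_description='sweep NER thresholds per column',
        experiment_tags={'stage': 'dev'},
        tracking_uri='http://localhost:5000',
        dataset_type=SomeDataset,   # a BaseDataset subclass, e.g. DDIMDLDataset
        ner_data_file='ner.pkl',
        columns=['tui', 'cui', 'entities'],
        umls_code_types=None,
        text_types=None)
    search.build()
    search.library = 'tensorflow'  # run() needs self.library; __init__ does not set it
    result = search.run(model_func, batch_size=128, epochs=100)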
ddi_fw/pipeline/pipeline.py CHANGED
@@ -12,7 +12,7 @@ from ddi_fw.ml import MultiModalRunner

 class Pipeline:
     def __init__(self,
-                 library='TF',
+                 library='tensorflow',
                  experiment_name=None,
                  experiment_description=None,
                  experiment_tags=None,
@@ -68,6 +68,7 @@ class Pipeline:
             embedding_size = dictionary['embeddings'].shape[1]
         else:
             embedding_dict = self.embedding_dict
+            #TODO make generic
             embedding_size = list(embedding_dict['all_text'].values())[
                 0][0].shape

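The line under the new #TODO hard-wires a specific nesting for embedding_dict, which is why it is flagged for generalization. A sketch of the shape it assumes; the key names and vector size are hypothetical, inferred from the indexing:

    import numpy as np
    # Assumed structure: {column: {drug_id: [vector, ...]}}, keyed to 'all_text'.
    embedding_dict = {'all_text': {'DB00001': [np.zeros(768)]}}
    embedding_size = list(embedding_dict['all_text'].values())[0][0].shape  # (768,)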
{ddi_fw-0.0.84.dist-info → ddi_fw-0.0.87.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddi_fw
-Version: 0.0.84
+Version: 0.0.87
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
{ddi_fw-0.0.84.dist-info → ddi_fw-0.0.87.dist-info}/RECORD RENAMED
@@ -1,11 +1,11 @@
 ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,351
-ddi_fw/datasets/core.py,sha256=lGVP2P8CIeSEG5fH230XV8bLoycblJxBQKYbdMSBITM,19021
+ddi_fw/datasets/core.py,sha256=cL_H7-osGTNG5W8X8LLpIcSJ-GUXoI3LjNwvffmEGzA,19452
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
 ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
 ddi_fw/datasets/feature_vector_generation.py,sha256=EImavcALxkIB0YG_smOzagMNzuWMbK9SaWSKwARx_qU,3254
 ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
-ddi_fw/datasets/ddi_mdl/base.py,sha256=ZW8uJIvEizK2x_VkoyhNYcKh3ki3kQRsKxl8d2_hVYQ,4249
+ddi_fw/datasets/ddi_mdl/base.py,sha256=45cUmDRyyD8CC07oj5Dka2DWfgWU4Qi7-Am0vCvRKbo,4237
 ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
 ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
 ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt,sha256=XVlDqYATckrQwNSXqMSKVBqyoN_Hg8SK6CL-XMdLADY,102176
@@ -67,17 +67,18 @@ ddi_fw/langchain/embeddings.py,sha256=8J_SfO9pyET2W-Ltzq0_r9EchFzBsYdUabiOMma42U
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
 ddi_fw/langchain/storage.py,sha256=uy5clVB07So2eFbRGdAKzHIPdfEk4se33cPktis7Aa4,2716
 ddi_fw/ml/__init__.py,sha256=0YubqmEpJKp3OfqlLKkD5N9L6WDWew3QEtnbdY3mqKg,180
-ddi_fw/ml/ml_helper.py,sha256=ibHVWMfCfC4jePRDsI8kC6e1e1x_ybOmigy6hkHOrAs,4470
+ddi_fw/ml/ml_helper.py,sha256=8ll5cMfcHUfwPhm8Gbmy7UQA91SRuf3MKoTaXTSzunY,4492
 ddi_fw/ml/model_wrapper.py,sha256=ZExnsLMjHKL3BaI4aKkbyWTp8vbswLeF2_T3cZ73YpQ,1144
 ddi_fw/ml/pytorch_wrapper.py,sha256=YdwzR5qAHFNajYB_elFqDhVKRLeajaRpopNzyQ6gIIA,3725
 ddi_fw/ml/tensorflow_wrapper.py,sha256=pSeiJDuaLf9MhZVlLuLJBA-LH-H-Dl2TyYbB39iGsto,5748
 ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
 ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6859
 ddi_fw/ner/ner.py,sha256=BEs9AFljAxOQrC2BEP1raSzRoypcfELS5UTdl4bjTqw,15863
-ddi_fw/pipeline/__init__.py,sha256=1oLf4sGCwyLc_zPsZDRkEcpjtJJP09Y1EUv61PqmJrw,166
-ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=YkPixHVo9-4SPkY8VaWvBe1aaI5IiV4oZT4kBrm2WHQ,1635
-ddi_fw/pipeline/multi_pipeline.py,sha256=7PqeafCD--UUI7Xg2att9DdaB7b-ZRGPQY-E5F8qlgU,4529
-ddi_fw/pipeline/pipeline.py,sha256=k2LevGe7e5kRWraKvy-8i312zd6w5VfFHzFs3EQ9F1E,5519
+ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
+ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=qIst7vxHaOAhRv4lgozszwa3b1QE4aIrN74t41Xnvr4,1637
+ddi_fw/pipeline/multi_pipeline.py,sha256=ZSVBR5UJIgCh1sLaTDAE_F4u7cyWyOjYTqvdN8uTPnI,5425
+ddi_fw/pipeline/ner_pipeline.py,sha256=bf9amT8I-Ed9cdudWzPOpI6-X0oLgg8O37GJMSyX_bo,5527
+ddi_fw/pipeline/pipeline.py,sha256=NvRTHcccIZU-N17dgZRPkO0TCjpzj932cMIsmpaj7Qs,5559
 ddi_fw/test/basic_test.py,sha256=fEOGcZm1ObnsDvMiXNmdmz6YCeUrGc8V0DwlSwGhsq8,376
 ddi_fw/test/combination_test.py,sha256=TWNE8sf-DSh1Q9-yRaRBc774Sn1kSMGXLwQhd2_Qynk,324
 ddi_fw/test/compress_json_test.py,sha256=BGny56YqiG-pzhMoDzLKQBQI1E7o3jU0S7VYWtclAx4,1045
@@ -94,7 +95,7 @@ ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.84.dist-info/METADATA,sha256=8NTw5XSYOQa6b9yzlC7O2AanU1rIxtOi9j0WQ1J_Xvo,1966
-ddi_fw-0.0.84.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-ddi_fw-0.0.84.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
-ddi_fw-0.0.84.dist-info/RECORD,,
+ddi_fw-0.0.87.dist-info/METADATA,sha256=qAB0NEo8r0O-l27PecHa6fY1KOrSFSjbEO2bNF3-xZg,1966
+ddi_fw-0.0.87.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ddi_fw-0.0.87.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.87.dist-info/RECORD,,
{ddi_fw-0.0.84.dist-info → ddi_fw-0.0.87.dist-info}/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.3.0)
+Generator: setuptools (75.5.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
