ddi-fw 0.0.149__py3-none-any.whl → 0.0.151__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/ner/ner.py CHANGED
@@ -1,11 +1,14 @@
+ from abc import ABC, abstractmethod
  from collections import defaultdict
  import glob
  import json
  from pathlib import Path
  import pathlib
  from time import sleep
+ from typing import List, Optional
  import pandas as pd

+ from pydantic import BaseModel, Field, HttpUrl
  from tqdm import tqdm
  import os
  import requests
@@ -33,19 +36,67 @@ from ddi_fw.utils import create_folder_if_not_exists
  HERE = pathlib.Path(__file__).resolve().parent


- class CTakesNER:
-     def __init__(self, drugs_df = None,api_url= 'http://localhost:8080/ctakes-web-rest/service/analyze?pipeline=Default'
-                  , output_path='ner-output/ctakes', ids=[],
-                  columns=[]):
-         self.drugs_df = drugs_df
-         self.api_url = api_url
-         self.columns = columns
-         self.ids = ids
-         self.output_path = output_path
-
-     def run(self,
+ class NERInterface(ABC):
+     """
+     An abstract base class to define the interface for Named Entity Recognition (NER).
+     """
+
+     @abstractmethod
+     def run(self, run_for=[]):
+         """
+         Run the NER process.
+         :param run_for: A list of columns to process.
+         """
+         pass
+
+ class CTakesNER(BaseModel,NERInterface):
+     """
+     A class to perform Named Entity Recognition (NER) using the cTAKES API.
+     Attributes:
+         df (pd.DataFrame): The input dataframe containing data to be processed.
+         key (str): The key column in the dataframe, default is 'drugbank_id'.
+         api_url (str): The URL of the cTAKES API, default is 'http://localhost:8080/ctakes-web-rest/service/analyze?pipeline=Default'.
+         output_path (str): The path to save the NER output, default is 'ner-output/ctakes'.
+         ids (list): A list of IDs to exclude from processing, default is an empty list.
+         columns (list): A list of columns in the dataframe to process, default is an empty list.
+     Methods:
+         run(run_for=[]):
+             Runs the NER process for the specified columns.
+         load(filename=None, group=True):
+             Loads the NER results from a pickle file.
+         create_dataframe(override=False):
+             Creates a dataframe from the NER results and saves it as a pickle file.
+     """
+     # def __init__(self, df: pd.DataFrame,
+     #              key: str = 'drugbank_id',
+     #              api_url: str = 'http://localhost:8080/ctakes-web-rest/service/analyze?pipeline=Default',
+     #              output_path: str = 'ner-output/ctakes', ids: list = [],
+     #              columns: list = []):
+     #     self.df = df
+     #     self.key = key
+     #     self.api_url = api_url
+     #     self.columns = columns
+     #     self.ids = ids
+     #     self.output_path = output_path
+
+     df: Optional[pd.DataFrame]
+     key: str = 'drugbank_id'
+     api_url: str = 'http://localhost:8080/ctakes-web-rest/service/analyze?pipeline=Default'
+     output_path: str = 'ner-output/ctakes'
+     ids: List[str] = Field(default_factory=list)
+     columns: List[str] = Field(default_factory=list)
+
+     class Config:
+         arbitrary_types_allowed = True
+
+     def run(self,
              run_for=[]):
-
+         """
+         Run the NER process.
+         :param run_for: A list of columns to process.
+         """
+         if self.df is None:
+             raise ValueError('Dataframe is not provided')
          for column in self.columns:
              if not os.path.exists(self.output_path+"/"+column):
                  os.makedirs(self.output_path+"/"+column)
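The refactor above replaces the hand-written `__init__` with a Pydantic model (`arbitrary_types_allowed` admits the `pd.DataFrame` field). A minimal usage sketch, assuming a cTAKES server at the default `api_url`; the dataframe contents here are invented:

```python
import pandas as pd
from ddi_fw.ner.ner import CTakesNER

# Invented sample data; 'drugbank_id' matches the default key field.
drugs = pd.DataFrame({
    'drugbank_id': ['DB00001', 'DB00002'],
    'description': ['A thrombin inhibitor.', 'A recombinant antibody.'],
})

ner = CTakesNER(
    df=drugs,                 # Optional[pd.DataFrame] with no default, so df must be passed
    key='drugbank_id',        # key column used to name the per-row JSON output files
    columns=['description'],  # dataframe columns to send through cTAKES
)
ner.run()                     # raises ValueError when df is None
```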
@@ -55,12 +106,14 @@ class CTakesNER:
                  continue
              # not include
              if self.ids:
-                 self.drugs_df = self.drugs_df[~self.drugs_df['drugbank_id'].isin(
+                 self.df = self.df[~self.df[self.key].isin(
                      self.ids)]
-             for index, row in self.drugs_df.iterrows():
-                 drugbank_id = row['drugbank_id']
+             for index, row in self.df.iterrows():
+                 drugbank_id = row[self.key]
                  data = row[column]
-                 if data is None or pd.isna(data) or (type(data) == str and len(data.strip()) == 0):  # or len(data) == 0:
+                 # or len(data) == 0:
+                 if data is None or (isinstance(data, pd.Series) and data.isna().any()) or (isinstance(data, str) and len(data.strip()) == 0):
+                     # if data is None or pd.isna(data) or (type(data) == str and len(data.strip()) == 0):
                      with open(f'{column_output_path}/{drugbank_id}.json', 'w', encoding='utf-8') as f:
                          json.dump([], f, ensure_ascii=False, indent=4)
                      continue
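The reworked guard sidesteps a pitfall of the old one: when a cell holds a list-like value, `pd.isna(data)` returns an array, and evaluating that inside the `or` chain raises "The truth value of a Series is ambiguous". A small illustration of the difference, not taken from the package:

```python
import pandas as pd

cell = pd.Series(['a', None])   # a list-like cell value
# bool(pd.isna(cell)) -> ValueError: truth value of a Series is ambiguous
# The new guard branches on the cell's type before asking about missing values:
if isinstance(cell, pd.Series) and cell.isna().any():
    print('treated as empty')   # taken: the Series contains a missing value
```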
@@ -79,8 +132,9 @@ class CTakesNER:
              # if index % 10 == 0:
              #     sleep(10)

-     def load(self, filename = None, group = True):
-         file_path= filename if filename else HERE.joinpath('output/ctakes/ctakes_ner.pkl')
+     def load(self, filename=None, group=True):
+         file_path = filename if filename else HERE.joinpath(
+             'output/ctakes/ctakes_ner.pkl')
          df = pd.read_pickle(file_path)

          if group:
@@ -92,24 +146,28 @@ class CTakesNER:

              tui_columns = [key for key in keys if key.startswith('tui')]
              cui_columns = [key for key in keys if key.startswith('cui')]
-             entities_columns = [key for key in keys if key.startswith('entities')]
-             #solve this with a single assignment
-             df['tui'] = df[tui_columns].values.tolist()
-             df['tui'] = df['tui'].apply(lambda items:{i for item in items for i in item})
+             entities_columns = [
+                 key for key in keys if key.startswith('entities')]
+             # solve this with a single assignment
+             df['tui'] = df[tui_columns].values.tolist()
+             df['tui'] = df['tui'].apply(
+                 lambda items: {i for item in items for i in item})

-             df['cui'] = df[cui_columns].values.tolist()
-             df['cui'] = df['cui'].apply(lambda items:{i for item in items for i in item})
+             df['cui'] = df[cui_columns].values.tolist()
+             df['cui'] = df['cui'].apply(
+                 lambda items: {i for item in items for i in item})

-             df['entities'] = df[entities_columns].values.tolist()
-             df['entities'] = df['entities'].apply(lambda items:{i for item in items for i in item})
+             df['entities'] = df[entities_columns].values.tolist()
+             df['entities'] = df['entities'].apply(
+                 lambda items: {i for item in items for i in item})

          return df

-     def create_dataframe(self, override = False):  # dataframe_columns=[]
-         filename='ctakes_ner.pkl'
+     def create_dataframe(self, override=False):  # dataframe_columns=[]
+         filename = 'ctakes_ner.pkl'
          if not override and os.path.exists(self.output_path+"/" + filename):
              return self.load(self.output_path+"/" + filename)
-
+
          create_folder_if_not_exists(self.output_path+"/" + filename)
          dict_of_dict = defaultdict(dict)
          for column in self.columns:
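The grouped load in the hunk above still merges the per-column `cui`/`tui`/`entities` lists into one set per drug; a one-liner showing what that flattening lambda does:

```python
# Each row cell holds one list per source column; the lambda flattens them into a set.
items = [['C001', 'C002'], ['C002', 'C003']]   # e.g. cui_description, cui_indication
print({i for item in items for i in item})     # {'C001', 'C002', 'C003'}
```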
@@ -123,9 +181,9 @@ class CTakesNER:
                  cuis = []
                  tuis = []
                  if data is None or len(data) == 0:
-                     t['drugbank_id'] = file_name
+                     t[self.key] = file_name
                      t[f'cui_{column}'] = []
-                     t[f'tui_{column}']= []
+                     t[f'tui_{column}'] = []
                      t[f'entities_{column}'] = []
                      dict_of_dict[file_name] = t
                      continue
@@ -136,9 +194,9 @@ class CTakesNER:
                  tuis = [attr['tui']
                          for v in value for attr in v['conceptAttributes']]
                  # codingScheme
-
-                 if 'drugbank_id' not in t:
-                     t['drugbank_id'] = file_name
+
+                 if self.key not in t:
+                     t[self.key] = file_name
                  t[f'cui_{column}'] = cuis
                  t[f'tui_{column}'] = tuis
                  t[f'entities_{column}'] = entities
@@ -149,11 +207,7 @@ class CTakesNER:
              # columns=columns
          )
          df.to_pickle(self.output_path+"/" + filename)
-         # dataframe_columns.insert(0, 'drugbank_id')
-
-         # new_columns = {columns[i]: dataframe_columns[i]
-         #                for i in range(len(columns))}
-         # df.rename(columns=new_columns, inplace=True)
+
          return df

@@ -1,8 +1,10 @@
+ from abc import ABC, abstractmethod
  import itertools


- class CombinationStrategy():
-     def generate(self):
+ class CombinationStrategy(ABC):
+     @abstractmethod
+     def generate(self) -> list:
          pass

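`CombinationStrategy` is now a proper ABC, so `generate()` must be overridden before the class can be instantiated. A hypothetical concrete strategy (the subclass name and fields are invented here, not part of the package) could look like:

```python
import itertools
from abc import ABC, abstractmethod

class CombinationStrategy(ABC):
    @abstractmethod
    def generate(self) -> list:
        pass

# Hypothetical subclass, for illustration only:
class PairwiseCombinationStrategy(CombinationStrategy):
    def __init__(self, items: list):
        self.items = items

    def generate(self) -> list:
        # every unordered pair of the configured items
        return list(itertools.combinations(self.items, 2))

print(PairwiseCombinationStrategy(['cui', 'tui', 'entities']).generate())
# [('cui', 'tui'), ('cui', 'entities'), ('tui', 'entities')]
```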
@@ -49,11 +49,6 @@ class MultiPipeline():
      def __create_pipeline(self, config):
          type = config.get("type")
          library = config.get("library")
-         # batch_size = config.get("batch_size")
-         # epochs = config.get("epochs")
-
-         # dataset_module = config.get("dataset_module")
-         # dataset_name = config.get("dataset_name")

          experiment_name = config.get("experiment_name")
          experiment_description = config.get("experiment_description")
@@ -82,9 +77,7 @@ class MultiPipeline():
          combinations = []
          if combination_type is not None:
              combinations = combination_type(**kwargs_combination_params).generate()
-         # # Instantiate the classes
-         # model_instance = model_class()
-         # dataset_instance = dataset_class()
+

          pipeline = None
          if type == "general":
@@ -117,15 +110,13 @@ class MultiPipeline():
              text_types = None,
              columns=['tui', 'cui', 'entities'],
              ner_data_file=ner_data_file,
+             multi_modal= multi_modal
          )


          return {
              "name": experiment_name,
              "library": library,
-             # "batch_size": batch_size,
-             # "epochs": epochs,
-             # "model_type": model_type,
              "pipeline": pipeline}

  def build(self):
@@ -138,10 +129,6 @@ class MultiPipeline():
138
129
  for item in self.items:
139
130
  print(f"{item['name']} is running")
140
131
  pipeline = item['pipeline']
141
- # model_type = item['model_type']
142
- # batch_size = item['batch_size']
143
- # epochs = item['epochs']
144
- # It can be moved to build function
145
132
  pipeline.build()
146
133
  result = pipeline.run()
147
134
  self.pipeline_resuts[item['name']] = result
@@ -2,7 +2,7 @@ from collections import defaultdict
  import numpy as np
  from ddi_fw.datasets.core import BaseDataset
  from ddi_fw.datasets.idf_helper import IDF
- from typing import Dict, List
+ from typing import Any, Dict, List, Optional
  from itertools import product

  from ddi_fw.ml.ml_helper import MultiModalRunner
@@ -18,6 +18,7 @@ def stack(df_column):
  class NerParameterSearch:
      def __init__(self,
                   library,
+                  multi_modal,
                   experiment_name,
                   experiment_description,
                   experiment_tags,
@@ -25,12 +26,13 @@ class NerParameterSearch:
                   dataset_type: BaseDataset,
                   ner_data_file,
                   columns: list,
-                  umls_code_types: List[UMLSCodeTypes],
-                  text_types=List[DrugBankTextDataTypes],
+                  umls_code_types: List[UMLSCodeTypes]|None,
+                  text_types:List[DrugBankTextDataTypes]|None,
                   min_threshold_dict: Dict[str, float] = defaultdict(float),
                   max_threshold_dict: Dict[str, float] = defaultdict(float),
                   increase_step=0.5):
          self.library = library
+         self.multi_modal = multi_modal
          self.experiment_name = experiment_name
          self.experiment_description = experiment_description
          self.experiment_tags = experiment_tags
@@ -47,6 +49,8 @@ class NerParameterSearch:
          self.increase_step = increase_step

      def build(self):
+         if not isinstance(self.dataset_type, type):
+             raise TypeError("self.dataset_type must be a class, not an instance")
          self.datasets = {}
          self.items = []
          # columns = ['tui', 'cui', 'entities']
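The new guard in `build()` makes the contract explicit: `dataset_type` must be the dataset class itself, which the search instantiates per threshold, not an already-built object. The distinction it tests:

```python
class SomeDataset:                      # placeholder for a BaseDataset subclass
    pass

print(isinstance(SomeDataset, type))    # True  -> accepted by build()
print(isinstance(SomeDataset(), type))  # False -> TypeError in build()
```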
@@ -58,7 +62,7 @@ class NerParameterSearch:
                  _umls_codes, _text_types)]
              self.columns.extend(_columns)
          print(f'Columns: {self.columns}')
-         self.ner_df = CTakesNER().load(
+         self.ner_df = CTakesNER(df = None).load(
              filename=self.ner_data_file) if self.ner_data_file else None

          if not self.min_threshold_dict or not self.max_threshold_dict:
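Note the explicit `df=None` at this call site: in the Pydantic model above, `df: Optional[pd.DataFrame]` has no default value, so (with Pydantic v2 semantics, at least) the field is still required, and loading a saved pickle without a dataframe now reads:

```python
# Assumed filename for illustration; only load() is used, so no dataframe is supplied.
ner_df = CTakesNER(df=None).load(filename='ner-output/ctakes/ctakes_ner.pkl')
```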
@@ -72,6 +76,7 @@ class NerParameterSearch:
              self.max_threshold_dict = {key: math.ceil(
                  df.describe()[key]['max']) for key in df.describe().keys()}

+         train_idx_arr, val_idx_arr = None, None
          for column in self.columns:
              min_threshold = self.min_threshold_dict[column]
              max_threshold = self.max_threshold_dict[column]
@@ -106,7 +111,7 @@ class NerParameterSearch:
              for item in group_items:
                  # item[0] = f'threshold_{threshold}_{item[0]}'
                  item[0] = f'threshold_{item[0]}_{threshold}'
-                 self.datasets[item[0]] = dataset.ddis_df
+                 self.datasets[item[0]] = dataset.ddis_df

              self.items.extend(group_items)
          self.y_test_label = self.items[0][4]
@@ -123,8 +128,12 @@ class NerParameterSearch:

          y_test_label = self.items[0][4]
          multi_modal_runner = MultiModalRunner(
-             library=self.library, model_func=model_func, batch_size=batch_size, epochs=epochs)
+             library=self.library, multi_modal=self.multi_modal)
+         # multi_modal_runner = MultiModalRunner(
+         #     library=self.library, model_func=model_func, batch_size=batch_size, epochs=epochs)
          multi_modal_runner.set_data(
              self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
          result = multi_modal_runner.predict()
+
+
          return result
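Across these hunks, model wiring moves out of `MultiModalRunner`'s constructor arguments (`model_func`, `batch_size`, `epochs`) and into a single `multi_modal` configuration passed down from `NerParameterSearch`. A sketch of the new call shape, with placeholders for the values the search assembles:

```python
# multi_modal, items, train_idx_arr, val_idx_arr and y_test_label are placeholders
# for what NerParameterSearch builds; only the call shape is taken from this diff.
runner = MultiModalRunner(library='tensorflow', multi_modal=multi_modal)
runner.set_data(items, train_idx_arr, val_idx_arr, y_test_label)
result = runner.predict()
```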