ddi-fw 0.0.149__py3-none-any.whl → 0.0.151__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/__init__.py +1 -1
- ddi_fw/datasets/core.py +147 -341
- ddi_fw/datasets/dataset_splitter.py +39 -0
- ddi_fw/datasets/ddi_mdl/base.py +194 -130
- ddi_fw/datasets/ddi_mdl/debug.log +1 -0
- ddi_fw/datasets/embedding_generator.py +2 -1
- ddi_fw/langchain/embeddings.py +1 -0
- ddi_fw/ml/evaluation_helper.py +47 -178
- ddi_fw/ml/ml_helper.py +125 -81
- ddi_fw/ml/model_wrapper.py +2 -2
- ddi_fw/ml/pytorch_wrapper.py +175 -72
- ddi_fw/ml/tensorflow_wrapper.py +131 -39
- ddi_fw/ner/ner.py +93 -39
- ddi_fw/pipeline/multi_modal_combination_strategy.py +4 -2
- ddi_fw/pipeline/multi_pipeline.py +2 -15
- ddi_fw/pipeline/ner_pipeline.py +15 -6
- ddi_fw/pipeline/pipeline.py +157 -93
- ddi_fw/{test/compress_json_test.py → utils/json_helper.py} +1 -15
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/METADATA +6 -3
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/RECORD +22 -31
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/WHEEL +1 -1
- ddi_fw/test/__init__.py +0 -0
- ddi_fw/test/basic_test.py +0 -15
- ddi_fw/test/combination_test.py +0 -12
- ddi_fw/test/date_test.py +0 -15
- ddi_fw/test/idf_score.py +0 -54
- ddi_fw/test/jaccard_similarity.py +0 -85
- ddi_fw/test/mlfow_test.py +0 -165
- ddi_fw/test/sklearn-tfidf.py +0 -16
- ddi_fw/test/test.py +0 -93
- ddi_fw/test/torch_cuda_test.py +0 -9
- ddi_fw/test/type_guarding_test.py +0 -18
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/top_level.txt +0 -0
ddi_fw/ner/ner.py
CHANGED
@@ -1,11 +1,14 @@
+from abc import ABC, abstractmethod
 from collections import defaultdict
 import glob
 import json
 from pathlib import Path
 import pathlib
 from time import sleep
+from typing import List, Optional
 import pandas as pd
 
+from pydantic import BaseModel, Field, HttpUrl
 from tqdm import tqdm
 import os
 import requests
@@ -33,19 +36,67 @@ from ddi_fw.utils import create_folder_if_not_exists
 HERE = pathlib.Path(__file__).resolve().parent
 
 
-class CTakesNER:
-
-
-
-
-
-
-
-
-
-
+class NERInterface(ABC):
+    """
+    An abstract base class to define the interface for Named Entity Recognition (NER).
+    """
+
+    @abstractmethod
+    def run(self, run_for=[]):
+        """
+        Run the NER process.
+        :param run_for: A list of columns to process.
+        """
+        pass
+
+class CTakesNER(BaseModel,NERInterface):
+    """
+    A class to perform Named Entity Recognition (NER) using the cTAKES API.
+    Attributes:
+        df (pd.DataFrame): The input dataframe containing data to be processed.
+        key (str): The key column in the dataframe, default is 'drugbank_id'.
+        api_url (str): The URL of the cTAKES API, default is 'http://localhost:8080/ctakes-web-rest/service/analyze?pipeline=Default'.
+        output_path (str): The path to save the NER output, default is 'ner-output/ctakes'.
+        ids (list): A list of IDs to exclude from processing, default is an empty list.
+        columns (list): A list of columns in the dataframe to process, default is an empty list.
+    Methods:
+        run(run_for=[]):
+            Runs the NER process for the specified columns.
+        load(filename=None, group=True):
+            Loads the NER results from a pickle file.
+        create_dataframe(override=False):
+            Creates a dataframe from the NER results and saves it as a pickle file.
+    """
+    # def __init__(self, df: pd.DataFrame,
+    #              key: str = 'drugbank_id',
+    #              api_url: str = 'http://localhost:8080/ctakes-web-rest/service/analyze?pipeline=Default',
+    #              output_path: str = 'ner-output/ctakes', ids: list = [],
+    #              columns: list = []):
+    #     self.df = df
+    #     self.key = key
+    #     self.api_url = api_url
+    #     self.columns = columns
+    #     self.ids = ids
+    #     self.output_path = output_path
+
+    df: Optional[pd.DataFrame]
+    key: str = 'drugbank_id'
+    api_url: str = 'http://localhost:8080/ctakes-web-rest/service/analyze?pipeline=Default'
+    output_path: str = 'ner-output/ctakes'
+    ids: List[str] = Field(default_factory=list)
+    columns: List[str] = Field(default_factory=list)
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def run(self,
             run_for=[]):
-
+        """
+        Run the NER process.
+        :param run_for: A list of columns to process.
+        """
+        if self.df is None:
+            raise ValueError('Dataframe is not provided')
         for column in self.columns:
             if not os.path.exists(self.output_path+"/"+column):
                 os.makedirs(self.output_path+"/"+column)
@@ -55,12 +106,14 @@ class CTakesNER:
                 continue
             # not include
             if self.ids:
-                self.
+                self.df = self.df[~self.df[self.key].isin(
                     self.ids)]
-            for index, row in self.
-                drugbank_id = row[
+            for index, row in self.df.iterrows():
+                drugbank_id = row[self.key]
                 data = row[column]
-
+                # or len(data) == 0:
+                if data is None or (isinstance(data, pd.Series) and data.isna().any()) or (isinstance(data, str) and len(data.strip()) == 0):
+                    # if data is None or pd.isna(data) or (type(data) == str and len(data.strip()) == 0):
                     with open(f'{column_output_path}/{drugbank_id}.json', 'w', encoding='utf-8') as f:
                         json.dump([], f, ensure_ascii=False, indent=4)
                     continue
@@ -79,8 +132,9 @@ class CTakesNER:
             # if index % 10 == 0:
             #     sleep(10)
 
-    def load(self, filename
-        file_path=
+    def load(self, filename=None, group=True):
+        file_path = filename if filename else HERE.joinpath(
+            'output/ctakes/ctakes_ner.pkl')
         df = pd.read_pickle(file_path)
 
         if group:
@@ -92,24 +146,28 @@ class CTakesNER:
 
             tui_columns = [key for key in keys if key.startswith('tui')]
             cui_columns = [key for key in keys if key.startswith('cui')]
-            entities_columns = [
-
-
-            df['tui'] = df[
+            entities_columns = [
+                key for key in keys if key.startswith('entities')]
+            # solve this with a single assignment
+            df['tui'] = df[tui_columns].values.tolist()
+            df['tui'] = df['tui'].apply(
+                lambda items: {i for item in items for i in item})
 
-            df['cui'] =
-            df['cui'] = df['cui'].apply(
+            df['cui'] = df[cui_columns].values.tolist()
+            df['cui'] = df['cui'].apply(
+                lambda items: {i for item in items for i in item})
 
-            df['entities'] =
-            df['entities'] = df['entities'].apply(
+            df['entities'] = df[entities_columns].values.tolist()
+            df['entities'] = df['entities'].apply(
+                lambda items: {i for item in items for i in item})
 
         return df
 
-    def create_dataframe(self, override
-        filename='ctakes_ner.pkl'
+    def create_dataframe(self, override=False): # dataframe_columns=[]
+        filename = 'ctakes_ner.pkl'
         if not override and os.path.exists(self.output_path+"/" + filename):
             return self.load(self.output_path+"/" + filename)
-
+
         create_folder_if_not_exists(self.output_path+"/" + filename)
         dict_of_dict = defaultdict(dict)
         for column in self.columns:
@@ -123,9 +181,9 @@ class CTakesNER:
                 cuis = []
                 tuis = []
                 if data is None or len(data) == 0:
-                    t[
+                    t[self.key] = file_name
                     t[f'cui_{column}'] = []
-                    t[f'tui_{column}']= []
+                    t[f'tui_{column}'] = []
                     t[f'entities_{column}'] = []
                     dict_of_dict[file_name] = t
                     continue
@@ -136,9 +194,9 @@ class CTakesNER:
                 tuis = [attr['tui']
                         for v in value for attr in v['conceptAttributes']]
                 # codingScheme
-
-                if
-                    t[
+
+                if self.key not in t:
+                    t[self.key] = file_name
                 t[f'cui_{column}'] = cuis
                 t[f'tui_{column}'] = tuis
                 t[f'entities_{column}'] = entities
@@ -149,11 +207,7 @@ class CTakesNER:
             # columns=columns
         )
         df.to_pickle(self.output_path+"/" + filename)
-
-
-        # new_columns = {columns[i]: dataframe_columns[i]
-        #                for i in range(len(columns))}
-        # df.rename(columns=new_columns, inplace=True)
+
         return df
 
 
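Taken together, these hunks turn CTakesNER from a hand-rolled class into a pydantic BaseModel, so instances are now built from validated keyword fields and `df` must always be supplied, even as `None`. A minimal usage sketch under those assumptions (the dataframe contents and file names are illustrative, and a cTAKES REST service is assumed to be listening at the default `api_url`):

```python
import pandas as pd
from ddi_fw.ner.ner import CTakesNER

# Illustrative input; 'drugbank_id' and the column list mirror the defaults
# declared in the new pydantic fields.
df = pd.DataFrame({
    'drugbank_id': ['DB00001', 'DB00002'],
    'description': ['Lepirudin is a recombinant hirudin ...',
                    'Cetuximab is an EGFR-binding antibody ...'],
})

ner = CTakesNER(
    df=df,                    # required field; run() raises ValueError if None
    key='drugbank_id',
    columns=['description'],  # dataframe columns sent to the cTAKES API
    output_path='ner-output/ctakes',
)
ner.run()

# Loading pickled results no longer works on a bare CTakesNER(): pydantic
# requires the df field, so pass df=None exactly as ner_pipeline.py now does.
ner_df = CTakesNER(df=None).load(filename='ctakes_ner.pkl')
```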
ddi_fw/pipeline/multi_pipeline.py
CHANGED
@@ -49,11 +49,6 @@ class MultiPipeline():
     def __create_pipeline(self, config):
         type = config.get("type")
         library = config.get("library")
-        # batch_size = config.get("batch_size")
-        # epochs = config.get("epochs")
-
-        # dataset_module = config.get("dataset_module")
-        # dataset_name = config.get("dataset_name")
 
         experiment_name = config.get("experiment_name")
         experiment_description = config.get("experiment_description")
@@ -82,9 +77,7 @@ class MultiPipeline():
         combinations = []
         if combination_type is not None:
             combinations = combination_type(**kwargs_combination_params).generate()
-
-        # model_instance = model_class()
-        # dataset_instance = dataset_class()
+
 
         pipeline = None
         if type == "general":
@@ -117,15 +110,13 @@ class MultiPipeline():
                 text_types = None,
                 columns=['tui', 'cui', 'entities'],
                 ner_data_file=ner_data_file,
+                multi_modal= multi_modal
             )
 
 
         return {
             "name": experiment_name,
             "library": library,
-            # "batch_size": batch_size,
-            # "epochs": epochs,
-            # "model_type": model_type,
             "pipeline": pipeline}
 
     def build(self):
@@ -138,10 +129,6 @@ class MultiPipeline():
         for item in self.items:
             print(f"{item['name']} is running")
             pipeline = item['pipeline']
-            # model_type = item['model_type']
-            # batch_size = item['batch_size']
-            # epochs = item['epochs']
-            # It can be moved to build function
             pipeline.build()
             result = pipeline.run()
             self.pipeline_resuts[item['name']] = result
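The net effect on configuration is that `__create_pipeline` now reads fewer keys and forwards `multi_modal` into the NER parameter search. A hedged sketch of one config entry, limited to the `config.get(...)` keys visible in these hunks; the `"multi_modal"` key and the `"ner_search"` type value are assumptions inferred from the forwarding code, not shown verbatim in this diff:

```python
# Hypothetical MultiPipeline config entry; only "type", "library",
# "experiment_name" and "experiment_description" are confirmed by the hunks
# above ("general" is the one type value shown).
config = {
    "type": "ner_search",          # assumed branch that builds NerParameterSearch
    "library": "tensorflow",
    "experiment_name": "ddi-ner-sweep",
    "experiment_description": "NER threshold parameter search",
    "multi_modal": None,           # assumed source of the multi_modal argument
}
```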
ddi_fw/pipeline/ner_pipeline.py
CHANGED
@@ -2,7 +2,7 @@ from collections import defaultdict
 import numpy as np
 from ddi_fw.datasets.core import BaseDataset
 from ddi_fw.datasets.idf_helper import IDF
-from typing import Dict, List
+from typing import Any, Dict, List, Optional
 from itertools import product
 
 from ddi_fw.ml.ml_helper import MultiModalRunner
@@ -18,6 +18,7 @@ def stack(df_column):
 class NerParameterSearch:
     def __init__(self,
                  library,
+                 multi_modal,
                  experiment_name,
                  experiment_description,
                  experiment_tags,
@@ -25,12 +26,13 @@ class NerParameterSearch:
                  dataset_type: BaseDataset,
                  ner_data_file,
                  columns: list,
-                 umls_code_types: List[UMLSCodeTypes],
-                 text_types
+                 umls_code_types: List[UMLSCodeTypes]|None,
+                 text_types:List[DrugBankTextDataTypes]|None,
                  min_threshold_dict: Dict[str, float] = defaultdict(float),
                  max_threshold_dict: Dict[str, float] = defaultdict(float),
                  increase_step=0.5):
         self.library = library
+        self.multi_modal = multi_modal
         self.experiment_name = experiment_name
         self.experiment_description = experiment_description
         self.experiment_tags = experiment_tags
@@ -47,6 +49,8 @@ class NerParameterSearch:
         self.increase_step = increase_step
 
     def build(self):
+        if not isinstance(self.dataset_type, type):
+            raise TypeError("self.dataset_type must be a class, not an instance")
         self.datasets = {}
         self.items = []
         # columns = ['tui', 'cui', 'entities']
@@ -58,7 +62,7 @@ class NerParameterSearch:
                 _umls_codes, _text_types)]
             self.columns.extend(_columns)
         print(f'Columns: {self.columns}')
-        self.ner_df = CTakesNER().load(
+        self.ner_df = CTakesNER(df = None).load(
             filename=self.ner_data_file) if self.ner_data_file else None
 
         if not self.min_threshold_dict or not self.max_threshold_dict:
@@ -72,6 +76,7 @@ class NerParameterSearch:
             self.max_threshold_dict = {key: math.ceil(
                 df.describe()[key]['max']) for key in df.describe().keys()}
 
+        train_idx_arr, val_idx_arr = None, None
         for column in self.columns:
             min_threshold = self.min_threshold_dict[column]
             max_threshold = self.max_threshold_dict[column]
@@ -106,7 +111,7 @@ class NerParameterSearch:
                 for item in group_items:
                     # item[0] = f'threshold_{threshold}_{item[0]}'
                     item[0] = f'threshold_{item[0]}_{threshold}'
-
+                    self.datasets[item[0]] = dataset.ddis_df
 
                 self.items.extend(group_items)
                 self.y_test_label = self.items[0][4]
@@ -123,8 +128,12 @@ class NerParameterSearch:
 
         y_test_label = self.items[0][4]
         multi_modal_runner = MultiModalRunner(
-            library=self.library,
+            library=self.library, multi_modal=self.multi_modal)
+        # multi_modal_runner = MultiModalRunner(
+        #     library=self.library, model_func=model_func, batch_size=batch_size, epochs=epochs)
         multi_modal_runner.set_data(
             self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
         result = multi_modal_runner.predict()
+
+
         return result
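In aggregate, NerParameterSearch gains a required `multi_modal` argument, demands that `dataset_type` be a class rather than an instance, and accepts `None` for the UMLS code and text types. A minimal construction sketch under those assumptions; `DDIMDLDataset` is a guessed name for the `BaseDataset` subclass shipped in `ddi_fw/datasets/ddi_mdl`, and the argument values are illustrative:

```python
from ddi_fw.pipeline.ner_pipeline import NerParameterSearch
from ddi_fw.datasets import DDIMDLDataset  # assumed class name

search = NerParameterSearch(
    library='tensorflow',
    multi_modal=None,              # new required argument; shape not shown in this diff
    experiment_name='ner-threshold-search',
    experiment_description='sweep tui/cui/entities IDF thresholds',
    experiment_tags=None,
    dataset_type=DDIMDLDataset,    # the class itself: build() raises TypeError
                                   # when given an instance
    ner_data_file='ctakes_ner.pkl',
    columns=[],
    umls_code_types=None,          # both hints are now explicitly nullable
    text_types=None,
)
search.build()
```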