ddi-fw 0.0.94__py3-none-any.whl → 0.0.96__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/ml/__init__.py CHANGED
@@ -1,4 +1,5 @@
  from .ml_helper import MultiModalRunner
  from .model_wrapper import ModelWrapper,Result
  from .tensorflow_wrapper import TFModelWrapper
- from .pytorch_wrapper import PTModelWrapper
+ from .pytorch_wrapper import PTModelWrapper
+ from .evaluation_helper import evaluate
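With this change, evaluate is re-exported from the ddi_fw.ml package alongside the model wrappers. A minimal sketch of the relocated helper in use; the keyword arguments and the (logs, metrics) return value follow the calls visible later in this diff, while the array shapes are purely illustrative:

    import numpy as np
    from ddi_fw.ml import evaluate  # was: from ddi_fw.experiments.evaluation_helper import evaluate

    # one-hot ground truth and softmax-style scores for 3 samples / 4 classes (illustrative)
    actual = np.array([[1, 0, 0, 0],
                       [0, 1, 0, 0],
                       [0, 0, 1, 0]])
    pred = np.array([[0.7, 0.1, 0.1, 0.1],
                     [0.2, 0.6, 0.1, 0.1],
                     [0.1, 0.1, 0.7, 0.1]])

    logs, metrics = evaluate(actual=actual, pred=pred, info='demo')
    metrics.format_float()  # same post-processing step the wrappers apply before mlflow.log_metrics
    print(logs)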
ddi_fw/ml/ml_helper.py CHANGED
@@ -16,7 +16,7 @@ from mlflow.utils.autologging_utils import batch_metrics_logger
  import time
 
  from mlflow.models import infer_signature
- from ddi_fw.experiments.evaluation_helper import Metrics, evaluate
+ from ddi_fw.ml.evaluation_helper import Metrics, evaluate
 
  # import tf2onnx
  # import onnx
ddi_fw/ml/model_wrapper.py CHANGED
@@ -1,6 +1,6 @@
  from typing import Dict, List, Tuple
 
- from ddi_fw.experiments.evaluation_helper import Metrics
+ from ddi_fw.ml.evaluation_helper import Metrics
 
  class Result:
      def __init__(self) -> None:
ddi_fw/ml/pytorch_wrapper.py CHANGED
@@ -1,6 +1,6 @@
  import mlflow
  import torch
- from ddi_fw.experiments.evaluation_helper import evaluate
+ from ddi_fw.ml.evaluation_helper import evaluate
  from ddi_fw.ml.model_wrapper import ModelWrapper
 
 
ddi_fw/ml/tensorflow_wrapper.py CHANGED
@@ -1,24 +1,19 @@
- from matplotlib import pyplot as plt
  from ddi_fw.ml.model_wrapper import ModelWrapper
  import tensorflow as tf
  from tensorflow import keras
- from keras.models import Model, Sequential
- from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
- from keras.callbacks import EarlyStopping
+ from keras.callbacks import EarlyStopping,ModelCheckpoint
  from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
  import numpy as np
 
  import mlflow
  from mlflow.utils.autologging_utils import batch_metrics_logger
- import time
 
  from mlflow.models import infer_signature
- from ddi_fw.experiments.evaluation_helper import Metrics, evaluate
+ from ddi_fw.ml.evaluation_helper import Metrics, evaluate
 
  # import tf2onnx
  # import onnx
 
- import itertools
  import ddi_fw.utils as utils
 
 
@@ -44,6 +39,15 @@ class TFModelWrapper(ModelWrapper):
  X_valid_cv = self.train_data[val_idx]
  y_valid_cv = self.train_label[val_idx]
 
+ checkpoint = ModelCheckpoint(
+ filepath=f'{self.descriptor}_validation_{i}.weights.h5',
+ monitor='val_loss',
+ save_best_only=True,
+ save_weights_only=True,
+ verbose=1,
+ mode='min'
+ )
+
  early_stopping = EarlyStopping(
  monitor='val_loss', patience=10, verbose=0, mode='auto')
  custom_callback = CustomCallback()
@@ -52,15 +56,21 @@ class TFModelWrapper(ModelWrapper):
  epochs=self.epochs,
  validation_data=(
  X_valid_cv, y_valid_cv),
- callbacks=[early_stopping, custom_callback])
+ callbacks=[early_stopping, checkpoint, custom_callback])
  # histories[f'validation_{i}'] = history
- models_val_acc[f'validation_{i}'] = history.history['val_accuracy'][-1]
+ # models_val_acc[f'validation_{i}'] = history.history['val_accuracy'][-1]
+ models_val_acc[f'{self.descriptor}_validation_{i}'] = checkpoint.best
+ models[f'{self.descriptor}_validation_{i}'] = checkpoint.model
+ import os
+ if os.path.exists(f'{self.descriptor}_validation_{i}.weights.h5'):
+ os.remove(f'{self.descriptor}_validation_{i}.weights.h5')
  # Saving each CV model
 
  best_model_key = max(models_val_acc, key=models_val_acc.get)
  best_model = models[best_model_key]
- best_model.evaluate(self.test_data, self.test_label,
- callbacks=[custom_callback])
+ # mlflow.tensorflow.log_model(best_model, "model")
+ # best_model.evaluate(self.test_data, self.test_label,
+ # callbacks=[custom_callback])
  pred = best_model.predict(self.test_data)
 
  logs, metrics = evaluate(
@@ -68,17 +78,17 @@ class TFModelWrapper(ModelWrapper):
  metrics.format_float()
  mlflow.log_metrics(logs)
  mlflow.log_param('best_cv', best_model_key)
- signature = infer_signature(
- self.train_data,
- # generate_signature_output(model,X_valid_cv)
- # params=params,
- )
-
- mlflow.keras.save_model(
- best_model,
- path=run.info.artifact_uri + '/model',
- signature=signature,
- )
+ # signature = infer_signature(
+ # self.train_data,
+ # # generate_signature_output(model,X_valid_cv)
+ # # params=params,
+ # )
+
+ # mlflow.keras.save_model(
+ # best_model,
+ # path=run.info.artifact_uri + '/model',
+ # signature=signature,
+ # )
  print(run.info.artifact_uri)
  # todo tf2onnx not compatible with keras > 2.15
  # onnx_model, _ = tf2onnx.convert.from_keras(
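The substantive change in tensorflow_wrapper.py is per-fold checkpointing: each cross-validation fold now trains with a ModelCheckpoint that keeps only the best val_loss weights, the best monitored value (checkpoint.best) and the trained model (checkpoint.model) are stored under a descriptor-prefixed key, and the temporary weights file is removed afterwards. A standalone sketch of that callback pattern under the same Keras APIs, with synthetic data and an illustrative model rather than the TFModelWrapper code itself:

    import os
    import numpy as np
    from tensorflow import keras
    from keras.callbacks import EarlyStopping, ModelCheckpoint

    # illustrative data: 100 samples, 8 features, binary labels
    X = np.random.rand(100, 8).astype('float32')
    y = np.random.randint(0, 2, size=(100, 1)).astype('float32')

    model = keras.Sequential([
        keras.Input(shape=(8,)),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')

    weights_path = 'demo_validation_0.weights.h5'  # '.weights.h5' suffix matches save_weights_only=True
    checkpoint = ModelCheckpoint(filepath=weights_path,
                                 monitor='val_loss',
                                 save_best_only=True,    # only overwrite when val_loss improves
                                 save_weights_only=True,
                                 mode='min',
                                 verbose=1)
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto')

    model.fit(X, y, validation_split=0.2, epochs=5, batch_size=16,
              callbacks=[early_stopping, checkpoint])

    best_val_loss = checkpoint.best   # best monitored value seen during training
    fold_model = checkpoint.model     # the model the callback was attached to
    if os.path.exists(weights_path):  # clean up the temporary weights file, as above
        os.remove(weights_path)
    print(best_val_loss)

Tracking checkpoint.best decouples fold selection from history.history['val_accuracy'], which is what the replaced line relied on, and the monitored value is now val_loss rather than val_accuracy.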
ddi_fw/pipeline/multi_pipeline.py CHANGED
@@ -63,6 +63,7 @@ class MultiPipeline():
  columns = config.get("columns")
  ner_data_file = config.get("ner_data_file")
  ner_threshold = config.get("ner_threshold")
+ column_embedding_configs = config.get("column_embedding_configs")
  vector_db_persist_directory = config.get("vector_db_persist_directory")
  vector_db_collection_name = config.get("vector_db_collection_name")
  embedding_pooling_strategy = get_import(
@@ -93,6 +94,7 @@ class MultiPipeline():
  tracking_uri=tracking_uri,
  dataset_type=dataset_type,
  columns=columns,
+ column_embedding_configs=column_embedding_configs,
  vector_db_persist_directory=vector_db_persist_directory,
  vector_db_collection_name=vector_db_collection_name,
  embedding_pooling_strategy_type=embedding_pooling_strategy,
@@ -126,6 +128,7 @@ class MultiPipeline():
  for config in self.experiments_config['experiments']:
  item = self.__create_pipeline(config)
  self.items.append(item)
+ return self
 
  def run(self):
  for item in self.items:
@@ -134,9 +137,11 @@ class MultiPipeline():
  model_type = item['model_type']
  batch_size = item['batch_size']
  epochs = item['epochs']
+ # It can be moved to build function
  pipeline.build()
  result = pipeline.run(model_type, epochs=epochs, batch_size=batch_size)
  self.pipeline_resuts[item['name']] = result
+ return self
 
  def results(self):
  return self.pipeline_resuts
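Two effects of these hunks: each experiment entry can now carry a column_embedding_configs list (read with config.get and forwarded to Pipeline), and build() and run() return self, so the whole flow can be chained. A hedged sketch of what a configuration entry and the chained call might look like; only the keys read via config.get above are taken from this diff, while the name/model_type/batch_size/epochs placement and the MultiPipeline constructor arguments are assumptions for illustration:

    from ddi_fw.pipeline.multi_pipeline import MultiPipeline

    experiment = {
        "name": "smiles_and_description",          # assumed: run() reads item['name'] etc.
        "model_type": "ddi_fw.ml.TFModelWrapper",   # illustrative value
        "batch_size": 128,
        "epochs": 100,
        "columns": ["smile", "description"],
        "ner_data_file": "ner/ctakes_output.pkl",   # illustrative path
        "ner_threshold": {"tui_threshold": 0, "cui_threshold": 0, "entities_threshold": 0},
        "vector_db_persist_directory": "embeddings/all",
        "vector_db_collection_name": "all_text",
        # new in 0.0.96: per-column overrides pointing at dedicated Chroma collections
        "column_embedding_configs": [
            {"column": "smile",
             "vector_db_persist_directory": "embeddings/smiles",
             "vector_db_collection_name": "smiles_collection"},
        ],
    }

    # build() and run() now return self, so the calls chain (constructor args are assumed)
    results = MultiPipeline(experiments_config={"experiments": [experiment]}).build().run().results()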
ddi_fw/pipeline/pipeline.py CHANGED
@@ -21,6 +21,7 @@ class Pipeline:
  dataset_type: BaseDataset = None,
  columns=None,
  embedding_dict=None,
+ column_embedding_configs=None,
  vector_db_persist_directory=None,
  vector_db_collection_name=None,
  embedding_pooling_strategy_type: PoolingStrategy = None,
@@ -37,6 +38,7 @@ class Pipeline:
  self.dataset_type = dataset_type
  self.columns = columns
  self.embedding_dict = embedding_dict
+ self.column_embedding_configs = column_embedding_configs
  self.vector_db_persist_directory = vector_db_persist_directory
  self.vector_db_collection_name = vector_db_collection_name
  self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
@@ -45,6 +47,37 @@ class Pipeline:
  self.combinations = combinations
  self.model = model
 
+ def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column):
+ """
+ Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
+
+ Args:
+ - vector_db_persist_directory (str): The path to the directory where the Chroma vector database is stored.
+ - vector_db_collection_name (str): The name of the collection to query.
+ - embedding_dict (dict): The existing dictionary to update with embeddings.
+
+ """
+ if vector_db_persist_directory:
+ # Initialize the Chroma client and get the collection
+ vector_db = chromadb.PersistentClient(
+ path=vector_db_persist_directory)
+ collection = vector_db.get_collection(vector_db_collection_name)
+
+ # Fetch the embeddings and metadata
+ if column == None:
+ dictionary = collection.get(include=['embeddings', 'metadatas'])
+ else:
+ dictionary = collection.get(include=['embeddings', 'metadatas'], where= {"type": {"$eq": f"{column}"}})
+ # Populate the embedding dictionary with embeddings from the vector database
+ for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
+ embedding_dict[metadata["type"]
+ ][metadata["id"]].append(embedding)
+
+ # return dictionary['embeddings'].shape[1]
+ else:
+ raise ValueError(
+ "Persistent directory for the vector DB is not specified.")
+
  def build(self):
  # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
  kwargs = {"columns": self.columns}
@@ -52,27 +85,42 @@ class Pipeline:
  for k, v in self.ner_threshold.items():
  kwargs[k] = v
  if self.embedding_dict == None:
+ embedding_dict = defaultdict(lambda: defaultdict(list))
  if self.vector_db_persist_directory:
- self.vector_db = chromadb.PersistentClient(
- path=self.vector_db_persist_directory)
- self.collection = self.vector_db.get_collection(
- self.vector_db_collection_name)
- dictionary = self.collection.get(
- include=['embeddings', 'metadatas'])
-
- embedding_dict = defaultdict(lambda: defaultdict(list))
-
- for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
- embedding_dict[metadata["type"]
- ][metadata["id"]].append(embedding)
-
- embedding_size = dictionary['embeddings'].shape[1]
+ self.__create_or_update_embeddings__(
+ embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
+
+ if self.column_embedding_configs:
+ for item in self.column_embedding_configs:
+ col = item["column"]
+ col_db_dir = item["vector_db_persist_directory"]
+ col_db_collection = item["vector_db_collection_name"]
+ self.__create_or_update_embeddings__(embedding_dict, col_db_dir, col_db_collection, col)
+ print(f"Embedings of {col} is calculated from {col_db_collection}")
+
+ # if self.embedding_dict == None:
+ # if self.vector_db_persist_directory:
+ # self.vector_db = chromadb.PersistentClient(
+ # path=self.vector_db_persist_directory)
+ # self.collection = self.vector_db.get_collection(
+ # self.vector_db_collection_name)
+ # dictionary = self.collection.get(
+ # include=['embeddings', 'metadatas'])
+
+ # embedding_dict = defaultdict(lambda: defaultdict(list))
+
+ # for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
+ # embedding_dict[metadata["type"]
+ # ][metadata["id"]].append(embedding)
+
+ # embedding_size = dictionary['embeddings'].shape[1]
  else:
  embedding_dict = self.embedding_dict
- #TODO make generic
- embedding_size = list(embedding_dict['all_text'].values())[
- 0][0].shape
-
+ # TODO make generic
+ # embedding_size = list(embedding_dict['all_text'].values())[
+ # 0][0].shape
+ key, value = next(iter(embedding_dict.items()))
+ embedding_size = value[next(iter(value))][0].shape[0]
  pooling_strategy = self.embedding_pooling_strategy_type()
 
  self.ner_df = CTakesNER().load(
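build() now delegates all Chroma access to __create_or_update_embeddings__, which can filter a collection by the 'type' metadata field so each column may come from its own collection, and the embedding size is derived from the first vector found in the dict instead of a hard-coded 'all_text' key. A standalone sketch of the same query pattern; the directory, collection and column names are illustrative:

    from collections import defaultdict
    import chromadb

    def fetch_embeddings(embedding_dict, persist_directory, collection_name, column=None):
        # Mirror of the method above: pull embeddings + metadata from a persistent
        # Chroma collection and group them as embedding_dict[type][id].
        client = chromadb.PersistentClient(path=persist_directory)
        collection = client.get_collection(collection_name)
        if column is None:
            payload = collection.get(include=['embeddings', 'metadatas'])
        else:
            # only rows whose 'type' metadata matches the requested column
            payload = collection.get(include=['embeddings', 'metadatas'],
                                     where={"type": {"$eq": column}})
        for metadata, embedding in zip(payload['metadatas'], payload['embeddings']):
            embedding_dict[metadata["type"]][metadata["id"]].append(embedding)

    embedding_dict = defaultdict(lambda: defaultdict(list))
    fetch_embeddings(embedding_dict, "embeddings/smiles", "smiles_collection", column="smile")

    # embedding size now comes from whichever column/id happens to be first in the dict
    first_column = next(iter(embedding_dict.values()))
    embedding_size = first_column[next(iter(first_column))][0].shape[0]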
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ddi_fw
- Version: 0.0.94
+ Version: 0.0.96
  Summary: Do not use :)
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -55,30 +55,24 @@ ddi_fw/drugbank/drugbank_parser.py,sha256=lxUuhB0s8ef_aPNDs0V8ClKF7-KIWugNIV9gVs
  ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9PVncGTXtk,18127
  ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
  ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
- ddi_fw/experiments/__init__.py,sha256=5L2xSolpFycNnflqOMdvJSiqRB16ExA5bbVGORKFX04,195
- ddi_fw/experiments/custom_torch_model.py,sha256=iQ_R_EApzD2JCcASN8cie6D21oh7VCxaOQ45_dkiGwc,2576
- ddi_fw/experiments/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
- ddi_fw/experiments/pipeline.py,sha256=4ltPCcfLZ1fFpiOd8ahPognI6NLmRLzJvUqyFpn3z18,5693
- ddi_fw/experiments/pipeline_ner.py,sha256=unxEJCYrG6wEZjLmqvGdLRTMOBwELbGKkdygSpAR3b8,5043
- ddi_fw/experiments/tensorflow_helper.py,sha256=m3Mppl-tbccTMAKLpZg2YC0xpcukkyQihPw_uwAlRRY,11857
- ddi_fw/experiments/test.py,sha256=z1TfBpK75zGKpp2ZU8f6APjZlgBFthaCBN61YB9ma4o,2049
  ddi_fw/langchain/__init__.py,sha256=8dBPZivc01WWaCH8sZ_UV8-XPyo74e9Qy6-fYgAiNLE,248
  ddi_fw/langchain/embeddings.py,sha256=8J_SfO9pyET2W-Ltzq0_r9EchFzBsYdUabiOMma42Us,7515
  ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
  ddi_fw/langchain/storage.py,sha256=uy5clVB07So2eFbRGdAKzHIPdfEk4se33cPktis7Aa4,2716
- ddi_fw/ml/__init__.py,sha256=0YubqmEpJKp3OfqlLKkD5N9L6WDWew3QEtnbdY3mqKg,180
- ddi_fw/ml/ml_helper.py,sha256=yeNfTg9aC6woQLYbR3ofSUE1b79k1SQPrLAr2yYxHVA,4498
- ddi_fw/ml/model_wrapper.py,sha256=ZExnsLMjHKL3BaI4aKkbyWTp8vbswLeF2_T3cZ73YpQ,1144
- ddi_fw/ml/pytorch_wrapper.py,sha256=YdwzR5qAHFNajYB_elFqDhVKRLeajaRpopNzyQ6gIIA,3725
- ddi_fw/ml/tensorflow_wrapper.py,sha256=pSeiJDuaLf9MhZVlLuLJBA-LH-H-Dl2TyYbB39iGsto,5748
+ ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
+ ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
+ ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
+ ddi_fw/ml/model_wrapper.py,sha256=W-bed6NOJxuXs7d3nG6iU8bkUX8LBPFQ0gMjZ7Qc0Sw,1135
+ ddi_fw/ml/pytorch_wrapper.py,sha256=AkG-2sKDXr0IBhgmkbjG0i20OuwQv3mhdvqp6UvJDCA,3716
+ ddi_fw/ml/tensorflow_wrapper.py,sha256=E46lC9qMkM5NvFTL-eOuMcMhEUso5UYfP66Du4BOhfQ,6423
  ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
  ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6859
  ddi_fw/ner/ner.py,sha256=BEs9AFljAxOQrC2BEP1raSzRoypcfELS5UTdl4bjTqw,15863
  ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
  ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=qIst7vxHaOAhRv4lgozszwa3b1QE4aIrN74t41Xnvr4,1637
- ddi_fw/pipeline/multi_pipeline.py,sha256=5UDpTYgvrbbaMCoLu4BtlqfGy5MA2lTSNakm6hBVdGw,5669
+ ddi_fw/pipeline/multi_pipeline.py,sha256=t_Z7d7xRfDnhpQTlqCf7c0isZ5hZlyXavKhC7ePsnJY,5903
  ddi_fw/pipeline/ner_pipeline.py,sha256=wB7hz4YCOv7UAz6bGE6sSpPXXIdoOflOVK5UCc1fO-o,5586
- ddi_fw/pipeline/pipeline.py,sha256=uiRS0MmOOto3RUp5C4pRJZcZQHpevuZGGLrsZP6FmJ8,5599
+ ddi_fw/pipeline/pipeline.py,sha256=q7jfTt7ryYa3xBscPtxvanB-j5RzWVZUKir0KmAdTKc,8357
  ddi_fw/test/basic_test.py,sha256=fEOGcZm1ObnsDvMiXNmdmz6YCeUrGc8V0DwlSwGhsq8,376
  ddi_fw/test/combination_test.py,sha256=TWNE8sf-DSh1Q9-yRaRBc774Sn1kSMGXLwQhd2_Qynk,324
  ddi_fw/test/compress_json_test.py,sha256=BGny56YqiG-pzhMoDzLKQBQI1E7o3jU0S7VYWtclAx4,1045
@@ -95,7 +89,7 @@ ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
  ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
  ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
- ddi_fw-0.0.94.dist-info/METADATA,sha256=R-lLr-tlzZPcKnW35DlxhJ5CtlbVasb4qE5NLS5nV9A,1966
- ddi_fw-0.0.94.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
- ddi_fw-0.0.94.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
- ddi_fw-0.0.94.dist-info/RECORD,,
+ ddi_fw-0.0.96.dist-info/METADATA,sha256=b2D7e7ub3byUbTwEPdw6FvUqTEK-H_KflNXwqbk4r7s,1966
+ ddi_fw-0.0.96.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ ddi_fw-0.0.96.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ ddi_fw-0.0.96.dist-info/RECORD,,
ddi_fw/experiments/__init__.py DELETED
@@ -1,4 +0,0 @@
- from .tensorflow_helper import TFMultiModal, TFSingleModal,Result
- from .evaluation_helper import evaluate, Metrics
- from .pipeline import Experiment
- from .pipeline_ner import NerParameterSearch
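The remaining hunks remove the old ddi_fw.experiments package. Its evaluation_helper moves unchanged to ddi_fw.ml (the RECORD entries above carry the same sha256), while the Experiment/NerParameterSearch workflow classes appear to be superseded by the ddi_fw.pipeline package. For code written against 0.0.94, the only substitution this diff spells out explicitly is the import path of the evaluation helpers:

    # 0.0.94 (module removed in 0.0.96)
    # from ddi_fw.experiments.evaluation_helper import Metrics, evaluate

    # 0.0.96: the same module under ddi_fw.ml, as used by ml_helper.py,
    # model_wrapper.py, pytorch_wrapper.py and tensorflow_wrapper.py above
    from ddi_fw.ml.evaluation_helper import Metrics, evaluate
    # evaluate is also re-exported from the package __init__: from ddi_fw.ml import evaluate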
ddi_fw/experiments/custom_torch_model.py DELETED
@@ -1,66 +0,0 @@
1
- import torch
2
-
3
- class ExtendedTorchModule(torch.nn.Module):
4
- def __init__(self,model):
5
- super().__init__()
6
- self.model = model
7
-
8
- def train(self,dataloader_train, criterion, optimizer, epoch_count = 10):
9
- for epoch in range(epoch_count): # loop over the dataset multiple times
10
-
11
- running_loss = 0.0
12
- for i, data in enumerate(dataloader_train, 0):
13
- # get the inputs; data is a list of [inputs, labels]
14
- inputs, labels = data
15
-
16
- # zero the parameter gradients
17
- optimizer.zero_grad()
18
-
19
- # forward + backward + optimize
20
- outputs = self(inputs)
21
- loss = criterion(outputs, labels)
22
- loss.backward()
23
- optimizer.step()
24
-
25
- # print statistics
26
- running_loss += loss.item()
27
- if i % 5000 == 4999: # print every 2000 mini-batches
28
- print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 5000:.3f}')
29
- running_loss = 0.0
30
- print('Finished Training')
31
-
32
- def forward(self, x):
33
- x = x.to(torch.float32)
34
- # for f in self.module_list:
35
- # x = f(x)
36
- # return x
37
- return self.model(x)
38
-
39
- def compute_outputs(self, dataloader_test):
40
- output_arr = []
41
- with torch.no_grad():
42
- for data in dataloader_test:
43
- inputs, labels = data
44
- # calculate outputs by running inputs through the network
45
- outputs = self(inputs)
46
- output_arr.append(outputs.numpy())
47
-
48
- # <ipython-input-44-114ac3037693>:54: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:245.)
49
- t = torch.tensor(output_arr)
50
- return torch.squeeze(t)
51
-
52
- # def compute_accuracy(self, dataloader_test):
53
- # correct = 0
54
- # total = 0
55
- # # since we're not training, we don't need to calculate the gradients for our outputs
56
- # with torch.no_grad():
57
- # for data in dataloader_test:
58
- # inputs, labels = data
59
- # # calculate outputs by running inputs through the network
60
- # outputs = self(inputs)
61
- # # the class with the highest energy is what we choose as prediction
62
- # _, predicted = torch.max(outputs.data, 1)
63
- # total += labels.size(0)
64
- # correct += (predicted == labels).sum().item()
65
-
66
- # print(f'Accuracy of the network: {100 * correct // total} %')
ddi_fw/experiments/pipeline.py DELETED
@@ -1,132 +0,0 @@
1
- import sqlite3
2
- from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
3
- from keras.models import Model, Sequential
4
- from keras.callbacks import EarlyStopping
5
- from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
6
- from tensorflow import keras
7
- from ddi_fw.experiments import TFSingleModal, TFMultiModal
8
- from ddi_fw.experiments import evaluate
9
- from sklearn.preprocessing import LabelBinarizer
10
- import numpy as np
11
- import pandas as pd
12
- from ddi_fw.utils import ZipHelper, Py7ZipHelper
13
- import os
14
- import chromadb
15
- from collections import defaultdict
16
- from langchain_community.vectorstores import Chroma
17
- from ddi_fw.ner.ner import CTakesNER
18
- from ddi_fw.langchain.embeddings import PoolingStrategy
19
-
20
- from ddi_fw.datasets import BaseDataset, DDIMDLDataset
21
-
22
- from ddi_fw.langchain.embeddings import SumPoolingStrategy
23
- from keras import metrics
24
- from ddi_fw.experiments.evaluation_helper import evaluate
25
-
26
- import mlflow
27
-
28
-
29
- class Experiment:
30
- def __init__(self,
31
- experiment_name=None,
32
- experiment_description=None,
33
- experiment_tags=None,
34
- tracking_uri=None,
35
- dataset_type:BaseDataset=None,
36
- columns=None,
37
- embedding_dict = None,
38
- vector_db_persist_directory=None,
39
- vector_db_collection_name=None,
40
- embedding_pooling_strategy_type:PoolingStrategy=None,
41
- ner_data_file=None,
42
- ner_threshold=None,
43
- combinations=None,
44
- model=None):
45
-
46
- self.experiment_name = experiment_name
47
- self.experiment_description = experiment_description
48
- self.experiment_tags = experiment_tags
49
- self.tracking_uri = tracking_uri
50
- self.dataset_type = dataset_type
51
- self.columns = columns
52
- self.embedding_dict = embedding_dict
53
- self.vector_db_persist_directory = vector_db_persist_directory
54
- self.vector_db_collection_name = vector_db_collection_name
55
- self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
56
- self.ner_data_file = ner_data_file
57
- self.ner_threshold = ner_threshold
58
- self.combinations = combinations
59
- self.model = model
60
-
61
- def build(self):
62
- # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
63
- kwargs = {"columns": self.columns}
64
- for k, v in self.ner_threshold.items():
65
- kwargs[k] = v
66
- if self.embedding_dict == None:
67
- if self.vector_db_persist_directory:
68
- self.vector_db = chromadb.PersistentClient(
69
- path=self.vector_db_persist_directory)
70
- self.collection = self.vector_db.get_collection(
71
- self.vector_db_collection_name)
72
- dictionary = self.collection.get(include=['embeddings', 'metadatas'])
73
-
74
- embedding_dict = defaultdict(lambda: defaultdict(list))
75
-
76
- for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
77
- embedding_dict[metadata["type"]][metadata["id"]].append(embedding)
78
-
79
- embedding_size = dictionary['embeddings'].shape[1]
80
- else:
81
- embedding_dict = self.embedding_dict
82
- embedding_size = list(embedding_dict['all_text'].values())[0][0].shape
83
-
84
- pooling_strategy = self.embedding_pooling_strategy_type()
85
-
86
- self.ner_df = CTakesNER().load(filename=self.ner_data_file) if self.ner_data_file else None
87
-
88
- self.dataset = self.dataset_type(
89
- embedding_dict=embedding_dict,
90
- embedding_size=embedding_size,
91
- embeddings_pooling_strategy=pooling_strategy,
92
- ner_df=self.ner_df, **kwargs)
93
-
94
- X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset.load()
95
-
96
- self.dataframe = self.dataset.dataframe
97
- # dataframe.dropna()
98
- self.X_train = self.dataset.X_train
99
- self.X_test = self.dataset.X_test
100
- self.y_train = self.dataset.y_train
101
- self.y_test = self.dataset.y_test
102
- self.train_idx_arr = self.dataset.train_idx_arr
103
- self.val_idx_arr = self.dataset.val_idx_arr
104
- # Logic to set up the experiment
105
- self.items = self.dataset.produce_inputs()
106
-
107
- unique_classes = pd.unique(self.dataframe['event_category'])
108
- event_num = len(unique_classes)
109
- # droprate = 0.3
110
- vector_size = self.dataset.drugs_df.shape[0]
111
-
112
- print("Building the experiment with the following settings:")
113
- print(
114
- f"Name: {self.experiment_name}, Dataset: {self.dataset}, Model: {self.model}")
115
- # Implement additional build logic as needed
116
- return self
117
-
118
- def run(self, model_func, batch_size=128, epochs=100):
119
- mlflow.set_tracking_uri(self.tracking_uri)
120
-
121
- if mlflow.get_experiment_by_name(self.experiment_name) == None:
122
- mlflow.create_experiment(self.experiment_name)
123
- mlflow.set_experiment_tags(self.experiment_tags)
124
- mlflow.set_experiment(self.experiment_name)
125
-
126
- y_test_label = self.items[0][4]
127
- multi_modal = TFMultiModal(
128
- model_func=model_func, batch_size=batch_size, epochs=epochs) # 100
129
- multi_modal.set_data(
130
- self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
131
- result = multi_modal.predict(self.combinations)
132
- return result
ddi_fw/experiments/pipeline_ner.py DELETED
@@ -1,116 +0,0 @@
1
- from collections import defaultdict
2
- from enum import Enum
3
- import numpy as np
4
- import pandas as pd
5
- from ddi_fw.datasets.core import BaseDataset
6
- from ddi_fw.experiments.tensorflow_helper import TFMultiModal
7
- from ddi_fw.experiments.pipeline import Experiment
8
- from typing import Dict, List
9
- from itertools import product
10
-
11
- from ddi_fw.utils.enums import DrugBankTextDataTypes, UMLSCodeTypes
12
- import mlflow
13
- from ddi_fw.ner.ner import CTakesNER
14
-
15
- def stack(df_column):
16
- return np.stack(df_column.values)
17
-
18
-
19
- class NerParameterSearch:
20
- def __init__(self,
21
- experiment_name,
22
- experiment_description,
23
- experiment_tags,
24
- tracking_uri,
25
- dataset_type: BaseDataset,
26
- ner_data_file,
27
- columns:list,
28
- umls_code_types: List[UMLSCodeTypes],
29
- text_types=List[DrugBankTextDataTypes],
30
- min_threshold_dict: Dict[str, float] = defaultdict(float),
31
- max_threshold_dict: Dict[str, float] = defaultdict(float),
32
- increase_step=0.5):
33
- self.experiment_name = experiment_name
34
- self.experiment_description = experiment_description
35
- self.experiment_tags = experiment_tags
36
- self.tracking_uri = tracking_uri
37
-
38
- self.dataset_type = dataset_type
39
- self.ner_data_file = ner_data_file
40
- self.columns = columns
41
- self.umls_code_types = umls_code_types
42
- self.text_types = text_types
43
- self.min_threshold_dict = min_threshold_dict
44
- self.max_threshold_dict = max_threshold_dict
45
- self.increase_step = increase_step
46
-
47
- def build(self):
48
- self.datasets = {}
49
- self.items = []
50
- # columns = ['tui', 'cui', 'entities']
51
- if self.umls_code_types is not None and self.text_types is not None:
52
- # add checking statements
53
- _umls_codes = [t.value[0] for t in self.umls_code_types]
54
- _text_types = [t.value[0] for t in self.text_types]
55
- _columns = [f'{item[0]}_{item[1]}' for item in product(
56
- _umls_codes, _text_types)]
57
- self.columns.extend(_columns)
58
- print(f'Columns: {self.columns}')
59
- self.ner_df = CTakesNER().load(filename=self.ner_data_file) if self.ner_data_file else None
60
- for column in self.columns:
61
- min_threshold = self.min_threshold_dict[column]
62
- max_threshold = self.max_threshold_dict[column]
63
- kwargs = {}
64
- kwargs['threshold_method'] = 'idf'
65
- kwargs['tui_threshold'] = 0
66
- kwargs['cui_threshold'] = 0
67
- kwargs['entities_threshold'] = 0
68
-
69
- for threshold in np.arange(min_threshold, max_threshold, self.increase_step):
70
- print(threshold)
71
- if column.startswith('tui'):
72
- kwargs['tui_threshold'] = threshold
73
- if column.startswith('cui'):
74
- kwargs['cui_threshold'] = threshold
75
- if column.startswith('entities'):
76
- kwargs['entities_threshold'] = threshold
77
- dataset = self.dataset_type(
78
- # chemical_property_columns=[],
79
- # embedding_columns=[],
80
- # ner_columns=[column],
81
- columns=[column],
82
- ner_df= self.ner_df,
83
- embedding_size = None,
84
- embedding_dict = None,
85
- embeddings_pooling_strategy = None,
86
- **kwargs)
87
-
88
- # train_idx_arr, val_idx_arr bir kez hesaplanması yeterli aslında
89
- X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = dataset.load()
90
- group_items = dataset.produce_inputs()
91
- for item in group_items:
92
- # item[0] = f'threshold_{threshold}_{item[0]}'
93
- item[0] = f'threshold_{item[0]}_{threshold}'
94
- self.datasets[item[0]] = dataset.ddis_df
95
-
96
- self.items.extend(group_items)
97
- self.y_test_label = self.items[0][4]
98
- self.train_idx_arr = train_idx_arr
99
- self.val_idx_arr = val_idx_arr
100
-
101
-
102
- def run(self, model_func, batch_size=128, epochs=100):
103
- mlflow.set_tracking_uri(self.tracking_uri)
104
-
105
- if mlflow.get_experiment_by_name(self.experiment_name) == None:
106
- mlflow.create_experiment(self.experiment_name)
107
- mlflow.set_experiment_tags(self.experiment_tags)
108
- mlflow.set_experiment(self.experiment_name)
109
-
110
- y_test_label = self.items[0][4]
111
- multi_modal = TFMultiModal(
112
- model_func=model_func, batch_size=batch_size, epochs=epochs) # 100
113
- multi_modal.set_data(
114
- self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
115
- result = multi_modal.predict()
116
- return result
ddi_fw/experiments/tensorflow_helper.py DELETED
@@ -1,284 +0,0 @@
1
- from matplotlib import pyplot as plt
2
- import tensorflow as tf
3
- from tensorflow import keras
4
- from keras.models import Model, Sequential
5
- from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
6
- from keras.callbacks import EarlyStopping
7
- from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
8
- import numpy as np
9
-
10
- import mlflow
11
- from mlflow.utils.autologging_utils import batch_metrics_logger
12
- import time
13
-
14
- from mlflow.models import infer_signature
15
- from ddi_fw.experiments.evaluation_helper import evaluate
16
-
17
- # import tf2onnx
18
- # import onnx
19
-
20
- import itertools
21
- import ddi_fw.utils as utils
22
-
23
- # https://github.com/YifanDengWHU/DDIMDL/blob/master/newTask.py
24
- # from numpy.random import seed
25
- # seed(1)
26
- # from tensorflow import set_random_seed
27
- # set_random_seed(2)
28
- tf.random.set_seed(1)
29
- np.random.seed(2)
30
- np.set_printoptions(precision=4)
31
-
32
-
33
- class Result:
34
- def __init__(self) -> None:
35
- self.log_dict = {}
36
- self.metric_dict = {}
37
-
38
- def add_log(self, key, logs):
39
- self.log_dict[key] = logs
40
-
41
- def add_metric(self, key, metrics):
42
- self.metric_dict[key] = metrics
43
-
44
-
45
- class TFMultiModal:
46
- # todo model related parameters to config
47
- def __init__(self, model_func, batch_size=128, epochs=100):
48
- self.model_func = model_func
49
- self.batch_size = batch_size
50
- self.epochs = epochs
51
- self.result = Result()
52
-
53
- def set_data(self, items, train_idx_arr, val_idx_arr, y_test_label):
54
- self.items = items
55
- self.train_idx_arr = train_idx_arr
56
- self.val_idx_arr = val_idx_arr
57
- self.y_test_label = y_test_label
58
-
59
- def predict(self, combinations: list = [], generate_combinations=False):
60
- self.prefix = utils.utc_time_as_string()
61
- self.date = utils.utc_time_as_string_simple_format()
62
- sum = np.zeros(
63
- (self.y_test_label.shape[0], self.y_test_label.shape[1]))
64
- single_results = dict()
65
-
66
- if generate_combinations:
67
- l = [item[0] for item in self.items]
68
- combinations = []
69
- for i in range(2, len(l) + 1):
70
- combinations.extend(list(itertools.combinations(l, i))) # all
71
-
72
- with mlflow.start_run(run_name=self.prefix, description="***") as run:
73
- self.level_0_run_id = run.info.run_id
74
- for item in self.items:
75
- print(item[0])
76
- single_modal = TFSingleModal(
77
- self.date, item[0], self.model_func, self.batch_size, self.epochs)
78
- single_modal.set_data(
79
- self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
80
- logs, metrics, prediction = single_modal.predict()
81
- self.result.add_log(item[0], logs)
82
- # self.result.add_metric(item[0], metrics)
83
- # single_results[item[0]] = prediction
84
- single_results[item[0]] = tf.nn.softmax(prediction).numpy()
85
- # sum = sum + prediction
86
-
87
- if combinations:
88
- self.evaluate_combinations(single_results, combinations)
89
- # TODO: sum'a gerek yok
90
- return self.result
91
-
92
- def evaluate_combinations(self, single_results, combinations):
93
- for combination in combinations:
94
- combination_descriptor = '-'.join(combination)
95
- with mlflow.start_run(run_name=combination_descriptor, description="***", nested=True) as combination_run:
96
- prediction = np.zeros(
97
- (self.y_test_label.shape[0], self.y_test_label.shape[1]))
98
- for item in combination:
99
- prediction = prediction + single_results[item]
100
- logs, metrics = evaluate(
101
- actual=self.y_test_label, pred=prediction, info=combination_descriptor)
102
- mlflow.log_metrics(logs)
103
- metrics.format_float()
104
- # TODO path bulunamadı hatası aldık
105
- print(
106
- f'combination_artifact_uri:{combination_run.info.artifact_uri}')
107
- utils.compress_and_save_data(
108
- metrics.__dict__, combination_run.info.artifact_uri, f'{self.date}_metrics.gzip')
109
- # self.result.add_log(combination_descriptor,logs)
110
- # self.result.add_metric(combination_descriptor,metrics)
111
-
112
-
113
- class TFSingleModal:
114
- def __init__(self, date, descriptor, model_func, batch_size=128, epochs=100):
115
- self.date = date
116
- self.descriptor = descriptor
117
- self.model_func = model_func
118
- self.batch_size = batch_size
119
- self.epochs = epochs
120
-
121
- def set_data(self, train_idx_arr, val_idx_arr, train_data, train_label, test_data, test_label):
122
- self.train_idx_arr = train_idx_arr
123
- self.val_idx_arr = val_idx_arr
124
- self.train_data = train_data
125
- self.train_label = train_label
126
- self.test_data = test_data
127
- self.test_label = test_label
128
-
129
- # https://github.com/mlflow/mlflow/blob/master/examples/tensorflow/train.py
130
- def predict(self):
131
- print(self.train_data.shape)
132
-
133
- # Failed to convert a NumPy array to a Tensor
134
- with mlflow.start_run(run_name=self.descriptor, description="***", nested=True) as run:
135
- models = dict()
136
- histories = dict()
137
- models_val_acc = dict()
138
- # with batch_metrics_logger(run_id) as metrics_logger:
139
- for i, (train_idx, val_idx) in enumerate(zip(self.train_idx_arr, self.val_idx_arr)):
140
- print(f"Validation {i}")
141
-
142
- with mlflow.start_run(run_name=f'Validation {i}', description='CV models', nested=True) as cv_fit:
143
- model = self.model_func(self.train_data.shape[1])
144
- models[f'validation_{i}'] = model
145
- X_train_cv = self.train_data[train_idx]
146
- y_train_cv = self.train_label[train_idx]
147
- X_valid_cv = self.train_data[val_idx]
148
- y_valid_cv = self.train_label[val_idx]
149
-
150
- early_stopping = EarlyStopping(
151
- monitor='val_loss', patience=10, verbose=0, mode='auto')
152
- custom_callback = CustomCallback()
153
- history = model.fit(X_train_cv, y_train_cv,
154
- batch_size=self.batch_size,
155
- epochs=self.epochs,
156
- validation_data=(
157
- X_valid_cv, y_valid_cv),
158
- callbacks=[early_stopping, custom_callback])
159
- # histories[f'validation_{i}'] = history
160
- models_val_acc[f'validation_{i}'] = history.history['val_accuracy'][-1]
161
- # Saving each CV model
162
-
163
- best_model_key = max(models_val_acc, key=models_val_acc.get)
164
- best_model = models[best_model_key]
165
- best_model.evaluate(self.test_data, self.test_label,
166
- callbacks=[custom_callback])
167
- pred = best_model.predict(self.test_data)
168
-
169
- logs, metrics = evaluate(
170
- actual=self.test_label, pred=pred, info=self.descriptor)
171
- metrics.format_float()
172
- mlflow.log_metrics(logs)
173
- mlflow.log_param('best_cv', best_model_key)
174
- signature = infer_signature(
175
- self.train_data,
176
- # generate_signature_output(model,X_valid_cv)
177
- # params=params,
178
- )
179
-
180
- mlflow.keras.save_model(
181
- best_model,
182
- path=run.info.artifact_uri + '/model',
183
- signature=signature,
184
- )
185
- print(run.info.artifact_uri)
186
- # todo tf2onnx not compatible with keras > 2.15
187
- # onnx_model, _ = tf2onnx.convert.from_keras(
188
- # best_model, input_signature=None, opset=13)
189
- # onnx.save(onnx_model, run.info.artifact_uri +
190
- # '/model/model.onnx')
191
- utils.compress_and_save_data(
192
- metrics.__dict__, run.info.artifact_uri, f'{self.date}_metrics.gzip')
193
-
194
- return logs, metrics, pred
195
-
196
-
197
- class CustomCallback(keras.callbacks.Callback):
198
- def on_train_begin(self, logs=None):
199
- keys = list(logs.keys())
200
- mlflow.log_param("train_begin_keys", keys)
201
- config = self.model.optimizer.get_config()
202
- for attribute in config:
203
- mlflow.log_param("opt_" + attribute, config[attribute])
204
-
205
- sum_list = []
206
- self.model.summary(print_fn=sum_list.append)
207
- summary = "\n".join(sum_list)
208
- mlflow.log_text(summary, artifact_file="model_summary.txt")
209
-
210
- def on_train_end(self, logs=None):
211
- print(logs)
212
- mlflow.log_metrics(logs)
213
-
214
- def on_epoch_begin(self, epoch, logs=None):
215
- keys = list(logs.keys())
216
-
217
- def on_epoch_end(self, epoch, logs=None):
218
- keys = list(logs.keys())
219
-
220
- def on_test_begin(self, logs=None):
221
- keys = list(logs.keys())
222
-
223
- def on_test_end(self, logs=None):
224
- mlflow.log_metrics(logs)
225
- print(logs)
226
-
227
- def on_predict_begin(self, logs=None):
228
- keys = list(logs.keys())
229
-
230
- def on_predict_end(self, logs=None):
231
- keys = list(logs.keys())
232
- mlflow.log_metrics(logs)
233
-
234
- def on_train_batch_begin(self, batch, logs=None):
235
- keys = list(logs.keys())
236
-
237
- def on_train_batch_end(self, batch, logs=None):
238
- keys = list(logs.keys())
239
-
240
- def on_test_batch_begin(self, batch, logs=None):
241
- keys = list(logs.keys())
242
-
243
- def on_test_batch_end(self, batch, logs=None):
244
- keys = list(logs.keys())
245
-
246
- def on_predict_batch_begin(self, batch, logs=None):
247
- keys = list(logs.keys())
248
-
249
- def on_predict_batch_end(self, batch, logs=None):
250
- keys = list(logs.keys())
251
- # def on_train_begin(self, logs=None): # pylint: disable=unused-argument
252
- # config = self.model.optimizer.get_config()
253
- # for attribute in config:
254
- # mlflow.log_param("opt_" + attribute, config[attribute])
255
-
256
- # sum_list = []
257
- # self.model.summary(print_fn=sum_list.append)
258
- # summary = "\n".join(sum_list)
259
- # mlflow.log_text(summary, artifact_file="model_summary.txt")
260
-
261
- # def on_epoch_end(self, epoch, logs=None):
262
- # # NB: tf.Keras uses zero-indexing for epochs, while other TensorFlow Estimator
263
- # # APIs (e.g., tf.Estimator) use one-indexing. Accordingly, the modular arithmetic
264
- # # used here is slightly different from the arithmetic used in `_log_event`, which
265
- # # provides metric logging hooks for TensorFlow Estimator & other TensorFlow APIs
266
- # if epoch % self.log_every_n_steps == 0:
267
- # self.metrics_logger.record_metrics(logs, epoch)
268
-
269
- # def predict(self):
270
- # model = self.model_func()
271
- # # Failed to convert a NumPy array to a Tensor
272
- # for i, (train_idx, val_idx) in enumerate(zip(self.train_idx_arr, self.val_idx_arr)):
273
- # print(f"Validation {i}")
274
- # X_train_cv = self.train_data[train_idx]
275
- # y_train_cv = self.train_label[train_idx]
276
- # X_valid_cv = self.train_data[val_idx]
277
- # y_valid_cv = self.train_label[val_idx]
278
-
279
- # early_stopping = EarlyStopping(
280
- # monitor='val_loss', patience=10, verbose=0, mode='auto')
281
- # model.fit(X_train_cv, y_train_cv, batch_size=128, epochs=20, validation_data=(X_valid_cv, y_valid_cv),
282
- # callbacks=[early_stopping])
283
- # pred = model.predict(self.test_data)
284
- # return pred
ddi_fw/experiments/test.py DELETED
@@ -1,61 +0,0 @@
1
- # # https://github.com/kashif/tf-keras-tutorial/blob/tf2/3-imdb.ipynb
2
- # # TensorFlow and tf.keras
3
- # import tensorflow as tf
4
-
5
- # # Helper libraries
6
- # import numpy as np
7
- # import matplotlib.pyplot as plt
8
- # from tensorflow_helper import CustomCallback
9
-
10
- # print(tf.__version__)
11
-
12
-
13
- # imdb = tf.keras.datasets.imdb
14
-
15
- # (train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data(num_words=10000)
16
-
17
-
18
- # class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
19
- # 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
20
-
21
-
22
- # # Create a model
23
- # model = tf.keras.Sequential()
24
- # custom_callback = CustomCallback()
25
-
26
- # # input shape here is the length of our movie review vector
27
- # model.add(tf.keras.layers.Dense(16, activation=tf.nn.relu, input_shape=(10000,)))
28
- # model.add(tf.keras.layers.Dense(16, activation=tf.nn.relu))
29
- # model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))
30
-
31
- # optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
32
-
33
- # model.compile(loss='binary_crossentropy',
34
- # optimizer=optimizer,
35
- # metrics=['binary_accuracy'])
36
-
37
- # model.summary()
38
-
39
- # VAL_SIZE = 10000
40
- # x = np.array(train_data[:VAL_SIZE].tolist())
41
-
42
- # val_data = np.asarray(train_data[:VAL_SIZE])
43
- # partial_train_data = np.asarray(train_data[VAL_SIZE:])
44
-
45
-
46
- # val_labels = train_labels[:VAL_SIZE]
47
- # partial_train_labels = train_labels[VAL_SIZE:]
48
-
49
- # BATCH_SIZE = 512
50
- # SHUFFLE_SIZE = 1000
51
-
52
- # # training_set = tf.data.Dataset.from_tensor_slices((partial_train_data, partial_train_labels))
53
- # # training_set = training_set.shuffle(SHUFFLE_SIZE).batch(BATCH_SIZE)
54
-
55
- # model.fit(partial_train_data , partial_train_labels , batch_size=128, epochs=20, validation_data=(val_data , val_labels ),
56
- # callbacks=[custom_callback])
57
-
58
- # loss, accuracy = model.evaluate(test_data, test_labels,callbacks=[custom_callback])
59
- # print('Test accuracy: %.2f' % (accuracy))
60
-
61
- from langchain.embeddings import SentenceTransformerEmbeddings
ddi_fw/experiments/evaluation_helper.py → ddi_fw/ml/evaluation_helper.py (file moved without changes)