ddi-fw 0.0.149__py3-none-any.whl → 0.0.150__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- ddi_fw/datasets/__init__.py +1 -1
- ddi_fw/datasets/core.py +147 -341
- ddi_fw/datasets/dataset_splitter.py +39 -0
- ddi_fw/datasets/ddi_mdl/base.py +194 -130
- ddi_fw/datasets/ddi_mdl/debug.log +1 -0
- ddi_fw/datasets/embedding_generator.py +2 -1
- ddi_fw/langchain/embeddings.py +1 -0
- ddi_fw/ml/evaluation_helper.py +47 -178
- ddi_fw/ml/ml_helper.py +125 -81
- ddi_fw/ml/model_wrapper.py +2 -2
- ddi_fw/ml/pytorch_wrapper.py +175 -72
- ddi_fw/ml/tensorflow_wrapper.py +131 -39
- ddi_fw/ner/ner.py +93 -39
- ddi_fw/pipeline/multi_modal_combination_strategy.py +4 -2
- ddi_fw/pipeline/multi_pipeline.py +2 -15
- ddi_fw/pipeline/ner_pipeline.py +15 -6
- ddi_fw/pipeline/pipeline.py +152 -94
- ddi_fw/{test/compress_json_test.py → utils/json_helper.py} +1 -15
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.150.dist-info}/METADATA +6 -3
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.150.dist-info}/RECORD +22 -31
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.150.dist-info}/WHEEL +1 -1
- ddi_fw/test/__init__.py +0 -0
- ddi_fw/test/basic_test.py +0 -15
- ddi_fw/test/combination_test.py +0 -12
- ddi_fw/test/date_test.py +0 -15
- ddi_fw/test/idf_score.py +0 -54
- ddi_fw/test/jaccard_similarity.py +0 -85
- ddi_fw/test/mlfow_test.py +0 -165
- ddi_fw/test/sklearn-tfidf.py +0 -16
- ddi_fw/test/test.py +0 -93
- ddi_fw/test/torch_cuda_test.py +0 -9
- ddi_fw/test/type_guarding_test.py +0 -18
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.150.dist-info}/top_level.txt +0 -0
ddi_fw/pipeline/pipeline.py
CHANGED
@@ -1,7 +1,12 @@
+from typing import Any, Dict, List, Optional, Type, Union
 import numpy as np
 import pandas as pd
 import chromadb
 from collections import defaultdict
+from chromadb.api.types import IncludeEnum
+
+from pydantic import BaseModel
+from ddi_fw.datasets.core import TextDatasetMixin
 from ddi_fw.ner.ner import CTakesNER
 from ddi_fw.langchain.embeddings import PoolingStrategy
 from ddi_fw.datasets import BaseDataset, DDIMDLDataset
@@ -10,44 +15,81 @@ import mlflow
 from ddi_fw.ml import MultiModalRunner
 
 
-class Pipeline:
-    def __init__(self,
-                 library='tensorflow',
-                 experiment_name=None,
-                 experiment_description=None,
-                 experiment_tags=None,
-                 artifact_location=None,
-                 tracking_uri=None,
-                 dataset_type: BaseDataset = None,
-                 columns=None,
-                 embedding_dict=None,
-                 column_embedding_configs=None,
-                 vector_db_persist_directory=None,
-                 vector_db_collection_name=None,
-                 embedding_pooling_strategy_type: PoolingStrategy = None,
-                 ner_data_file=None,
-                 ner_threshold=None,
-                 combinations=None,
-                 model=None,
-                 multi_modal = None ):
-        self.library = library
-        self.experiment_name = experiment_name
-        self.experiment_description = experiment_description
-        self.experiment_tags = experiment_tags
-        self.artifact_location = artifact_location
-        self.tracking_uri = tracking_uri
-        self.dataset_type = dataset_type
-        self.columns = columns
-        self.embedding_dict = embedding_dict
-        self.column_embedding_configs = column_embedding_configs
-        self.vector_db_persist_directory = vector_db_persist_directory
-        self.vector_db_collection_name = vector_db_collection_name
-        self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
-        self.ner_data_file = ner_data_file
-        self.ner_threshold = ner_threshold
-        self.combinations = combinations
-        self.model = model
-        self.multi_modal = multi_modal
+class Pipeline(BaseModel):
+    library: str = 'tensorflow'
+    experiment_name: str
+    experiment_description: str
+    experiment_tags: Optional[Dict[str, Any]] = None
+    artifact_location: Optional[str] = None
+    tracking_uri: Optional[str] = None
+    dataset_type: Type[BaseDataset]
+    columns: Optional[List[str]] = None
+    embedding_dict: Optional[Dict[str, Any]] = None
+    column_embedding_configs: Optional[Dict] = None
+    vector_db_persist_directory: Optional[str] = None
+    vector_db_collection_name: Optional[str] = None
+    embedding_pooling_strategy_type: Type[PoolingStrategy] | None = None
+    ner_data_file: Optional[str] = None
+    ner_threshold: Optional[dict] = None
+    combinations: Optional[List[str]] = None
+    model: Optional[Any] = None
+    multi_modal: Optional[Any] = None
+    use_mlflow: bool = True
+    _items: List = []
+    _train_idx_arr: List | None = []
+    _val_idx_arr: List | None = []
+
+    @property
+    def items(self) -> List:
+        return self._items
+    @property
+    def train_idx_arr(self) -> List | None:
+        return self._train_idx_arr
+    @property
+    def val_idx_arr(self) -> List | None:
+        return self._val_idx_arr
+
+    class Config:
+        arbitrary_types_allowed = True
+
+# class Pipeline:
+#     def __init__(self,
+#                  library='tensorflow',
+#                  experiment_name=None,
+#                  experiment_description=None,
+#                  experiment_tags=None,
+#                  artifact_location=None,
+#                  tracking_uri=None,
+#                  dataset_type: BaseDataset = None,
+#                  columns=None,
+#                  embedding_dict=None,
+#                  column_embedding_configs=None,
+#                  vector_db_persist_directory=None,
+#                  vector_db_collection_name=None,
+#                  embedding_pooling_strategy_type: PoolingStrategy = None,
+#                  ner_data_file=None,
+#                  ner_threshold=None,
+#                  combinations=None,
+#                  model=None,
+#                  multi_modal = None ):
+#         self.library = library
+#         self.experiment_name = experiment_name
+#         self.experiment_description = experiment_description
+#         self.experiment_tags = experiment_tags
+#         self.artifact_location = artifact_location
+#         self.tracking_uri = tracking_uri
+#         self.dataset_type = dataset_type
+#         self.columns = columns
+#         self.embedding_dict = embedding_dict
+#         self.column_embedding_configs = column_embedding_configs
+#         self.vector_db_persist_directory = vector_db_persist_directory
+#         self.vector_db_collection_name = vector_db_collection_name
+#         self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
+#         self.ner_data_file = ner_data_file
+#         self.ner_threshold = ner_threshold
+#         self.combinations = combinations
+#         self.model = model
+#         self.multi_modal = multi_modal
 
     def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
         """
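Note on the pattern above: in pydantic v2 (the metadata below pins pydantic==2.10.6), annotated attributes whose names start with an underscore, such as _items, become private attributes rather than validated fields, and arbitrary_types_allowed lets fields hold non-pydantic objects such as datasets or models. A minimal self-contained sketch of the same pattern, with illustrative names that are not part of ddi_fw:

    from typing import Any, List, Optional
    from pydantic import BaseModel

    class Job(BaseModel):
        name: str
        model: Optional[Any] = None   # may hold any object, e.g. a Keras model
        _results: List = []           # private attribute: not validated, not serialized

        @property
        def results(self) -> List:    # read-only accessor, mirroring Pipeline.items
            return self._results

        class Config:
            arbitrary_types_allowed = True  # mirrors the diff; needed for non-pydantic field types

    job = Job(name="demo")       # _results is not a constructor field
    job._results.append(0.93)    # populated internally, e.g. by a build() step
    print(job.results)           # [0.93]
    print(job.model_dump())      # {'name': 'demo', 'model': None}, private attr excluded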
@@ -64,29 +106,47 @@ class Pipeline:
             vector_db = chromadb.PersistentClient(
                 path=vector_db_persist_directory)
             collection = vector_db.get_collection(vector_db_collection_name)
-
+            include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
+            dictionary: chromadb.GetResult
             # Fetch the embeddings and metadata
             if column == None:
                 dictionary = collection.get(
-                    include=['embeddings', 'metadatas'])
+                    include=include
+                    # include=['embeddings', 'metadatas']
+                )
                 print(
                     f"Embeddings are calculated from {vector_db_collection_name}")
             else:
-                dictionary = collection.get(
-                    include=['embeddings', 'metadatas'], where={"type": {"$eq": f"{column}"}})
+                dictionary = collection.get(
+                    include=include,
+                    # include=['embeddings', 'metadatas'],
+                    where={
+                        "type": {"$eq": f"{column}"}})
                 print(
                     f"Embeddings of {column} are calculated from {vector_db_collection_name}")
+
             # Populate the embedding dictionary with embeddings from the vector database
-            for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
+            metadatas = dictionary["metadatas"]
+            embeddings = dictionary["embeddings"]
+            if metadatas == None or embeddings == None:
+                raise ValueError(
+                    "The collection does not contain embeddings or metadatas.")
+            for metadata, embedding in zip(metadatas, embeddings):
                 embedding_dict[metadata["type"]
                                ][metadata["id"]].append(embedding)
 
-            # return dictionary['embeddings'].shape[1]
         else:
             raise ValueError(
                 "Persistent directory for the vector DB is not specified.")
 
     def build(self):
+        if self.embedding_pooling_strategy_type is not None and not isinstance(self.embedding_pooling_strategy_type, type):
+            raise TypeError(
+                "self.embedding_pooling_strategy_type must be a class, not an instance")
+        if not isinstance(self.dataset_type, type):
+            raise TypeError(
+                "self.dataset_type must be a class, not an instance")
+
         # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
         kwargs = {"columns": self.columns}
         if self.ner_threshold:
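Note: the hunk above replaces the raw include=['embeddings', 'metadatas'] strings with the typed IncludeEnum and guards against collections that return no embeddings or metadata. A hedged sketch of the same chromadb access pattern, with a collection name and metadata values that are illustrative rather than taken from ddi_fw:

    import chromadb
    from chromadb.api.types import IncludeEnum

    client = chromadb.PersistentClient(path="./chroma_demo")  # assumed local path
    collection = client.get_or_create_collection("demo")
    collection.add(
        ids=["d1", "d2"],
        embeddings=[[0.1, 0.2], [0.3, 0.4]],
        metadatas=[{"type": "target", "id": "d1"}, {"type": "enzyme", "id": "d2"}],
    )

    # Typed include list; `where` filters on metadata, as in the column branch above.
    result = collection.get(
        include=[IncludeEnum.embeddings, IncludeEnum.metadatas],
        where={"type": {"$eq": "target"}},
    )
    for metadata, embedding in zip(result["metadatas"], result["embeddings"]):
        print(metadata["id"], embedding)  # e.g. d1 [0.1 0.2]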
@@ -103,90 +163,88 @@ class Pipeline:
                     col_db_collection = item["vector_db_collection_name"]
                     self.__create_or_update_embeddings__(
                         embedding_dict, col_db_dir, col_db_collection, col)
-
+
             elif self.vector_db_persist_directory:
                 self.__create_or_update_embeddings__(
                     embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
-
+
             else:
                 print(
                     f"There is no configuration of Embeddings")
 
-            # if self.embedding_dict == None:
-            #     if self.vector_db_persist_directory:
-            #         self.vector_db = chromadb.PersistentClient(
-            #             path=self.vector_db_persist_directory)
-            #         self.collection = self.vector_db.get_collection(
-            #             self.vector_db_collection_name)
-            #         dictionary = self.collection.get(
-            #             include=['embeddings', 'metadatas'])
-
-            #         embedding_dict = defaultdict(lambda: defaultdict(list))
-
-            #         for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
-            #             embedding_dict[metadata["type"]
-            #                            ][metadata["id"]].append(embedding)
-
-            #         embedding_size = dictionary['embeddings'].shape[1]
         else:
             embedding_dict = self.embedding_dict
             # TODO make generic
             # embedding_size = list(embedding_dict['all_text'].values())[
             # 0][0].shape
-        key, value = next(iter(embedding_dict.items()))
-        embedding_size = value[next(iter(value))][0].shape[0]
-        pooling_strategy = self.embedding_pooling_strategy_type()
 
-        self.ner_df = CTakesNER().load(
-            filename=self.ner_data_file) if self.ner_data_file else None
+        # self.ner_df = CTakesNER(df=None).load(
+        #     filename=self.ner_data_file) if self.ner_data_file else None
 
-        self.dataset = self.dataset_type(
-            embedding_dict=embedding_dict,
-            embedding_size=embedding_size,
-            embeddings_pooling_strategy=pooling_strategy,
-            **kwargs)
+        if issubclass(self.dataset_type, TextDatasetMixin):
+            key, value = next(iter(embedding_dict.items()))
+            embedding_size = value[next(iter(value))][0].shape[0]
+            pooling_strategy = self.embedding_pooling_strategy_type() if self.embedding_pooling_strategy_type else None
+
+            dataset = self.dataset_type(
+                embedding_dict=embedding_dict,
+                embedding_size=embedding_size,
+                embeddings_pooling_strategy=pooling_strategy,
+                **kwargs)
+        else:
+            dataset = self.dataset_type(**kwargs)
 
-        X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset.load()
+        X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = dataset.load()
+
+        dataframe = dataset.dataframe
+
+        if dataframe is None:  # if the dataframe is None, it means that the dataset is not loaded
+            raise ValueError("The dataset is not loaded")
 
-        self.dataframe = self.dataset.dataframe
         # dataframe.dropna()
-        self.X_train = self.dataset.X_train
-        self.X_test = self.dataset.X_test
-        self.y_train = self.dataset.y_train
-        self.y_test = self.dataset.y_test
-        self.train_idx_arr = self.dataset.train_idx_arr
-        self.val_idx_arr = self.dataset.val_idx_arr
+        X_train = dataset.X_train
+        X_test = dataset.X_test
+        y_train = dataset.y_train
+        y_test = dataset.y_test
+        self._train_idx_arr = dataset.train_idx_arr
+        self._val_idx_arr = dataset.val_idx_arr
         # Logic to set up the experiment
         # column name, train data, train label, test data, test label
-        self.items = self.dataset.produce_inputs()
+        self._items = dataset.produce_inputs()
 
-        unique_classes = pd.unique(
-        event_num = len(unique_classes)
+        # unique_classes = pd.unique(dataframe[dataset.class_column])
+        # event_num = len(unique_classes)
         # droprate = 0.3
-        vector_size = self.dataset.drugs_df.shape[0]
+        # vector_size = self.dataset.drugs_df.shape[0]
 
         print("Building the experiment with the following settings:")
         print(
-            f"Name: {self.experiment_name}, Dataset: {self.dataset}, Model: {self.model}")
+            f"Name: {self.experiment_name}, Dataset: {dataset}, Model: {self.model}")
         # Implement additional build logic as needed
         return self
 
     def run(self):
-        mlflow.set_tracking_uri(self.tracking_uri)
-
-        if mlflow.get_experiment_by_name(self.experiment_name) == None:
-            mlflow.create_experiment(
-                self.experiment_name, self.artifact_location)
-            mlflow.set_experiment_tags(self.experiment_tags)
-        mlflow.set_experiment(self.experiment_name)
+        if self.use_mlflow:
+            if self.tracking_uri is None:
+                raise ValueError("Tracking uri should be specified")
+            mlflow.set_tracking_uri(self.tracking_uri)
+
+            if mlflow.get_experiment_by_name(self.experiment_name) == None:
+                mlflow.create_experiment(
+                    self.experiment_name, self.artifact_location)
+            if self.experiment_tags is not None:
+                mlflow.set_experiment_tags(self.experiment_tags)
+            mlflow.set_experiment(self.experiment_name)
 
         y_test_label = self.items[0][4]
-        multi_modal_runner = MultiModalRunner(library=self.library, multi_modal=self.multi_modal)
+        multi_modal_runner = MultiModalRunner(
+            library=self.library, multi_modal=self.multi_modal)
         # multi_modal_runner = MultiModalRunner(
         #     library=self.library, model_func=model_func, batch_size=batch_size, epochs=epochs)
         # multi_modal = TFMultiModal(
         #     model_func=model_func, batch_size=batch_size, epochs=epochs) # 100
         multi_modal_runner.set_data(
             self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
-        result = multi_modal_runner.predict(self.combinations)
+        combinations = self.combinations if self.combinations is not None else []
+        result = multi_modal_runner.predict(combinations)
         return result
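Note: taken together, the refactor turns Pipeline into a declaratively configured model driven through build() and then run(). A sketch of how it might be instantiated, inferred only from the fields and methods visible in this diff; the import path and argument values are placeholders, and a real run would likely also need an embedding configuration (embedding_dict or a vector DB):

    from ddi_fw.datasets import DDIMDLDataset
    from ddi_fw.pipeline import Pipeline

    pipeline = Pipeline(
        experiment_name="ddi-demo",
        experiment_description="smoke test",
        dataset_type=DDIMDLDataset,  # must be a class; build() raises TypeError for instances
        columns=["smile", "target"],
        use_mlflow=False,            # run() only touches MLflow when this is True
    )
    result = pipeline.build().run()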
ddi_fw/{test/compress_json_test.py → utils/json_helper.py}
RENAMED
@@ -1,4 +1,3 @@
-from compress_json import compress, decompress
 import json
 import sys
 
@@ -9,17 +8,4 @@ def minify(folder, file_name):
     json_string = json.dumps(json_data, separators=(',', ":"))  # Compact JSON structure
     file_name = str(file_name).replace(".json", "")  # remove .json from end of file_name string
     new_file_name = folder+"/{0}_minify.json".format(file_name)
-    open(new_file_name, "w+", 1).write(json_string)  # open and write json_string to file
-
-json_file = f'C:\\Users\\kivanc\\Downloads\\metrics.json'
-data_file = f'C:\\Users\\kivanc\\Downloads\\data.json'
-
-minify('C:\\Users\\kivanc\\Downloads','metrics.json')
-
-# with open(json_file, 'r', encoding="utf8") as f:
-#     data = json.load(f)
-
-# compressed = compress(data)  # the result is a list (array)
-
-# with open(data_file, "w") as fd:
-#     fd.write(json.dumps(compressed))  # convert into string if needed
+    open(new_file_name, "w+", 1).write(json_string)  # open and write json_string to file
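Note: the move drops the module-level scratch code (hard-coded C:\Users\... paths and the commented compress_json experiment), leaving only the minify helper. Usage sketch with an illustrative path:

    from ddi_fw.utils.json_helper import minify

    minify("/tmp", "metrics.json")  # writes /tmp/metrics_minify.json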
{ddi_fw-0.0.149.dist-info → ddi_fw-0.0.150.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.149
+Version: 0.0.150
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -22,6 +22,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
+Requires-Dist: pydantic==2.10.6
 Requires-Dist: importlib-resources==6.4.5
 Requires-Dist: python-stopwatch==1.1.11
 Requires-Dist: lxml==5.3.0
@@ -35,7 +36,7 @@ Requires-Dist: rdkit==2023.3.3
 Requires-Dist: scikit-learn==1.5.2
 Requires-Dist: scipy==1.13.1
 Requires-Dist: accelerate>=0.33.0
-Requires-Dist: sentence-transformers
+Requires-Dist: sentence-transformers<=3.3.1,>=3.0.1
 Requires-Dist: transformers>=4.42.4
 Requires-Dist: stanza==1.9.2
 Requires-Dist: tokenizers>=0.19.1
@@ -49,3 +50,5 @@ Requires-Dist: chromadb>=0.5.15
 Requires-Dist: langchain_community==0.3.3
 Requires-Dist: datasets==3.0.2
 Requires-Dist: unstructured==0.16.3
+Requires-Dist: tensorflow<2.18.0,>=2.17.0
+Requires-Dist: tf-keras==2.17.0
{ddi_fw-0.0.149.dist-info → ddi_fw-0.0.150.dist-info}/RECORD
CHANGED
@@ -1,11 +1,13 @@
-ddi_fw/datasets/__init__.py,sha256=
-ddi_fw/datasets/core.py,sha256=
+ddi_fw/datasets/__init__.py,sha256=yDsRQD_9Ijpm_Rl2wSDwdutG5Q_wca_UBPEvm7nBx04,444
+ddi_fw/datasets/core.py,sha256=JA6WJz3VCUfxI85rYE7ZBqC4pnn7L8NSS9-EgjLw710,7968
+ddi_fw/datasets/dataset_splitter.py,sha256=lLIelXv-8rCK0tbwLNgHBHYUO_65HT-_kErAlZhRQVE,1662
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
-ddi_fw/datasets/embedding_generator.py,sha256=
+ddi_fw/datasets/embedding_generator.py,sha256=jiDKwLaPMaQkloxQkuCrhl-A-2OdvocmkSzjWtUnk4g,2255
 ddi_fw/datasets/feature_vector_generation.py,sha256=gvjpEzkgVV8dp4V8NMMv59u0v-1tNAJ7v83R-keWGoA,4748
 ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
-ddi_fw/datasets/ddi_mdl/base.py,sha256=
+ddi_fw/datasets/ddi_mdl/base.py,sha256=usw3AhBCjdYwZx9MMnyNaUYTEyYXoRSO4fNJJHxnPuk,9312
+ddi_fw/datasets/ddi_mdl/debug.log,sha256=eWz05j8RFqZuHFDTCF7Rck5w4rvtTanFN21iZsgxO7Y,115
 ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
 ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
 ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt,sha256=XVlDqYATckrQwNSXqMSKVBqyoN_Hg8SK6CL-XMdLADY,102176
@@ -70,43 +72,32 @@ ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9P
 ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
 ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
-ddi_fw/langchain/embeddings.py,sha256=
+ddi_fw/langchain/embeddings.py,sha256=XzIYgmqnAO93pnavKRDhYDoz0RhDn-RoC7CDc0yAvbM,7572
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
 ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
-ddi_fw/ml/evaluation_helper.py,sha256=
-ddi_fw/ml/ml_helper.py,sha256=
-ddi_fw/ml/model_wrapper.py,sha256=
-ddi_fw/ml/pytorch_wrapper.py,sha256=
-ddi_fw/ml/tensorflow_wrapper.py,sha256=
+ddi_fw/ml/evaluation_helper.py,sha256=JFATMquaQVa2gckxmEivCztZmivWBAAP7EpJ8PVeI3c,7626
+ddi_fw/ml/ml_helper.py,sha256=E6ef7f1UnQl6JBUdGDbbbI4FIS-904VGypT7tI0a598,8545
+ddi_fw/ml/model_wrapper.py,sha256=kabPXuo7S8tGkp9a00V04n4rXDmv7dD8wYGMjotISRc,1050
+ddi_fw/ml/pytorch_wrapper.py,sha256=pe6UsjP2XeTgLxDnIUiodoyhJTGCxV27wD4Cjxysu2Q,8553
+ddi_fw/ml/tensorflow_wrapper.py,sha256=jt6h9Q-wF0mkbnvV6yCCl1SpUd2paHK70Bu6EFrkmd0,10112
 ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
 ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6859
-ddi_fw/ner/ner.py,sha256=
+ddi_fw/ner/ner.py,sha256=FHyyX53Xwpdw8Hec261dyN88yD7Z9LmJua2mIrQLguI,17967
 ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
-ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=
-ddi_fw/pipeline/multi_pipeline.py,sha256=
-ddi_fw/pipeline/ner_pipeline.py,sha256=
-ddi_fw/pipeline/pipeline.py,sha256
-ddi_fw/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ddi_fw/test/basic_test.py,sha256=fEOGcZm1ObnsDvMiXNmdmz6YCeUrGc8V0DwlSwGhsq8,376
-ddi_fw/test/combination_test.py,sha256=TWNE8sf-DSh1Q9-yRaRBc774Sn1kSMGXLwQhd2_Qynk,324
-ddi_fw/test/compress_json_test.py,sha256=BGny56YqiG-pzhMoDzLKQBQI1E7o3jU0S7VYWtclAx4,1045
-ddi_fw/test/date_test.py,sha256=QmJ97ennS9LxLl8mGBkM2ob8_KWEFmiLakZTI9zQxxo,532
-ddi_fw/test/idf_score.py,sha256=YsAur-F1T3eFxn9KrcK3VXCvrsV_LXrpHxPjMKZeQZ8,1523
-ddi_fw/test/jaccard_similarity.py,sha256=pf6SNI52RCUZ0otx_1cz7A0p7kyfoCZv13Tbc_rxfuw,2382
-ddi_fw/test/mlfow_test.py,sha256=L2hJAeIU5PDSxsyWTtV6PY0bfaWerWUJ1buni9BTjXo,4853
-ddi_fw/test/sklearn-tfidf.py,sha256=cjtg27vLskEMXgrsqUR_EapRGVd4xgwOQ9zYsu72zjs,657
-ddi_fw/test/test.py,sha256=zJh9ZBcZl-vZIFDvuftcRrRV8WAwtiFVhPPd6Et4OU4,2997
-ddi_fw/test/torch_cuda_test.py,sha256=R-4VGVErl_Ufk54DoZbgL_YXWoCYFyanIVWd6P39IEk,312
-ddi_fw/test/type_guarding_test.py,sha256=KxjyBxohDu7lwpejalCj-REjtJ-k1S1wQbOB6TGY0O8,766
+ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
+ddi_fw/pipeline/multi_pipeline.py,sha256=D_BZ3ciHbVGuuB7m7cEmVQHESruh1gqhA-vxCMfNKj0,5407
+ddi_fw/pipeline/ner_pipeline.py,sha256=q1aKjb54Ra1HzZ7dARvBw6lB37je9R-POEf2h6QT_nU,6018
+ddi_fw/pipeline/pipeline.py,sha256=Xp5_cPj0SZ6b1lRWepwKCHoCbhEnzSZexm56CtvO_4Y,11073
 ddi_fw/utils/__init__.py,sha256=77563ikqAtdzjjgRlLp5OAsJBbpLA1Cao8iecGaVUXQ,354
 ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
+ddi_fw/utils/json_helper.py,sha256=BVU6wmJgdXPxyqLPu3Ck_9Es5RrP1PDanKvE-OSj1D4,571
 ddi_fw/utils/kaggle.py,sha256=wKRJ18KpQ6P-CubpZklEgsDtyFpR9RUL1_HyyF6ttEE,2425
 ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5I,1056
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.149.dist-info/METADATA,sha256=
-ddi_fw-0.0.149.dist-info/WHEEL,sha256=
-ddi_fw-0.0.149.dist-info/top_level.txt,sha256=
-ddi_fw-0.0.149.dist-info/RECORD,,
+ddi_fw-0.0.150.dist-info/METADATA,sha256=QGoZpcrDypCUbyMgSXEe2vdWBeYmLG5gSw6qnyWKQLc,2082
+ddi_fw-0.0.150.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+ddi_fw-0.0.150.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.150.dist-info/RECORD,,
ddi_fw/test/__init__.py
DELETED
File without changes
ddi_fw/test/basic_test.py
DELETED
@@ -1,15 +0,0 @@
-import json
-
-
-class Metrics():
-    def __init__(self, precision, recall, roc_aupr, roc_auc):
-        self.precision = precision
-        self.recall = recall
-        self.roc_aupr = roc_aupr
-        self.roc_auc = roc_auc
-
-
-m = Metrics( 0.96, 0.96, {"micro": 0.99, "macro": 0.88}, {"micro": 0.99, "macro": 0.88})
-
-as_json = json.dumps(m.__dict__)
-print(as_json)
ddi_fw/test/combination_test.py
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
import itertools
|
2
|
-
|
3
|
-
l = ['e1','e2','e3','e4','e5']
|
4
|
-
all_combinations = []
|
5
|
-
for i in range(2, len(l) + 1):
|
6
|
-
all_combinations.extend(list(itertools.combinations(l, i)))
|
7
|
-
|
8
|
-
print(all_combinations)
|
9
|
-
|
10
|
-
for combination in all_combinations:
|
11
|
-
combination_descriptor = '-'.join(combination)
|
12
|
-
print(combination_descriptor)
|
ddi_fw/test/date_test.py
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
from datetime import datetime, timezone
|
2
|
-
|
3
|
-
local_datetime = datetime.now()
|
4
|
-
utc_datetime = datetime.now(timezone.utc)
|
5
|
-
|
6
|
-
local_iso_str = datetime.strftime(local_datetime, "%Y-%m-%dT%H:%M:%S.%f")[:-3]
|
7
|
-
utc_iso_str = datetime.strftime(utc_datetime, "%Y-%m-%dT%H:%M:%S.%f")[:-3]
|
8
|
-
|
9
|
-
print(f"local dt: {local_iso_str}, tzname: {local_datetime.tzname()}")
|
10
|
-
print(f" utc dt: {utc_iso_str}, tzname: {utc_datetime.tzname()}")
|
11
|
-
|
12
|
-
print("\n")
|
13
|
-
|
14
|
-
print(f"local dt: {local_datetime.isoformat()}")
|
15
|
-
print(f" utc dt: {utc_datetime.isoformat()}")
|
ddi_fw/test/idf_score.py
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
import pandas as pd
|
2
|
-
import numpy as np
|
3
|
-
|
4
|
-
# from ddi_fw.datasets.feature_vector_generation import find_distinct_elements
|
5
|
-
def find_distinct_elements(frame):
|
6
|
-
y = set()
|
7
|
-
for x in frame:
|
8
|
-
if x is not None:
|
9
|
-
for k in x:
|
10
|
-
y.add(k)
|
11
|
-
return y
|
12
|
-
|
13
|
-
def calculate_idf(series):
|
14
|
-
idf_scores = {}
|
15
|
-
distinct_items = find_distinct_elements(series)
|
16
|
-
sorted_distinct_items = sorted(distinct_items)
|
17
|
-
total_document_number = len(all_data)
|
18
|
-
for item in sorted_distinct_items:
|
19
|
-
document_freq = series.map(set([item]).issubset).sum()
|
20
|
-
idf = np.log(total_document_number/document_freq)
|
21
|
-
idf_scores[item] = idf
|
22
|
-
return idf_scores
|
23
|
-
|
24
|
-
|
25
|
-
item1 = 'T001|T002|T001|T001'
|
26
|
-
item2 = 'T002|T003'
|
27
|
-
item3 = 'T004|T005'
|
28
|
-
|
29
|
-
|
30
|
-
all_data = [item1, item2, item3]
|
31
|
-
|
32
|
-
df = pd.DataFrame(all_data, columns=['tui_description'])
|
33
|
-
|
34
|
-
df['tui_description'] = df['tui_description'].apply(
|
35
|
-
lambda x: x.split('|') if x is not None else [])
|
36
|
-
|
37
|
-
print(df.head())
|
38
|
-
|
39
|
-
idf_scores = calculate_idf(df['tui_description'])
|
40
|
-
idf_scores_sorted_desc = sorted(idf_scores.items(), key=lambda x:x[1], reverse=True)
|
41
|
-
threshold = 1
|
42
|
-
keys_over_threshold = [k for k,v in idf_scores.items() if v > threshold]
|
43
|
-
|
44
|
-
print(idf_scores_sorted_desc)
|
45
|
-
print(keys_over_threshold)
|
46
|
-
|
47
|
-
|
48
|
-
def remove_items_by_idf_score(items):
|
49
|
-
return [item for item in items if item in keys_over_threshold]
|
50
|
-
|
51
|
-
df['tui_description'] = df['tui_description'].apply(
|
52
|
-
remove_items_by_idf_score)
|
53
|
-
|
54
|
-
print(df)
|
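Note: for reference, the deleted script scores idf(t) = ln(N / df(t)) over N = 3 sample documents. T002 occurs in two of them, so idf(T002) = ln(3/2) ≈ 0.41, below the threshold of 1 and therefore filtered out; T001, T003, T004 and T005 each occur in one document, score ln(3) ≈ 1.10, and survive.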
@@ -1,85 +0,0 @@
|
|
1
|
-
import pandas as pd
|
2
|
-
|
3
|
-
# data = {'A': [1, 1, 1, 0, 0],
|
4
|
-
# 'B': [0, 1, 1, 1, 0],
|
5
|
-
# 'C': [0, 0, 1, 1, 1]}
|
6
|
-
|
7
|
-
# df = pd.DataFrame(data)
|
8
|
-
|
9
|
-
|
10
|
-
# from scipy.spatial.distance import pdist, squareform
|
11
|
-
|
12
|
-
# jaccard_dist = pdist(df.values, metric='jaccard')
|
13
|
-
# jaccard_dist_matrix = squareform(jaccard_dist)
|
14
|
-
|
15
|
-
# print(jaccard_dist_matrix)
|
16
|
-
|
17
|
-
|
18
|
-
# import pandas as pd
|
19
|
-
# from scipy.spatial.distance import euclidean, pdist, squareform
|
20
|
-
|
21
|
-
|
22
|
-
# def similarity_func(u, v):
|
23
|
-
# return 1/(1+euclidean(u,v))
|
24
|
-
|
25
|
-
# DF_var = pd.DataFrame.from_dict({"s1":[1.2,3.4,10.2],"s2":[1.4,3.1,10.7],"s3":[2.1,3.7,11.3],"s4":[1.5,3.2,10.9]})
|
26
|
-
# DF_var.index = ["g1","g2","g3"]
|
27
|
-
|
28
|
-
# dists = pdist(DF_var, similarity_func)
|
29
|
-
# DF_euclid = pd.DataFrame(squareform(dists), columns=DF_var.index, index=DF_var.index)
|
30
|
-
|
31
|
-
# print(DF_euclid)
|
32
|
-
|
33
|
-
|
34
|
-
from sklearn.metrics import jaccard_score
|
35
|
-
import seaborn as sns
|
36
|
-
import matplotlib.pyplot as plt
|
37
|
-
|
38
|
-
data = [[0, 1, 0], [0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1]]
|
39
|
-
|
40
|
-
similarity_matrix = []
|
41
|
-
for i in range(len(data)):
|
42
|
-
row = []
|
43
|
-
for j in range(len(data)):
|
44
|
-
row.append(jaccard_score(data[i], data[j]))
|
45
|
-
similarity_matrix.append(row)
|
46
|
-
|
47
|
-
sns.heatmap(pd.DataFrame(similarity_matrix), annot=True, cmap="YlGnBu")
|
48
|
-
plt.show()
|
49
|
-
|
50
|
-
|
51
|
-
# https://stackoverflow.com/questions/35639571/python-pandas-distance-matrix-using-jaccard-similarity
|
52
|
-
import pandas as pd
|
53
|
-
entries = [
|
54
|
-
{'id':'1', 'category1':'100', 'category2': '0', 'category3':'100'},
|
55
|
-
{'id':'2', 'category1':'100', 'category2': '0', 'category3':'100'},
|
56
|
-
{'id':'3', 'category1':'0', 'category2': '100', 'category3':'100'},
|
57
|
-
{'id':'4', 'category1':'100', 'category2': '100', 'category3':'100'},
|
58
|
-
{'id':'5', 'category1':'100', 'category2': '0', 'category3':'100'}
|
59
|
-
]
|
60
|
-
df = pd.DataFrame(entries)
|
61
|
-
|
62
|
-
from scipy.spatial.distance import squareform
|
63
|
-
from scipy.spatial.distance import pdist, jaccard
|
64
|
-
|
65
|
-
res = 1 - pdist(df[['category1','category2','category3']], 'jaccard')
|
66
|
-
# squareform(res)
|
67
|
-
distance = pd.DataFrame(squareform(res), index=df.index, columns= df.index)
|
68
|
-
print(distance)
|
69
|
-
|
70
|
-
entries2 = [
|
71
|
-
{'id':'1', 'cat':['p1','p2','p3']},
|
72
|
-
{'id':'2', 'cat':['p3','p4','p5']},
|
73
|
-
{'id':'3', 'cat':['p5','p6','p7']},
|
74
|
-
]
|
75
|
-
df2 = pd.DataFrame(entries2)
|
76
|
-
|
77
|
-
c = df2['cat']
|
78
|
-
|
79
|
-
y = set()
|
80
|
-
|
81
|
-
for x in c:
|
82
|
-
for k in x:
|
83
|
-
y.add(k)
|
84
|
-
|
85
|
-
print(y)
|
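Note: sklearn's jaccard_score on binary vectors computes |A ∩ B| / |A ∪ B| over the positive positions. For the first two rows of data, [0, 1, 0] and [0, 1, 1], the intersection holds one active position and the union two, so the heatmap shows 0.5 for that pair.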