ddi-fw 0.0.98__py3-none-any.whl → 0.0.100__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/langchain/__init__.py CHANGED
@@ -1,3 +1,3 @@
  from ..langchain.embeddings import PoolingStrategy,SumPoolingStrategy,MeanPoolingStrategy,SentenceTransformerDecorator,PretrainedEmbeddings,SBertEmbeddings
  from .sentence_splitter import SentenceSplitter
- from .storage import DataFrameToVectorDB
+ from .storage import DataFrameToVectorDB, generate_embeddings
ddi_fw/langchain/embeddings.py CHANGED
@@ -69,7 +69,7 @@ class SentenceTransformerDecorator(Embeddings):
 
  class PretrainedEmbeddings(Embeddings):
      def __init__(self, model_name):
-         self.mmodel_name = model_name
+         self.model_name = model_name
          self.model = AutoModel.from_pretrained(model_name)
          self.tokenizer = AutoTokenizer.from_pretrained(model_name)
          self.shape = self.model.get_input_embeddings().weight.shape
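For orientation, the renamed attribute is the one set in PretrainedEmbeddings.__init__, so it can be read back after construction. A minimal sketch, assuming any Hugging Face checkpoint name; "bert-base-uncased" is only an illustrative choice:

from ddi_fw.langchain.embeddings import PretrainedEmbeddings

# The checkpoint name is an example, not something pinned by the package.
emb = PretrainedEmbeddings(model_name="bert-base-uncased")
print(emb.model_name)  # readable again now that the mmodel_name typo is fixed
print(emb.shape)       # embedding matrix shape exposed by the constructor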
ddi_fw/langchain/storage.py CHANGED
@@ -1,3 +1,4 @@
+ import json
  from langchain.vectorstores import Chroma
  # from langchain_community.vectorstores import Chroma
  from langchain_community.vectorstores.utils import filter_complex_metadata
@@ -11,6 +12,16 @@ from langchain.document_loaders import DataFrameLoader
  from langchain.text_splitter import TextSplitter
 
  # from langchain_community.document_loaders.hugging_face_dataset import HuggingFaceDatasetLoader
+ from ddi_fw.utils import get_import
+
+
+ def load_configuration(config_file):
+     """
+     Load the configuration from a JSON file.
+     """
+     with open(config_file, 'r') as f:
+         config = json.load(f)
+     return config
 
 
  class DataFrameToVectorDB:
@@ -18,15 +29,13 @@ class DataFrameToVectorDB:
                   collection_name,
                   persist_directory,
                   embeddings: Embeddings,
-                  text_splitter: TextSplitter,
-                  chunk_size=1000,
-                  chunk_overlap=20):
+                  text_splitter: TextSplitter,
+                  batch_size=1000):
          self.collection_name = collection_name
          self.persist_directory = persist_directory
          self.embeddings = embeddings
          self.text_splitter = text_splitter
-         self.chunk_size = chunk_size
-         self.chunk_overlap = chunk_overlap
+         self.batch_size = batch_size  # to store chunks partially
          self.vectordb = Chroma(collection_name=collection_name,
                                 persist_directory=persist_directory,
                                 embedding_function=embeddings)
@@ -35,11 +44,14 @@ class DataFrameToVectorDB:
          docs = self.text_splitter.split_documents(documents)
          return docs
 
-     def __split_list(self, input_list, chunk_size):
-         for i in range(0, len(input_list), chunk_size):
-             yield input_list[i:i + chunk_size]
+     def __split_list(self, input_list, batch_size):
+         for i in range(0, len(input_list), batch_size):
+             yield input_list[i:i + batch_size]
 
-     def store(self, df, columns, page_content_columns, max_batch_size=1000):
+     def store_documents(self, df, columns, page_content_columns):
+         """
+         Core function that processes the documents and adds them to the vector database.
+         """
          for page_content_column in page_content_columns:
              copy_columns = columns.copy()
              copy_columns.append(page_content_column)
@@ -48,11 +60,12 @@ class DataFrameToVectorDB:
              col_df['type'] = page_content_column  # Set the type column
              documents = []
 
-             loader = DataFrameLoader(data_frame=col_df, page_content_column=page_content_column)
+             loader = DataFrameLoader(
+                 data_frame=col_df, page_content_column=page_content_column)
              loaded_docs = loader.load()
              documents.extend(self.__split_docs(loaded_docs))
 
-             split_docs_chunked = self.__split_list(documents, max_batch_size)
+             split_docs_chunked = self.__split_list(documents, self.batch_size)
 
              for split_docs_chunk in split_docs_chunked:
                  # vectordb = Chroma.from_documents(
@@ -64,5 +77,93 @@ class DataFrameToVectorDB:
                  self.vectordb.add_documents(split_docs_chunk)
                  self.vectordb.persist()
 
+     def store(self, df, columns, page_content_columns, partial_df_size=None):
+         """
+         Store function to handle both full and partial dataframe processing.
+         """
+         if partial_df_size:
+             # Process the dataframe in chunks if partial_df_size is provided
+             for i in range(0, len(df), partial_df_size):
+                 batch = df[i: i + partial_df_size]
+                 self.store_documents(df=batch, columns=columns,
+                                      page_content_columns=page_content_columns)
+         else:
+             # Process the entire dataframe if no partial_df_size is specified
+             self.store_documents(df=df, columns=columns,
+                                  page_content_columns=page_content_columns)
+
+
+ def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
+     """
+     Generate embeddings for collections based on a configuration file.
+
+     collections: List of collections that contain metadata for embedding generation.
+     config_file: Path to the configuration file containing model settings.
+     new_model_names: List of model names to generate embeddings for.
+     """
+     # Load the configuration from the provided file
+     if not collections:
+         collections = load_configuration(config_file)
+
+     # Process each collection
+     for collection_config in collections:
+         id = collection_config['id']
+         name = collection_config['name']
+
+         # Skip if the collection's name is not in the list of new model names
+         if name not in new_model_names:
+             continue
+
+         # # Find the matching configuration for the collection
+         # collection_config = next(
+         #     (item for item in collections if item['id'] == id), None)
+
+         # if not collection_config:
+         #     print(f"Configuration for collection {id} not found.")
+         #     continue
+
+         embedding_model_type = collection_config['embedding_model_tpe']
+         text_splitters_types = collection_config['text_splitters_types']
+         batch_size = collection_config['batch_size']
+         columns = collection_config['columns']
+         page_content_columns = collection_config['page_content_columns']
+         persist_directory = f'{persist_directory}/{id}'
+
+         # Load the embedding model and text splitter dynamically
+         print(f"Generating embeddings for {id} with model {name}...")
+
+         # Assuming the classes for the embeddings and splitters are available
+         try:
+             model = get_import(embedding_model_type)(
+                 model_name=name, model_kwargs=collection_config['model_kwargs'])
+         except:
+             # print(f"Unknown embedding model: {embedding_model_type}")
+             raise Exception(f"Unknown embedding model: {embedding_model_type}")
+
+         text_splitters = []
+         text_splitters_suffixes = []
+         for text_splitter_type in text_splitters_types:
+             try:
+                 type_of_text_splitter = get_import(text_splitter_type.get("type"))
+                 text_splitter_params = text_splitter_type.get("params")
+                 suffix = text_splitter_type.get("suffix")
+                 if text_splitter_params:
+                     text_splitter = type_of_text_splitter(
+                         **text_splitter_params)
+                 else:
+                     text_splitter = type_of_text_splitter()
+                 text_splitters.append(text_splitter)
+                 text_splitters_suffixes.append(suffix)
+             except:
+                 print(f"Unknown text splitter: {text_splitter_type}")
+                 raise Exception(f"Unknown text splitter: {text_splitter_type}")
 
-
+         for text_splitter, suffix in zip(text_splitters, text_splitters_suffixes):
+             print(f"{id}_{suffix}")
+             to_vector_db = DataFrameToVectorDB(collection_name=f"{id}_{suffix}",
+                                                persist_directory=persist_directory,
+                                                embeddings=model,
+                                                text_splitter=text_splitter,
+                                                batch_size=1024)
+             to_vector_db.store(
+                 df, columns, page_content_columns, partial_df_size=batch_size)
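A usage sketch for the new generate_embeddings helper follows. The collection layout mirrors the keys the function reads (id, name, embedding_model_tpe, text_splitters_types, batch_size, columns, page_content_columns, model_kwargs); the concrete embedding class, splitter class, and DataFrame columns are illustrative assumptions rather than values fixed by the package.

import pandas as pd
from ddi_fw.langchain import generate_embeddings

# Toy DataFrame; a real pipeline would load drug/event descriptions instead.
df = pd.DataFrame({
    "id": [1, 2],
    "description": ["drug A inhibits ...", "drug B induces ..."],
})

collections = [{
    "id": "all-MiniLM-L6-v2",
    "name": "sentence-transformers/all-MiniLM-L6-v2",
    # Dotted import paths resolved via get_import; both paths are examples.
    "embedding_model_tpe": "langchain_community.embeddings.HuggingFaceEmbeddings",
    "model_kwargs": {"device": "cpu"},
    "text_splitters_types": [{
        "type": "langchain.text_splitter.RecursiveCharacterTextSplitter",
        "params": {"chunk_size": 1000, "chunk_overlap": 20},
        "suffix": "recursive",
    }],
    "batch_size": 5000,                        # forwarded to store() as partial_df_size
    "columns": ["id"],                         # metadata columns kept alongside the text
    "page_content_columns": ["description"],
}]

generate_embeddings(df,
                    config_file=None,          # unused when collections are passed directly
                    new_model_names=["sentence-transformers/all-MiniLM-L6-v2"],
                    collections=collections,
                    persist_directory="embeddings")

Each splitter variant ends up in its own Chroma collection named "<id>_<suffix>" persisted under "embeddings/<id>".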
ddi_fw/utils/__init__.py CHANGED
@@ -2,3 +2,5 @@ from .utils import clear_directory,create_folder_if_not_exists, utc_time_as_stri
  from .zip_helper import ZipHelper
  from .py7zr_helper import Py7ZipHelper
  from .enums import UMLSCodeTypes, DrugBankTextDataTypes
+ from .package_helper import get_import
+ from .kaggle import create_kaggle_dataset
ddi_fw/utils/kaggle.py ADDED
@@ -0,0 +1,56 @@
+ import os
+ import json
+
+ def create_kaggle_dataset(base_path: str, collections: list):
+     """
+     This function creates metadata JSON files and uploads datasets to Kaggle from folders.
+
+     Args:
+         base_path (str): The base path containing dataset folders.
+         collections (list): A list of dictionaries containing metadata about collections (e.g., model names).
+         path (str): The path to your root directory (default is "/content" for Google Colab).
+
+     Returns:
+         None
+     """
+
+     # Step 1: Loop through each folder in base_path
+     for folder_name in os.listdir(base_path):
+         folder_path = os.path.join(base_path, folder_name)
+
+         # Step 2: Get metadata for the current folder
+         model_info = next((c for c in collections if c['id'] == folder_name), None)
+         if model_info is None:
+             continue  # Skip if model info is not found
+
+         title = model_info['kaggle_title']
+         id = model_info['kaggle_id'].lower().replace(' ', '-')
+         licenses = model_info['kaggle_licenses']
+         description = model_info['kaggle_description']
+
+         # Ensure title is between 6 and 50 characters
+         if not (6 <= len(title) <= 50):
+             continue  # Skip if title length is out of the expected range
+
+         # Step 3: Define the metadata content
+         metadata = {
+             "title": title,
+             "id": id,
+             "licenses": licenses,
+             "description": description,
+         }
+
+         # Step 4: Write the metadata to a JSON file in the folder
+         metadata_file_path = os.path.join(folder_path, 'dataset-metadata.json')
+         with open(metadata_file_path, 'w') as f:
+             json.dump(metadata, f, indent=4)
+
+         print(f"Created metadata for {folder_name}: {metadata_file_path}")
+
+     # Step 5: Create datasets on Kaggle using the Kaggle API
+     for folder_name in os.listdir(base_path):
+         folder_path = os.path.join(base_path, folder_name)
+         if os.path.isdir(folder_path):
+             # Run the Kaggle dataset creation command
+             os.system(f"kaggle datasets create -p {folder_path} --dir-mode zip")
+             print(f"Dataset created for {folder_name}.")
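A hedged usage sketch for create_kaggle_dataset: folder names under base_path must match each collection's id, the kaggle CLI must be installed and authenticated, and all metadata values below are placeholders, not values prescribed by the package.

from ddi_fw.utils import create_kaggle_dataset

collections = [{
    "id": "all-MiniLM-L6-v2",                     # must match a folder under base_path
    "kaggle_title": "DDI embeddings (MiniLM)",    # 6-50 characters, or the folder is skipped
    "kaggle_id": "your-username/ddi-embeddings-minilm",
    "kaggle_licenses": [{"name": "CC0-1.0"}],
    "kaggle_description": "Vector-store embeddings generated with ddi_fw.",
}]

# Writes dataset-metadata.json into each matching folder, then shells out to
# `kaggle datasets create -p <folder> --dir-mode zip` for every directory found.
create_kaggle_dataset(base_path="embeddings", collections=collections)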
ddi_fw/utils/package_helper.py ADDED
@@ -0,0 +1,31 @@
+ import importlib
+
+
+ def get_import(full_path_of_import):
+     """Dynamically imports an object from a module given its full path.
+
+     Args:
+         full_path_of_import (str): The full path of the import (e.g., 'module.submodule.ClassName').
+
+     Returns:
+         object: The imported object.
+
+     Raises:
+         ImportError: If the module cannot be imported.
+         AttributeError: If the attribute does not exist in the module.
+     """
+     if not full_path_of_import:
+         raise ValueError("The import path cannot be empty.")
+
+     parts = full_path_of_import.split('.')
+     import_name = parts[-1]
+     module_name = ".".join(parts[:-1]) if len(parts) > 1 else ""
+
+     try:
+         module = importlib.import_module(module_name)
+         return getattr(module, import_name)
+     except ModuleNotFoundError as e:
+         raise ImportError(f"Module '{module_name}' could not be found.") from e
+     except AttributeError as e:
+         raise AttributeError(
+             f"'{module_name}' has no attribute '{import_name}'") from e
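get_import is the dynamic loader used by generate_embeddings above: given a dotted path it returns the named object. A small sketch, where the splitter path is only an example of a resolvable dotted path:

from ddi_fw.utils import get_import

# Resolve a class by its dotted path, then instantiate it as usual.
splitter_cls = get_import("langchain.text_splitter.RecursiveCharacterTextSplitter")
splitter = splitter_cls(chunk_size=1000, chunk_overlap=20)
print(splitter_cls.__name__)  # RecursiveCharacterTextSplitter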
{ddi_fw-0.0.98.dist-info → ddi_fw-0.0.100.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ddi_fw
- Version: 0.0.98
+ Version: 0.0.100
  Summary: Do not use :)
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
{ddi_fw-0.0.98.dist-info → ddi_fw-0.0.100.dist-info}/RECORD RENAMED
@@ -55,10 +55,10 @@ ddi_fw/drugbank/drugbank_parser.py,sha256=lxUuhB0s8ef_aPNDs0V8ClKF7-KIWugNIV9gVs
  ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9PVncGTXtk,18127
  ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
  ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
- ddi_fw/langchain/__init__.py,sha256=8dBPZivc01WWaCH8sZ_UV8-XPyo74e9Qy6-fYgAiNLE,248
- ddi_fw/langchain/embeddings.py,sha256=8J_SfO9pyET2W-Ltzq0_r9EchFzBsYdUabiOMma42Us,7515
+ ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
+ ddi_fw/langchain/embeddings.py,sha256=b9BUG73Ayx3Wy8MQrfsVeZ-qBB41vjVECSp2YhH-CIY,7514
  ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
- ddi_fw/langchain/storage.py,sha256=uy5clVB07So2eFbRGdAKzHIPdfEk4se33cPktis7Aa4,2716
+ ddi_fw/langchain/storage.py,sha256=-QRlzHsfQ7yj0OEFShRDb2A0H1iMReHiD4absxoYwGU,7415
  ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
  ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
  ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -84,12 +84,14 @@ ddi_fw/test/sklearn-tfidf.py,sha256=cjtg27vLskEMXgrsqUR_EapRGVd4xgwOQ9zYsu72zjs,
  ddi_fw/test/test.py,sha256=zJh9ZBcZl-vZIFDvuftcRrRV8WAwtiFVhPPd6Et4OU4,2997
  ddi_fw/test/torch_cuda_test.py,sha256=R-4VGVErl_Ufk54DoZbgL_YXWoCYFyanIVWd6P39IEk,312
  ddi_fw/test/type_guarding_test.py,sha256=KxjyBxohDu7lwpejalCj-REjtJ-k1S1wQbOB6TGY0O8,766
- ddi_fw/utils/__init__.py,sha256=x1ypYJRKJlbF9x4psHYGXj-YbDD8T_c28gXZkr03cdE,273
+ ddi_fw/utils/__init__.py,sha256=77563ikqAtdzjjgRlLp5OAsJBbpLA1Cao8iecGaVUXQ,354
  ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
+ ddi_fw/utils/kaggle.py,sha256=FjWR1ncOEif6XCCzDYpErLDz_9fxAQub0L7X4aVPw24,2266
+ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5I,1056
  ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
  ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
- ddi_fw-0.0.98.dist-info/METADATA,sha256=W4ZdrQs8YgQp6aHxr4Py5_lO4zrzKnk1XjDfFhrlsq8,1966
- ddi_fw-0.0.98.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
- ddi_fw-0.0.98.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
- ddi_fw-0.0.98.dist-info/RECORD,,
+ ddi_fw-0.0.100.dist-info/METADATA,sha256=TyW7tsHIuFEoXe8d2tpZ02iCE3HEGqBAKnwKIrpPmgs,1967
+ ddi_fw-0.0.100.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ ddi_fw-0.0.100.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ ddi_fw-0.0.100.dist-info/RECORD,,