ddi-fw 0.0.98__py3-none-any.whl → 0.0.99__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/langchain/__init__.py +1 -1
- ddi_fw/langchain/embeddings.py +1 -1
- ddi_fw/langchain/storage.py +113 -12
- ddi_fw/utils/__init__.py +2 -0
- ddi_fw/utils/kaggle.py +56 -0
- ddi_fw/utils/package_helper.py +31 -0
- {ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/RECORD +10 -8
- {ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/top_level.txt +0 -0
ddi_fw/langchain/__init__.py
CHANGED
@@ -1,3 +1,3 @@
 from ..langchain.embeddings import PoolingStrategy,SumPoolingStrategy,MeanPoolingStrategy,SentenceTransformerDecorator,PretrainedEmbeddings,SBertEmbeddings
 from .sentence_splitter import SentenceSplitter
-from .storage import DataFrameToVectorDB
+from .storage import DataFrameToVectorDB, generate_embeddings
ddi_fw/langchain/embeddings.py
CHANGED
@@ -69,7 +69,7 @@ class SentenceTransformerDecorator(Embeddings):
 
 class PretrainedEmbeddings(Embeddings):
     def __init__(self, model_name):
-        self.
+        self.model_name = model_name
         self.model = AutoModel.from_pretrained(model_name)
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
         self.shape = self.model.get_input_embeddings().weight.shape
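The change above completes the truncated assignment so the model name is kept on the instance. A minimal usage sketch, assuming any Hugging Face checkpoint id (the name below is illustrative, not from the package):

from ddi_fw.langchain import PretrainedEmbeddings

emb = PretrainedEmbeddings(model_name="bert-base-uncased")  # hypothetical checkpoint id
print(emb.model_name)   # now stored on the instance
print(emb.shape)        # shape of the model's input embedding matrix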
ddi_fw/langchain/storage.py
CHANGED
@@ -1,3 +1,4 @@
+import json
 from langchain.vectorstores import Chroma
 # from langchain_community.vectorstores import Chroma
 from langchain_community.vectorstores.utils import filter_complex_metadata
@@ -11,6 +12,16 @@ from langchain.document_loaders import DataFrameLoader
 from langchain.text_splitter import TextSplitter
 
 # from langchain_community.document_loaders.hugging_face_dataset import HuggingFaceDatasetLoader
+from utils import get_import
+
+
+def load_configuration(config_file):
+    """
+    Load the configuration from a JSON file.
+    """
+    with open(config_file, 'r') as f:
+        config = json.load(f)
+    return config
 
 
 class DataFrameToVectorDB:
@@ -18,15 +29,13 @@ class DataFrameToVectorDB:
                  collection_name,
                  persist_directory,
                  embeddings: Embeddings,
-                 text_splitter: TextSplitter,
-
-                 chunk_overlap=20):
+                 text_splitter: TextSplitter,
+                 batch_size=1000):
         self.collection_name = collection_name
         self.persist_directory = persist_directory
         self.embeddings = embeddings
         self.text_splitter = text_splitter
-        self.
-        self.chunk_overlap = chunk_overlap
+        self.batch_size = batch_size  # to store chunks partially
         self.vectordb = Chroma(collection_name=collection_name,
                                persist_directory=persist_directory,
                                embedding_function=embeddings)
@@ -35,11 +44,14 @@ class DataFrameToVectorDB:
         docs = self.text_splitter.split_documents(documents)
         return docs
 
-    def __split_list(self, input_list,
-        for i in range(0, len(input_list),
-            yield input_list[i:i +
+    def __split_list(self, input_list, batch_size):
+        for i in range(0, len(input_list), batch_size):
+            yield input_list[i:i + batch_size]
 
-    def
+    def store_documents(self, df, columns, page_content_columns):
+        """
+        Core function that processes the documents and adds them to the vector database.
+        """
         for page_content_column in page_content_columns:
             copy_columns = columns.copy()
             copy_columns.append(page_content_column)
@@ -48,11 +60,12 @@ class DataFrameToVectorDB:
             col_df['type'] = page_content_column  # Set the type column
             documents = []
 
-            loader = DataFrameLoader(
+            loader = DataFrameLoader(
+                data_frame=col_df, page_content_column=page_content_column)
             loaded_docs = loader.load()
             documents.extend(self.__split_docs(loaded_docs))
 
-            split_docs_chunked = self.__split_list(documents,
+            split_docs_chunked = self.__split_list(documents, self.batch_size)
 
             for split_docs_chunk in split_docs_chunked:
                 # vectordb = Chroma.from_documents(
@@ -64,5 +77,93 @@ class DataFrameToVectorDB:
                 self.vectordb.add_documents(split_docs_chunk)
                 self.vectordb.persist()
 
+    def store(self, df, columns, page_content_columns, partial_df_size=None):
+        """
+        Store function to handle both full and partial dataframe processing.
+        """
+        if partial_df_size:
+            # Process the dataframe in chunks if partial_df_size is provided
+            for i in range(0, len(df), partial_df_size):
+                batch = df[i: i + partial_df_size]
+                self.store_documents(df=batch, columns=columns,
+                                     page_content_columns=page_content_columns, batch_size=self.batch_size)
+        else:
+            # Process the entire dataframe if no partial_df_size is specified
+            self.store_documents(df=df, columns=columns,
+                                 page_content_columns=page_content_columns, batch_size=self.batch_size)
+
+
+def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
+    """
+    Generate embeddings for collections based on a configuration file.
+
+    collections: List of collections that contain metadata for embedding generation.
+    config_file: Path to the configuration file containing model settings.
+    new_model_names: List of model names to generate embeddings for.
+    """
+    # Load the configuration from the provided file
+    if not collections:
+        collections = load_configuration(config_file)
+
+    # Process each collection
+    for collection_config in collections:
+        id = collection_config['id']
+        name = collection_config['name']
+
+        # Skip if the collection's name is not in the list of new model names
+        if name not in new_model_names:
+            continue
+
+        # # Find the matching configuration for the collection
+        # collection_config = next(
+        #     (item for item in collections if item['id'] == id), None)
+
+        # if not collection_config:
+        #     print(f"Configuration for collection {id} not found.")
+        #     continue
+
+        embedding_model_type = collection_config['embedding_model_tpe']
+        text_splitters_types = collection_config['text_splitters_types']
+        batch_size = collection_config['batch_size']
+        columns = collection_config['columns']
+        page_content_columns = collection_config['page_content_columns']
+        persist_directory = f'{persist_directory}/{id}'
+
+        # Load the embedding model and text splitter dynamically
+        print(f"Generating embeddings for {id} with model {name}...")
+
+        # Assuming the classes for the embeddings and splitters are available
+        try:
+            model = get_import(embedding_model_type)(
+                model_name=name, model_kwargs=c['model_kwargs'])
+        except:
+            # print(f"Unknown embedding model: {embedding_model_type}")
+            raise Exception(f"Unknown embedding model: {embedding_model_type}")
+
+        text_splitters = []
+        text_splitters_suffixes = []
+        for text_splitter_type in text_splitters_types:
+            try:
+                type_of_text_splitter = get_import(text_splitter_type)
+                text_splitter_params = text_splitter.get("params")
+                suffix = text_splitter.get("suffix")
+                if text_splitter_params:
+                    text_splitter = type_of_text_splitter(
+                        **text_splitter_params)
+                else:
+                    text_splitter = type_of_text_splitter()
+                text_splitters.append(text_splitter)
+                text_splitters_suffixes.append(suffix)
+            except:
+                print(f"Unknown text splitter: {text_splitter_type}")
+                raise Exception(f"Unknown text splitter: {text_splitter_type}")
 
-
+        for text_splitter, suffix in zip(text_splitters, text_splitters_suffixes):
+            print(f"{id}_{suffix}")
+            to_vector_db = DataFrameToVectorDB(collection_name=f"{id}_{suffix}",
+                                               persist_directory=persist_directory,
+                                               embeddings=model,
+                                               text_splitter=text_splitter,
+                                               batch_size=1024)
+            to_vector_db.store(
+                df, columns, page_content_columns, partial_df_size=batch_size)
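For context, a minimal sketch of the new storage path in this release. The embedding model, splitter, and DataFrame below are illustrative placeholders, not values shipped with the package:

import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from ddi_fw.langchain import DataFrameToVectorDB

# Toy DataFrame: 'id' is carried over as metadata, 'description' becomes the page content.
df = pd.DataFrame({"id": ["DB00001"],
                   "description": ["Lepirudin is a recombinant hirudin..."]})

to_vector_db = DataFrameToVectorDB(
    collection_name="drugs_description",
    persist_directory="embeddings/demo",
    embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20),
    batch_size=1000)  # chunks are added to Chroma and persisted in batches of this size

to_vector_db.store_documents(df, columns=["id"], page_content_columns=["description"])

The new generate_embeddings function wraps this same flow: it resolves the embedding class and text splitters by dotted path (via get_import) from a JSON configuration and stores one collection per configured model/splitter pair.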
ddi_fw/utils/__init__.py
CHANGED
@@ -2,3 +2,5 @@ from .utils import clear_directory,create_folder_if_not_exists, utc_time_as_stri
 from .zip_helper import ZipHelper
 from .py7zr_helper import Py7ZipHelper
 from .enums import UMLSCodeTypes, DrugBankTextDataTypes
+from .package_helper import get_import
+from .kaggle import create_kaggle_dataset
ddi_fw/utils/kaggle.py
ADDED
@@ -0,0 +1,56 @@
+import os
+import json
+
+def create_kaggle_dataset(base_path: str, collections: list):
+    """
+    This function creates metadata JSON files and uploads datasets to Kaggle from folders.
+
+    Args:
+        base_path (str): The base path containing dataset folders.
+        collections (list): A list of dictionaries containing metadata about collections (e.g., model names).
+        path (str): The path to your root directory (default is "/content" for Google Colab).
+
+    Returns:
+        None
+    """
+
+    # Step 1: Loop through each folder in base_path
+    for folder_name in os.listdir(base_path):
+        folder_path = os.path.join(base_path, folder_name)
+
+        # Step 2: Get metadata for the current folder
+        model_info = next((c for c in collections if c['id'] == folder_name), None)
+        if model_info is None:
+            continue  # Skip if model info is not found
+
+        title = model_info['kaggle_title']
+        id = model_info['kaggle_id'].lower().replace(' ', '-')
+        licenses = model_info['kaggle_licenses']
+        description = model_info['kaggle_description']
+
+        # Ensure title is between 6 and 50 characters
+        if not (6 <= len(title) <= 50):
+            continue  # Skip if title length is out of the expected range
+
+        # Step 3: Define the metadata content
+        metadata = {
+            "title": title,
+            "id": id,
+            "licenses": licenses,
+            "description": description,
+        }
+
+        # Step 4: Write the metadata to a JSON file in the folder
+        metadata_file_path = os.path.join(folder_path, 'dataset-metadata.json')
+        with open(metadata_file_path, 'w') as f:
+            json.dump(metadata, f, indent=4)
+
+        print(f"Created metadata for {folder_name}: {metadata_file_path}")
+
+    # Step 5: Create datasets on Kaggle using the Kaggle API
+    for folder_name in os.listdir(base_path):
+        folder_path = os.path.join(base_path, folder_name)
+        if os.path.isdir(folder_path):
+            # Run the Kaggle dataset creation command
+            os.system(f"kaggle datasets create -p {folder_path} --dir-mode zip")
+            print(f"Dataset created for {folder_name}.")
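A usage sketch for the new helper. The folder name and metadata values below are hypothetical, and the upload step assumes the Kaggle CLI is installed and authenticated:

from ddi_fw.utils import create_kaggle_dataset

# One entry per folder under base_path; keys mirror those read by create_kaggle_dataset.
collections = [{
    "id": "drugbank_embeddings",                # must match a folder name under base_path
    "kaggle_title": "DrugBank embeddings",      # titles outside 6-50 characters are skipped
    "kaggle_id": "some-user/drugbank-embeddings",
    "kaggle_licenses": [{"name": "CC0-1.0"}],
    "kaggle_description": "Vector stores built from DrugBank texts.",
}]

create_kaggle_dataset(base_path="embeddings", collections=collections)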
ddi_fw/utils/package_helper.py
ADDED
@@ -0,0 +1,31 @@
+import importlib
+
+
+def get_import(full_path_of_import):
+    """Dynamically imports an object from a module given its full path.
+
+    Args:
+        full_path_of_import (str): The full path of the import (e.g., 'module.submodule.ClassName').
+
+    Returns:
+        object: The imported object.
+
+    Raises:
+        ImportError: If the module cannot be imported.
+        AttributeError: If the attribute does not exist in the module.
+    """
+    if not full_path_of_import:
+        raise ValueError("The import path cannot be empty.")
+
+    parts = full_path_of_import.split('.')
+    import_name = parts[-1]
+    module_name = ".".join(parts[:-1]) if len(parts) > 1 else ""
+
+    try:
+        module = importlib.import_module(module_name)
+        return getattr(module, import_name)
+    except ModuleNotFoundError as e:
+        raise ImportError(f"Module '{module_name}' could not be found.") from e
+    except AttributeError as e:
+        raise AttributeError(
+            f"'{module_name}' has no attribute '{import_name}'") from e
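A minimal sketch of the new helper, resolving a class from its dotted path (the target class below is just an example):

from ddi_fw.utils import get_import

splitter_cls = get_import("langchain.text_splitter.RecursiveCharacterTextSplitter")
splitter = splitter_cls(chunk_size=500, chunk_overlap=20)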
{ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/RECORD
CHANGED
@@ -55,10 +55,10 @@ ddi_fw/drugbank/drugbank_parser.py,sha256=lxUuhB0s8ef_aPNDs0V8ClKF7-KIWugNIV9gVs
 ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9PVncGTXtk,18127
 ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
 ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
-ddi_fw/langchain/__init__.py,sha256=
-ddi_fw/langchain/embeddings.py,sha256=
+ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
+ddi_fw/langchain/embeddings.py,sha256=b9BUG73Ayx3Wy8MQrfsVeZ-qBB41vjVECSp2YhH-CIY,7514
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
-ddi_fw/langchain/storage.py,sha256=
+ddi_fw/langchain/storage.py,sha256=ljK_ybzjtrikb4XdJ1qkNFTqqyw5r62OBAnBJ5B-X_k,7408
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
 ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -84,12 +84,14 @@ ddi_fw/test/sklearn-tfidf.py,sha256=cjtg27vLskEMXgrsqUR_EapRGVd4xgwOQ9zYsu72zjs,
 ddi_fw/test/test.py,sha256=zJh9ZBcZl-vZIFDvuftcRrRV8WAwtiFVhPPd6Et4OU4,2997
 ddi_fw/test/torch_cuda_test.py,sha256=R-4VGVErl_Ufk54DoZbgL_YXWoCYFyanIVWd6P39IEk,312
 ddi_fw/test/type_guarding_test.py,sha256=KxjyBxohDu7lwpejalCj-REjtJ-k1S1wQbOB6TGY0O8,766
-ddi_fw/utils/__init__.py,sha256=
+ddi_fw/utils/__init__.py,sha256=77563ikqAtdzjjgRlLp5OAsJBbpLA1Cao8iecGaVUXQ,354
 ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
+ddi_fw/utils/kaggle.py,sha256=FjWR1ncOEif6XCCzDYpErLDz_9fxAQub0L7X4aVPw24,2266
+ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5I,1056
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
+ddi_fw-0.0.99.dist-info/METADATA,sha256=NrGCSF9-dHrO0FteW7HO0OFPF7dfgeNs2gDVvLMtiNg,1966
+ddi_fw-0.0.99.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ddi_fw-0.0.99.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.99.dist-info/RECORD,,
{ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/WHEEL
File without changes
{ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/top_level.txt
File without changes