ddi-fw 0.0.246__py3-none-any.whl → 0.0.248__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ import pandas as pd
3
3
  from uuid import uuid4
4
4
  from langchain_community.vectorstores.faiss import FAISS
5
5
  from langchain_community.docstore.in_memory import InMemoryDocstore
6
- from typing import Callable, Optional, Dict, Any
6
+ from typing import Callable, Optional, Dict, Any, Type
7
7
  from langchain_core.documents import Document
8
8
  import numpy as np # optional, if you're using NumPy vectors
9
9
  from langchain_core.embeddings import Embeddings
@@ -11,6 +11,8 @@ from langchain_core.embeddings import Embeddings
11
11
  from pydantic import BaseModel, Field
12
12
  from langchain_core.embeddings import Embeddings
13
13
  from langchain_core.vectorstores import VectorStore
14
+ from ddi_fw.utils import get_import
15
+ from langchain.document_loaders import DataFrameLoader
14
16
 
15
17
  class BaseVectorStoreManager(BaseModel):
16
18
  embeddings: Optional[Embeddings] = None
@@ -240,3 +242,106 @@ def custom_formatter(document: Document, vector: np.ndarray) -> Dict[str, Any]:
240
242
  "type": document.metadata.get("type", None),
241
243
  "embedding": vector
242
244
  }
245
+
246
def load_configuration(config_file):
    """
    Load the configuration from a JSON file.

    Args:
        config_file: Path to the JSON configuration file.

    Returns:
        The parsed JSON content (whatever top-level value the file holds).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    import json

    # JSON text is UTF-8 by specification; decode explicitly instead of
    # relying on the platform's locale encoding (which differs on Windows).
    with open(config_file, 'r', encoding='utf-8') as f:
        return json.load(f)
254
+
255
+
256
def generate_embeddings(
    df,
    vector_store_manager_type: Type[BaseVectorStoreManager],
    config_file,
    new_model_names,
    collections,
    persist_directory="embeddings",
):
    """
    Generate embeddings for collections based on a configuration file.

    Exactly one of ``collections`` and ``config_file`` must be truthy; when
    ``collections`` is empty the collection configs are read from
    ``config_file`` with ``load_configuration``.

    For every collection config whose ``name`` is listed in
    ``new_model_names``, the embedding model named by
    ``embedding_model_type`` is resolved through ``get_import`` and
    instantiated, and one vector store is generated and saved per entry in
    ``text_splitters_types``.

    Args:
        df: DataFrame passed to ``DataFrameLoader`` to build the documents.
        vector_store_manager_type: Class type of the vector store manager
            (e.g., FaissVectorStoreManager or ChromaVectorStoreManager). It is
            instantiated with the keyword arguments ``collection_name``,
            ``persist_directory``, ``embeddings``, ``text_splitter`` and
            ``batch_size``.
        config_file: Path to the configuration file containing model settings.
        new_model_names: Collection of model names to generate embeddings for.
        collections: List of collection configs (dicts) that contain metadata
            for embedding generation. The keys read are ``id``, ``name``,
            ``embedding_model_type``, ``model_kwargs``,
            ``text_splitters_types``, ``batch_size`` and
            ``page_content_columns``.
        persist_directory: Base directory; each collection is stored under
            ``<persist_directory>/<id>``.

    Raises:
        ValueError: If both or neither of ``collections`` and ``config_file``
            are given, or if a selected collection has text splitters but no
            ``page_content_columns``.
        Exception: If an embedding model or text splitter cannot be imported
            or instantiated (the original error is chained as the cause).
    """
    if not collections and not config_file:
        raise ValueError("Either 'collections' or 'config_file' must be provided.")
    if collections and config_file:
        raise ValueError("Only one of 'collections' or 'config_file' should be provided.")

    if not collections:
        collections = load_configuration(config_file)

    for collection_config in collections:
        # Named collection_id rather than `id` to avoid shadowing the builtin.
        collection_id = collection_config['id']
        name = collection_config['name']

        if name not in new_model_names:
            continue

        embedding_model_type = collection_config.get('embedding_model_type')
        # A missing 'text_splitters_types' key is treated like an empty list
        # instead of failing with "'NoneType' object is not iterable".
        text_splitters_types = collection_config.get('text_splitters_types') or []
        batch_size = collection_config.get('batch_size')
        page_content_columns = collection_config.get('page_content_columns')
        persist_dir = f'{persist_directory}/{collection_id}'

        # Load embedding model
        try:
            # 'model_kwargs' is optional; `**None` would raise a TypeError that
            # was then misreported as an unknown embedding model.
            model_kwargs = collection_config.get('model_kwargs') or {}
            model = get_import(embedding_model_type)(
                model_name=name, **model_kwargs)
        except Exception as e:
            raise Exception(f"Unknown embedding model: {embedding_model_type}") from e

        # Load text splitters
        text_splitters = []
        text_splitters_suffixes = []
        for text_splitter_type in text_splitters_types:
            try:
                type_of_text_splitter = get_import(
                    text_splitter_type.get("type"))
                kwargs = text_splitter_type.get("params")
                suffix = text_splitter_type.get("suffix")
                if kwargs:
                    text_splitter = type_of_text_splitter(**kwargs)
                else:
                    text_splitter = type_of_text_splitter()
                text_splitters.append(text_splitter)
                text_splitters_suffixes.append(suffix)
            except Exception as e:
                raise Exception(f"Unknown text splitter: {text_splitter_type}") from e

        # Fail early with a clear message rather than an opaque
        # TypeError/IndexError on `page_content_columns[0]` further down.
        if text_splitters and not page_content_columns:
            raise ValueError(
                f"Collection '{collection_id}' must define a non-empty "
                "'page_content_columns' list."
            )

        for text_splitter, suffix in zip(text_splitters, text_splitters_suffixes):
            print(f"{collection_id}_{suffix}")

            # Prepare manager parameters
            manager_params = {
                "collection_name": f"{collection_id}_{suffix}",
                "persist_directory": persist_dir,
                "embeddings": model,
                "text_splitter": text_splitter,
                "batch_size": batch_size
            }

            # Instantiate the manager class
            vector_store_manager = vector_store_manager_type(**manager_params)

            # Prepare documents.
            # Only the first configured page-content column is used. Documents
            # are loaded afresh for each splitter so every manager receives
            # its own Document objects.
            loader = DataFrameLoader(
                data_frame=df, page_content_column=page_content_columns[0]
            )
            docs = loader.load()

            # Generate vector store
            vector_store_manager.generate_vector_store(docs)

            # Optionally persist/save
            vector_store_manager.save(persist_dir)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ddi_fw
3
- Version: 0.0.246
3
+ Version: 0.0.248
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -39,7 +39,7 @@ Provides-Extra: ml
39
39
  Requires-Dist: scikit-learn<=1.6.1,>=1.5.2; extra == "ml"
40
40
  Requires-Dist: tensorflow<=2.19.0,>=2.17.0; extra == "ml"
41
41
  Requires-Dist: tf-keras<=2.19.0,>=2.17.0; extra == "ml"
42
- Requires-Dist: mlflow<=2.20.0,>=2.16.1; extra == "ml"
42
+ Requires-Dist: mlflow<=3.2.0,>=2.16.1; extra == "ml"
43
43
  Requires-Dist: scipy<=1.16.1,>=1.13.1; extra == "ml"
44
44
  Requires-Dist: plotly==5.24.1; extra == "ml"
45
45
  Requires-Dist: matplotlib==3.10.0; extra == "ml"
@@ -6,7 +6,7 @@ ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,10
6
6
  ddi_fw/langchain/__init__.py,sha256=xGNaTEZCUxyc_aT1zvzVWGRfsj-9VXqMvPKtV_G7ChA,399
7
7
  ddi_fw/langchain/chroma_storage.py,sha256=7LSUhdiPdQHZvKC_NapOeVbHtS71iE5ABZVTrI0YQ-A,15520
8
8
  ddi_fw/langchain/embeddings.py,sha256=eEWy4okcjdhUJHi4N48Wd8XauPXyeaQVLUdNWEvtEcY,6754
9
- ddi_fw/langchain/faiss_storage.py,sha256=b-PStwJHeRl9ZYGC7ql5p5ak1Xk2-A6TTEL1UqmhxVI,9220
9
+ ddi_fw/langchain/faiss_storage.py,sha256=98PTZcP2gn3EB2xD4jD4KBCn11Ox1NNqCLvJNH5fWN8,13413
10
10
  ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
11
11
  ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
12
12
  ddi_fw/ml/__init__.py,sha256=FteYEawCkVQOaK-cTv2VrHZ2ZnfeFr31BD6VucO7_DQ,268
@@ -38,7 +38,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
38
38
  ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
39
39
  ddi_fw/vectorization/feature_vector_generation.py,sha256=QQQGhCti653BdU343Ag1bH_g1fzi2hlic7dgNy7otjE,7694
40
40
  ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
41
- ddi_fw-0.0.246.dist-info/METADATA,sha256=dGtlIUaC6JfpC27ATKpa7mYUs_9n-R_-tmfn_CVPAcw,2624
42
- ddi_fw-0.0.246.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
43
- ddi_fw-0.0.246.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
44
- ddi_fw-0.0.246.dist-info/RECORD,,
41
+ ddi_fw-0.0.248.dist-info/METADATA,sha256=jZL0rUzzuwnsCaLxgQvY6eMMqj_wUVHE5k0cVdS5wQQ,2623
42
+ ddi_fw-0.0.248.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
43
+ ddi_fw-0.0.248.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
44
+ ddi_fw-0.0.248.dist-info/RECORD,,