ddi-fw 0.0.252__tar.gz → 0.0.253__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/PKG-INFO +1 -1
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/pyproject.toml +1 -1
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/langchain/faiss_storage.py +104 -18
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/PKG-INFO +1 -1
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/README.md +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/setup.cfg +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/datasets/__init__.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/datasets/core.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/datasets/dataset_splitter.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/datasets/db_utils.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/datasets/setup_._py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/langchain/__init__.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/langchain/chroma_storage.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/langchain/embeddings.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/langchain/storage.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/__init__.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/evaluation_helper.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/ml_helper.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/model_wrapper.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/pytorch_wrapper.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/tensorflow_wrapper.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/tracking_service.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ner/__init__.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ner/mmlrestclient.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ner/ner.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/__init__.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/multi_pipeline.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/multi_pipeline_org.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/ner_pipeline.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/pipeline.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/__init__.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/categorical_data_encoding_checker.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/enums.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/json_helper.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/kaggle.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/numpy_utils.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/package_helper.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/py7zr_helper.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/utils.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/zip_helper.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/vectorization/__init__.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/vectorization/feature_vector_generation.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/vectorization/idf_helper.py +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/SOURCES.txt +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/requires.txt +0 -0
- {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/top_level.txt +0 -0
@@ -3,7 +3,7 @@ import pandas as pd
|
|
3
3
|
from uuid import uuid4
|
4
4
|
from langchain_community.vectorstores.faiss import FAISS
|
5
5
|
from langchain_community.docstore.in_memory import InMemoryDocstore
|
6
|
-
from typing import Callable, Optional, Dict, Any, Type
|
6
|
+
from typing import Callable, List, Optional, Dict, Any, Type
|
7
7
|
from langchain_core.documents import Document
|
8
8
|
import numpy as np # optional, if you're using NumPy vectors
|
9
9
|
from langchain_core.embeddings import Embeddings
|
@@ -260,12 +260,106 @@ def load_configuration(config_file):
|
|
260
260
|
return config
|
261
261
|
|
262
262
|
|
263
|
+
# def generate_embeddings(
|
264
|
+
# df,
|
265
|
+
# vector_store_manager_type:Type[BaseVectorStoreManager],
|
266
|
+
# config_file,
|
267
|
+
# new_model_names,
|
268
|
+
# collections,
|
269
|
+
# persist_directory="embeddings",
|
270
|
+
# ):
|
271
|
+
# """
|
272
|
+
# Generate embeddings for collections based on a configuration file.
|
273
|
+
|
274
|
+
# collections: List of collections that contain metadata for embedding generation.
|
275
|
+
# config_file: Path to the configuration file containing model settings.
|
276
|
+
# new_model_names: List of model names to generate embeddings for.
|
277
|
+
# vector_store_manager_type: Class type of the vector store manager (e.g., FaissVectorStoreManager or ChromaVectorStoreManager)
|
278
|
+
# """
|
279
|
+
# if not collections and not config_file:
|
280
|
+
# raise ValueError("Either 'collections' or 'config_file' must be provided.")
|
281
|
+
# if collections and config_file:
|
282
|
+
# raise ValueError("Only one of 'collections' or 'config_file' should be provided.")
|
283
|
+
|
284
|
+
# if not collections:
|
285
|
+
# collections = load_configuration(config_file)
|
286
|
+
|
287
|
+
# for collection_config in collections:
|
288
|
+
# id = collection_config['id']
|
289
|
+
# name = collection_config['name']
|
290
|
+
|
291
|
+
# if name not in new_model_names:
|
292
|
+
# continue
|
293
|
+
|
294
|
+
# embedding_model_type = collection_config.get('embedding_model_type')
|
295
|
+
# text_splitters_types = collection_config.get('text_splitters_types')
|
296
|
+
# batch_size = collection_config.get('batch_size')
|
297
|
+
# partial_df_size = collection_config.get('partial_dataframe_size')
|
298
|
+
# columns = collection_config.get('columns')
|
299
|
+
# page_content_columns = collection_config.get('page_content_columns')
|
300
|
+
# persist_dir = f'{persist_directory}/{id}'
|
301
|
+
|
302
|
+
# # Load embedding model
|
303
|
+
# try:
|
304
|
+
# model_kwargs = collection_config.get('model_kwargs')
|
305
|
+
# model = get_import(embedding_model_type)(
|
306
|
+
# model_name=name, **model_kwargs)
|
307
|
+
# except Exception as e:
|
308
|
+
# raise Exception(f"Unknown embedding model: {embedding_model_type}") from e
|
309
|
+
|
310
|
+
# # Load text splitters
|
311
|
+
# text_splitters = []
|
312
|
+
# text_splitters_suffixes = []
|
313
|
+
# for text_splitter_type in text_splitters_types:
|
314
|
+
# try:
|
315
|
+
# type_of_text_splitter = get_import(
|
316
|
+
# text_splitter_type.get("type"))
|
317
|
+
# kwargs = text_splitter_type.get("params")
|
318
|
+
# suffix = text_splitter_type.get("suffix")
|
319
|
+
# if kwargs:
|
320
|
+
# text_splitter = type_of_text_splitter(**kwargs)
|
321
|
+
# else:
|
322
|
+
# text_splitter = type_of_text_splitter()
|
323
|
+
# text_splitters.append(text_splitter)
|
324
|
+
# text_splitters_suffixes.append(suffix)
|
325
|
+
# except Exception as e:
|
326
|
+
# raise Exception(f"Unknown text splitter: {text_splitter_type}") from e
|
327
|
+
|
328
|
+
# for text_splitter, suffix in zip(text_splitters, text_splitters_suffixes):
|
329
|
+
# print(f"{id}_{suffix}")
|
330
|
+
|
331
|
+
# # Prepare manager parameters
|
332
|
+
# manager_params = {
|
333
|
+
# "collection_name": f"{id}_{suffix}",
|
334
|
+
# "persist_directory": persist_dir,
|
335
|
+
# "embeddings": model,
|
336
|
+
# "text_splitter": text_splitter,
|
337
|
+
# "batch_size": batch_size
|
338
|
+
# }
|
339
|
+
|
340
|
+
# # Instantiate the manager class
|
341
|
+
# vector_store_manager = vector_store_manager_type(**manager_params)
|
342
|
+
|
343
|
+
# # Prepare documents
|
344
|
+
# # You may need to use a DataFrameLoader or similar to convert df to LangChain Documents
|
345
|
+
# loader = DataFrameLoader(
|
346
|
+
# data_frame=df, page_content_column=page_content_columns[0]
|
347
|
+
# )
|
348
|
+
# docs = loader.load()
|
349
|
+
|
350
|
+
# # Generate vector store
|
351
|
+
# vector_store_manager.generate_vector_store(docs)
|
352
|
+
|
353
|
+
# # Optionally persist/save
|
354
|
+
# vector_store_manager.save(persist_dir)
|
355
|
+
|
356
|
+
|
263
357
|
def generate_embeddings(
|
264
|
-
|
265
|
-
vector_store_manager_type:Type[BaseVectorStoreManager],
|
266
|
-
config_file,
|
267
|
-
new_model_names,
|
268
|
-
collections,
|
358
|
+
docs,
|
359
|
+
vector_store_manager_type:Type[BaseVectorStoreManager],
|
360
|
+
config_file:Optional[str],
|
361
|
+
new_model_names:Optional[List],
|
362
|
+
collections:Optional[Dict],
|
269
363
|
persist_directory="embeddings",
|
270
364
|
):
|
271
365
|
"""
|
@@ -280,17 +374,16 @@ def generate_embeddings(
|
|
280
374
|
raise ValueError("Either 'collections' or 'config_file' must be provided.")
|
281
375
|
if collections and config_file:
|
282
376
|
raise ValueError("Only one of 'collections' or 'config_file' should be provided.")
|
283
|
-
|
377
|
+
|
284
378
|
if not collections:
|
285
379
|
collections = load_configuration(config_file)
|
286
|
-
|
380
|
+
if collections is None:
|
381
|
+
raise ValueError("No collections found in the configuration file.")
|
287
382
|
for collection_config in collections:
|
288
383
|
id = collection_config['id']
|
289
384
|
name = collection_config['name']
|
290
|
-
|
291
385
|
if name not in new_model_names:
|
292
386
|
continue
|
293
|
-
|
294
387
|
embedding_model_type = collection_config.get('embedding_model_type')
|
295
388
|
text_splitters_types = collection_config.get('text_splitters_types')
|
296
389
|
batch_size = collection_config.get('batch_size')
|
@@ -340,15 +433,8 @@ def generate_embeddings(
|
|
340
433
|
# Instantiate the manager class
|
341
434
|
vector_store_manager = vector_store_manager_type(**manager_params)
|
342
435
|
|
343
|
-
# Prepare documents
|
344
|
-
# You may need to use a DataFrameLoader or similar to convert df to LangChain Documents
|
345
|
-
loader = DataFrameLoader(
|
346
|
-
data_frame=df, page_content_column=page_content_columns[0]
|
347
|
-
)
|
348
|
-
docs = loader.load()
|
349
|
-
|
350
436
|
# Generate vector store
|
351
437
|
vector_store_manager.generate_vector_store(docs)
|
352
438
|
|
353
439
|
# Optionally persist/save
|
354
|
-
vector_store_manager.save(persist_dir)
|
440
|
+
vector_store_manager.save(persist_dir)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|