ddi-fw 0.0.252__tar.gz → 0.0.253__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/PKG-INFO +1 -1
  2. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/pyproject.toml +1 -1
  3. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/langchain/faiss_storage.py +104 -18
  4. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/PKG-INFO +1 -1
  5. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/README.md +0 -0
  6. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/setup.cfg +0 -0
  7. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/datasets/__init__.py +0 -0
  8. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/datasets/core.py +0 -0
  9. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/datasets/dataset_splitter.py +0 -0
  10. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/datasets/db_utils.py +0 -0
  11. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/datasets/setup_._py +0 -0
  12. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/langchain/__init__.py +0 -0
  13. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/langchain/chroma_storage.py +0 -0
  14. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/langchain/embeddings.py +0 -0
  15. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
  16. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/langchain/storage.py +0 -0
  17. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/__init__.py +0 -0
  18. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/evaluation_helper.py +0 -0
  19. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/ml_helper.py +0 -0
  20. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/model_wrapper.py +0 -0
  21. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/pytorch_wrapper.py +0 -0
  22. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/tensorflow_wrapper.py +0 -0
  23. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ml/tracking_service.py +0 -0
  24. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ner/__init__.py +0 -0
  25. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ner/mmlrestclient.py +0 -0
  26. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/ner/ner.py +0 -0
  27. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/__init__.py +0 -0
  28. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +0 -0
  29. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/multi_pipeline.py +0 -0
  30. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/multi_pipeline_org.py +0 -0
  31. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/ner_pipeline.py +0 -0
  32. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/pipeline.py +0 -0
  33. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/__init__.py +0 -0
  34. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/categorical_data_encoding_checker.py +0 -0
  35. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/enums.py +0 -0
  36. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/json_helper.py +0 -0
  37. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/kaggle.py +0 -0
  38. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/numpy_utils.py +0 -0
  39. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/package_helper.py +0 -0
  40. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/py7zr_helper.py +0 -0
  41. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/utils.py +0 -0
  42. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/utils/zip_helper.py +0 -0
  43. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/vectorization/__init__.py +0 -0
  44. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/vectorization/feature_vector_generation.py +0 -0
  45. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw/vectorization/idf_helper.py +0 -0
  46. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/SOURCES.txt +0 -0
  47. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
  48. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/requires.txt +0 -0
  49. {ddi_fw-0.0.252 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ddi_fw
3
- Version: 0.0.252
3
+ Version: 0.0.253
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
6
6
 
7
7
  [project]
8
8
  name = "ddi_fw"
9
- version = "0.0.252"
9
+ version = "0.0.253"
10
10
  description = "Do not use :)"
11
11
  readme = "README.md"
12
12
  authors = [
@@ -3,7 +3,7 @@ import pandas as pd
3
3
  from uuid import uuid4
4
4
  from langchain_community.vectorstores.faiss import FAISS
5
5
  from langchain_community.docstore.in_memory import InMemoryDocstore
6
- from typing import Callable, Optional, Dict, Any, Type
6
+ from typing import Callable, List, Optional, Dict, Any, Type
7
7
  from langchain_core.documents import Document
8
8
  import numpy as np # optional, if you're using NumPy vectors
9
9
  from langchain_core.embeddings import Embeddings
@@ -260,12 +260,106 @@ def load_configuration(config_file):
260
260
  return config
261
261
 
262
262
 
263
+ # def generate_embeddings(
264
+ # df,
265
+ # vector_store_manager_type:Type[BaseVectorStoreManager],
266
+ # config_file,
267
+ # new_model_names,
268
+ # collections,
269
+ # persist_directory="embeddings",
270
+ # ):
271
+ # """
272
+ # Generate embeddings for collections based on a configuration file.
273
+
274
+ # collections: List of collections that contain metadata for embedding generation.
275
+ # config_file: Path to the configuration file containing model settings.
276
+ # new_model_names: List of model names to generate embeddings for.
277
+ # vector_store_manager_type: Class type of the vector store manager (e.g., FaissVectorStoreManager or ChromaVectorStoreManager)
278
+ # """
279
+ # if not collections and not config_file:
280
+ # raise ValueError("Either 'collections' or 'config_file' must be provided.")
281
+ # if collections and config_file:
282
+ # raise ValueError("Only one of 'collections' or 'config_file' should be provided.")
283
+
284
+ # if not collections:
285
+ # collections = load_configuration(config_file)
286
+
287
+ # for collection_config in collections:
288
+ # id = collection_config['id']
289
+ # name = collection_config['name']
290
+
291
+ # if name not in new_model_names:
292
+ # continue
293
+
294
+ # embedding_model_type = collection_config.get('embedding_model_type')
295
+ # text_splitters_types = collection_config.get('text_splitters_types')
296
+ # batch_size = collection_config.get('batch_size')
297
+ # partial_df_size = collection_config.get('partial_dataframe_size')
298
+ # columns = collection_config.get('columns')
299
+ # page_content_columns = collection_config.get('page_content_columns')
300
+ # persist_dir = f'{persist_directory}/{id}'
301
+
302
+ # # Load embedding model
303
+ # try:
304
+ # model_kwargs = collection_config.get('model_kwargs')
305
+ # model = get_import(embedding_model_type)(
306
+ # model_name=name, **model_kwargs)
307
+ # except Exception as e:
308
+ # raise Exception(f"Unknown embedding model: {embedding_model_type}") from e
309
+
310
+ # # Load text splitters
311
+ # text_splitters = []
312
+ # text_splitters_suffixes = []
313
+ # for text_splitter_type in text_splitters_types:
314
+ # try:
315
+ # type_of_text_splitter = get_import(
316
+ # text_splitter_type.get("type"))
317
+ # kwargs = text_splitter_type.get("params")
318
+ # suffix = text_splitter_type.get("suffix")
319
+ # if kwargs:
320
+ # text_splitter = type_of_text_splitter(**kwargs)
321
+ # else:
322
+ # text_splitter = type_of_text_splitter()
323
+ # text_splitters.append(text_splitter)
324
+ # text_splitters_suffixes.append(suffix)
325
+ # except Exception as e:
326
+ # raise Exception(f"Unknown text splitter: {text_splitter_type}") from e
327
+
328
+ # for text_splitter, suffix in zip(text_splitters, text_splitters_suffixes):
329
+ # print(f"{id}_{suffix}")
330
+
331
+ # # Prepare manager parameters
332
+ # manager_params = {
333
+ # "collection_name": f"{id}_{suffix}",
334
+ # "persist_directory": persist_dir,
335
+ # "embeddings": model,
336
+ # "text_splitter": text_splitter,
337
+ # "batch_size": batch_size
338
+ # }
339
+
340
+ # # Instantiate the manager class
341
+ # vector_store_manager = vector_store_manager_type(**manager_params)
342
+
343
+ # # Prepare documents
344
+ # # You may need to use a DataFrameLoader or similar to convert df to LangChain Documents
345
+ # loader = DataFrameLoader(
346
+ # data_frame=df, page_content_column=page_content_columns[0]
347
+ # )
348
+ # docs = loader.load()
349
+
350
+ # # Generate vector store
351
+ # vector_store_manager.generate_vector_store(docs)
352
+
353
+ # # Optionally persist/save
354
+ # vector_store_manager.save(persist_dir)
355
+
356
+
263
357
  def generate_embeddings(
264
- df,
265
- vector_store_manager_type:Type[BaseVectorStoreManager],
266
- config_file,
267
- new_model_names,
268
- collections,
358
+ docs,
359
+ vector_store_manager_type:Type[BaseVectorStoreManager],
360
+ config_file:Optional[str],
361
+ new_model_names:Optional[List],
362
+ collections:Optional[Dict],
269
363
  persist_directory="embeddings",
270
364
  ):
271
365
  """
@@ -280,17 +374,16 @@ def generate_embeddings(
280
374
  raise ValueError("Either 'collections' or 'config_file' must be provided.")
281
375
  if collections and config_file:
282
376
  raise ValueError("Only one of 'collections' or 'config_file' should be provided.")
283
-
377
+
284
378
  if not collections:
285
379
  collections = load_configuration(config_file)
286
-
380
+ if collections is None:
381
+ raise ValueError("No collections found in the configuration file.")
287
382
  for collection_config in collections:
288
383
  id = collection_config['id']
289
384
  name = collection_config['name']
290
-
291
385
  if name not in new_model_names:
292
386
  continue
293
-
294
387
  embedding_model_type = collection_config.get('embedding_model_type')
295
388
  text_splitters_types = collection_config.get('text_splitters_types')
296
389
  batch_size = collection_config.get('batch_size')
@@ -340,15 +433,8 @@ def generate_embeddings(
340
433
  # Instantiate the manager class
341
434
  vector_store_manager = vector_store_manager_type(**manager_params)
342
435
 
343
- # Prepare documents
344
- # You may need to use a DataFrameLoader or similar to convert df to LangChain Documents
345
- loader = DataFrameLoader(
346
- data_frame=df, page_content_column=page_content_columns[0]
347
- )
348
- docs = loader.load()
349
-
350
436
  # Generate vector store
351
437
  vector_store_manager.generate_vector_store(docs)
352
438
 
353
439
  # Optionally persist/save
354
- vector_store_manager.save(persist_dir)
440
+ vector_store_manager.save(persist_dir)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ddi_fw
3
- Version: 0.0.252
3
+ Version: 0.0.253
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
File without changes
File without changes
File without changes