ddi-fw 0.0.251__tar.gz → 0.0.253__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/PKG-INFO +1 -1
  2. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/pyproject.toml +1 -1
  3. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/langchain/faiss_storage.py +115 -25
  4. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/PKG-INFO +1 -1
  5. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/README.md +0 -0
  6. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/setup.cfg +0 -0
  7. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/datasets/__init__.py +0 -0
  8. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/datasets/core.py +0 -0
  9. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/datasets/dataset_splitter.py +0 -0
  10. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/datasets/db_utils.py +0 -0
  11. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/datasets/setup_._py +0 -0
  12. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/langchain/__init__.py +0 -0
  13. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/langchain/chroma_storage.py +0 -0
  14. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/langchain/embeddings.py +0 -0
  15. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
  16. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/langchain/storage.py +0 -0
  17. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/ml/__init__.py +0 -0
  18. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/ml/evaluation_helper.py +0 -0
  19. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/ml/ml_helper.py +0 -0
  20. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/ml/model_wrapper.py +0 -0
  21. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/ml/pytorch_wrapper.py +0 -0
  22. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/ml/tensorflow_wrapper.py +0 -0
  23. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/ml/tracking_service.py +0 -0
  24. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/ner/__init__.py +0 -0
  25. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/ner/mmlrestclient.py +0 -0
  26. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/ner/ner.py +0 -0
  27. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/__init__.py +0 -0
  28. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +0 -0
  29. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/multi_pipeline.py +0 -0
  30. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/multi_pipeline_org.py +0 -0
  31. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/ner_pipeline.py +0 -0
  32. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/pipeline/pipeline.py +0 -0
  33. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/utils/__init__.py +0 -0
  34. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/utils/categorical_data_encoding_checker.py +0 -0
  35. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/utils/enums.py +0 -0
  36. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/utils/json_helper.py +0 -0
  37. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/utils/kaggle.py +0 -0
  38. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/utils/numpy_utils.py +0 -0
  39. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/utils/package_helper.py +0 -0
  40. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/utils/py7zr_helper.py +0 -0
  41. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/utils/utils.py +0 -0
  42. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/utils/zip_helper.py +0 -0
  43. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/vectorization/__init__.py +0 -0
  44. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/vectorization/feature_vector_generation.py +0 -0
  45. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw/vectorization/idf_helper.py +0 -0
  46. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/SOURCES.txt +0 -0
  47. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
  48. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/requires.txt +0 -0
  49. {ddi_fw-0.0.251 → ddi_fw-0.0.253}/src/ddi_fw.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ddi_fw
3
- Version: 0.0.251
3
+ Version: 0.0.253
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
6
6
 
7
7
  [project]
8
8
  name = "ddi_fw"
9
- version = "0.0.251"
9
+ version = "0.0.253"
10
10
  description = "Do not use :)"
11
11
  readme = "README.md"
12
12
  authors = [
@@ -3,7 +3,7 @@ import pandas as pd
3
3
  from uuid import uuid4
4
4
  from langchain_community.vectorstores.faiss import FAISS
5
5
  from langchain_community.docstore.in_memory import InMemoryDocstore
6
- from typing import Callable, Optional, Dict, Any, Type
6
+ from typing import Callable, List, Optional, Dict, Any, Type
7
7
  from langchain_core.documents import Document
8
8
  import numpy as np # optional, if you're using NumPy vectors
9
9
  from langchain_core.embeddings import Embeddings
@@ -56,16 +56,20 @@ class FaissVectorStoreManager(BaseVectorStoreManager):
56
56
 
57
57
  # uuids = [str(uuid4()) for _ in range(len(docs))]
58
58
  # self.vector_store.add_documents(documents=docs, ids=uuids)
59
-
59
+
60
60
  def initialize_embedding_dict(self, **kwargs):
61
- # vector_db_persist_directory = kwargs.get("vector_db_persist_directory")
61
+ """
62
+ Initializes a dictionary where keys are types (e.g., 'description', 'indication'),
63
+ and values are dictionaries mapping drugbank_ids to a list of their embeddings.
64
+
65
+ Returns:
66
+ dict: A dictionary with the structure {type: {drugbank_id: [embedding]}}.
67
+ """
62
68
  self.load(self.persist_directory)
63
69
  df = self.as_dataframe(formatter_fn=custom_formatter)
64
- type_dict = (
65
- df.groupby('type')
66
- .apply(lambda group: dict(zip(group['id'], group['embedding'])))
67
- .to_dict()
68
- )
70
+ type_dict = {}
71
+ for drug_type, group in df.groupby('type'):
72
+ type_dict[drug_type] = dict(zip(group['id'], group['embedding'].apply(lambda x: [x])))
69
73
  return type_dict
70
74
 
71
75
  def generate_vector_store(self, docs, handle_empty='zero'):
@@ -256,12 +260,106 @@ def load_configuration(config_file):
256
260
  return config
257
261
 
258
262
 
263
+ # def generate_embeddings(
264
+ # df,
265
+ # vector_store_manager_type:Type[BaseVectorStoreManager],
266
+ # config_file,
267
+ # new_model_names,
268
+ # collections,
269
+ # persist_directory="embeddings",
270
+ # ):
271
+ # """
272
+ # Generate embeddings for collections based on a configuration file.
273
+
274
+ # collections: List of collections that contain metadata for embedding generation.
275
+ # config_file: Path to the configuration file containing model settings.
276
+ # new_model_names: List of model names to generate embeddings for.
277
+ # vector_store_manager_type: Class type of the vector store manager (e.g., FaissVectorStoreManager or ChromaVectorStoreManager)
278
+ # """
279
+ # if not collections and not config_file:
280
+ # raise ValueError("Either 'collections' or 'config_file' must be provided.")
281
+ # if collections and config_file:
282
+ # raise ValueError("Only one of 'collections' or 'config_file' should be provided.")
283
+
284
+ # if not collections:
285
+ # collections = load_configuration(config_file)
286
+
287
+ # for collection_config in collections:
288
+ # id = collection_config['id']
289
+ # name = collection_config['name']
290
+
291
+ # if name not in new_model_names:
292
+ # continue
293
+
294
+ # embedding_model_type = collection_config.get('embedding_model_type')
295
+ # text_splitters_types = collection_config.get('text_splitters_types')
296
+ # batch_size = collection_config.get('batch_size')
297
+ # partial_df_size = collection_config.get('partial_dataframe_size')
298
+ # columns = collection_config.get('columns')
299
+ # page_content_columns = collection_config.get('page_content_columns')
300
+ # persist_dir = f'{persist_directory}/{id}'
301
+
302
+ # # Load embedding model
303
+ # try:
304
+ # model_kwargs = collection_config.get('model_kwargs')
305
+ # model = get_import(embedding_model_type)(
306
+ # model_name=name, **model_kwargs)
307
+ # except Exception as e:
308
+ # raise Exception(f"Unknown embedding model: {embedding_model_type}") from e
309
+
310
+ # # Load text splitters
311
+ # text_splitters = []
312
+ # text_splitters_suffixes = []
313
+ # for text_splitter_type in text_splitters_types:
314
+ # try:
315
+ # type_of_text_splitter = get_import(
316
+ # text_splitter_type.get("type"))
317
+ # kwargs = text_splitter_type.get("params")
318
+ # suffix = text_splitter_type.get("suffix")
319
+ # if kwargs:
320
+ # text_splitter = type_of_text_splitter(**kwargs)
321
+ # else:
322
+ # text_splitter = type_of_text_splitter()
323
+ # text_splitters.append(text_splitter)
324
+ # text_splitters_suffixes.append(suffix)
325
+ # except Exception as e:
326
+ # raise Exception(f"Unknown text splitter: {text_splitter_type}") from e
327
+
328
+ # for text_splitter, suffix in zip(text_splitters, text_splitters_suffixes):
329
+ # print(f"{id}_{suffix}")
330
+
331
+ # # Prepare manager parameters
332
+ # manager_params = {
333
+ # "collection_name": f"{id}_{suffix}",
334
+ # "persist_directory": persist_dir,
335
+ # "embeddings": model,
336
+ # "text_splitter": text_splitter,
337
+ # "batch_size": batch_size
338
+ # }
339
+
340
+ # # Instantiate the manager class
341
+ # vector_store_manager = vector_store_manager_type(**manager_params)
342
+
343
+ # # Prepare documents
344
+ # # You may need to use a DataFrameLoader or similar to convert df to LangChain Documents
345
+ # loader = DataFrameLoader(
346
+ # data_frame=df, page_content_column=page_content_columns[0]
347
+ # )
348
+ # docs = loader.load()
349
+
350
+ # # Generate vector store
351
+ # vector_store_manager.generate_vector_store(docs)
352
+
353
+ # # Optionally persist/save
354
+ # vector_store_manager.save(persist_dir)
355
+
356
+
259
357
  def generate_embeddings(
260
- df,
261
- vector_store_manager_type:Type[BaseVectorStoreManager],
262
- config_file,
263
- new_model_names,
264
- collections,
358
+ docs,
359
+ vector_store_manager_type:Type[BaseVectorStoreManager],
360
+ config_file:Optional[str],
361
+ new_model_names:Optional[List],
362
+ collections:Optional[Dict],
265
363
  persist_directory="embeddings",
266
364
  ):
267
365
  """
@@ -276,17 +374,16 @@ def generate_embeddings(
276
374
  raise ValueError("Either 'collections' or 'config_file' must be provided.")
277
375
  if collections and config_file:
278
376
  raise ValueError("Only one of 'collections' or 'config_file' should be provided.")
279
-
377
+
280
378
  if not collections:
281
379
  collections = load_configuration(config_file)
282
-
380
+ if collections is None:
381
+ raise ValueError("No collections found in the configuration file.")
283
382
  for collection_config in collections:
284
383
  id = collection_config['id']
285
384
  name = collection_config['name']
286
-
287
385
  if name not in new_model_names:
288
386
  continue
289
-
290
387
  embedding_model_type = collection_config.get('embedding_model_type')
291
388
  text_splitters_types = collection_config.get('text_splitters_types')
292
389
  batch_size = collection_config.get('batch_size')
@@ -336,15 +433,8 @@ def generate_embeddings(
336
433
  # Instantiate the manager class
337
434
  vector_store_manager = vector_store_manager_type(**manager_params)
338
435
 
339
- # Prepare documents
340
- # You may need to use a DataFrameLoader or similar to convert df to LangChain Documents
341
- loader = DataFrameLoader(
342
- data_frame=df, page_content_column=page_content_columns[0]
343
- )
344
- docs = loader.load()
345
-
346
436
  # Generate vector store
347
437
  vector_store_manager.generate_vector_store(docs)
348
438
 
349
439
  # Optionally persist/save
350
- vector_store_manager.save(persist_dir)
440
+ vector_store_manager.save(persist_dir)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ddi_fw
3
- Version: 0.0.251
3
+ Version: 0.0.253
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
File without changes
File without changes
File without changes