ddi-fw 0.0.261__py3-none-any.whl → 0.0.263__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  from ..langchain.embeddings import PoolingStrategy,SumPoolingStrategy,MeanPoolingStrategy,SentenceTransformerDecorator,PretrainedEmbeddings,SBertEmbeddings
2
- from .sentence_splitter import SentenceSplitter
2
+ from .sentence_splitter import SentenceSplitter,PassthroughTextSplitter
3
3
  # from .storage import DataFrameToVectorDB, generate_embeddings
4
4
  from .faiss_storage import BaseVectorStoreManager, FaissVectorStoreManager,generate_embeddings
5
5
  from .chroma_storage import ChromaVectorStoreManager
@@ -153,8 +153,8 @@ class ChromaVectorStoreManager(BaseVectorStoreManager):
153
153
  # print(f"{page_content_column}, size:{len(split_docs_chunk)}")
154
154
 
155
155
  @staticmethod
156
- def get_persist_dir(base_dir, suffix, config=None):
157
- return f"{base_dir}"
156
+ def get_persist_dir(base_dir,id, suffix, config=None):
157
+ return f"{base_dir}/chroma_db/{id}"
158
158
 
159
159
  def generate_vector_store(self, docs: List[Document]):
160
160
  self.vector_store = Chroma(
@@ -39,7 +39,7 @@ class BaseVectorStoreManager(BaseModel):
39
39
  raise NotImplementedError("This method should be implemented by subclasses.")
40
40
 
41
41
  @staticmethod
42
- def get_persist_dir(base_dir, suffix, config=None):
42
+ def get_persist_dir(base_dir, id ,suffix, config=None):
43
43
  raise NotImplementedError("Subclasses must implement get_persist_dir.")
44
44
 
45
45
 
@@ -66,8 +66,8 @@ class FaissVectorStoreManager(BaseVectorStoreManager):
66
66
  # self.vector_store.add_documents(documents=docs, ids=uuids)
67
67
 
68
68
  @staticmethod
69
- def get_persist_dir(base_dir, suffix, config=None):
70
- return f"{base_dir}/{suffix}"
69
+ def get_persist_dir(base_dir,id, suffix, config=None):
70
+ return f"{base_dir}/faiss/{id}/{suffix}"
71
71
 
72
72
  def initialize_embedding_dict(self, **kwargs):
73
73
  """
@@ -414,8 +414,9 @@ def generate_embeddings(
414
414
  # Load embedding model
415
415
  try:
416
416
  model_kwargs = collection_config.get('model_kwargs')
417
+ kwargs = {"model_kwargs":model_kwargs}
417
418
  model = get_import(embedding_model_type)(
418
- model_name=name, **model_kwargs)
419
+ model_name=name, **kwargs)
419
420
  except Exception as e:
420
421
  raise Exception(f"Unknown embedding model: {embedding_model_type}") from e
421
422
 
@@ -441,7 +442,7 @@ def generate_embeddings(
441
442
  print(f"{id}_{suffix}")
442
443
  # persist_dir = f'{persist_directory}/{id}/{suffix}'
443
444
  # persist_dir = f'{persist_directory}/{suffix}'
444
- persist_dir = vector_store_manager_type.get_persist_dir(persist_directory, suffix, collection_config)
445
+ persist_dir = vector_store_manager_type.get_persist_dir(persist_directory , id, suffix, collection_config)
445
446
 
446
447
  # Prepare manager parameters
447
448
  manager_params = {
@@ -5,6 +5,13 @@ from langchain_text_splitters.base import TextSplitter
5
5
 
6
6
  nltk.download('punkt')
7
7
 
8
+ ''' A text splitter that splits text into sentences using NLTK's sentence tokenizer.'''
8
9
  class SentenceSplitter(TextSplitter):
9
10
  def split_text(self, text: str) -> List[str]:
10
11
  return sent_tokenize(text)
12
+
13
+
14
+ ''' A text splitter that does not split the text at all, returning the entire text as a single chunk.'''
15
+ class PassthroughTextSplitter(TextSplitter):
16
+ def split_text(self, text: str) -> List[str]:
17
+ return [text]
ddi_fw/utils/kaggle.py CHANGED
@@ -37,6 +37,7 @@ def create_kaggle_dataset(base_path: str, collections: list):
37
37
 
38
38
  # Ensure title is between 6 and 50 characters
39
39
  if not (6 <= len(title) <= 50):
40
+ raise ValueError(f"Title length for {title} must be between 6 and 50 characters.")
40
41
  continue # Skip if title length is out of the expected range
41
42
 
42
43
  # Step 3: Define the metadata content
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ddi_fw
3
- Version: 0.0.261
3
+ Version: 0.0.263
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -3,11 +3,11 @@ ddi_fw/datasets/core.py,sha256=UnbCDoWXdxeiAb0e0anhDqXiVFGUi02VA9sKl6NVBZU,17409
3
3
  ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
4
4
  ddi_fw/datasets/db_utils.py,sha256=xRj28U_uXTRPHcz3yIICczFUHXUPiAOZtAj5BM6kH44,6465
5
5
  ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
6
- ddi_fw/langchain/__init__.py,sha256=Kk2Yr7vemjy9MNB_ImAWET808zt1JkLsWqsgEXpVPJk,421
7
- ddi_fw/langchain/chroma_storage.py,sha256=63_UojxGLbytgm4g2BZWdo2hvnWiVjrs4mZjxNxdkV8,15837
6
+ ddi_fw/langchain/__init__.py,sha256=97Y4lYuxShWqx5hfDbzf8VyV0HrM76fDlNp5xXusKQU,445
7
+ ddi_fw/langchain/chroma_storage.py,sha256=fOxoJoaqqyOKqtfUtlq2zJd-XY03rARTDvrPE_9nY2I,15855
8
8
  ddi_fw/langchain/embeddings.py,sha256=eEWy4okcjdhUJHi4N48Wd8XauPXyeaQVLUdNWEvtEcY,6754
9
- ddi_fw/langchain/faiss_storage.py,sha256=1G_lJ4_pKGEp5SlKBYUZWxEnGHuq3JGmvKeDaEztX8w,18646
10
- ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
9
+ ddi_fw/langchain/faiss_storage.py,sha256=Lu68vuXv3_BhGW7Jf4QDf-eyyN2ykrXTahJcueRSO7c,18715
10
+ ddi_fw/langchain/sentence_splitter.py,sha256=NCcDdDWDnwZTZDqarg-5gSbcDFoAM_sxcgH9ZCu97IA,597
11
11
  ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
12
12
  ddi_fw/ml/__init__.py,sha256=FteYEawCkVQOaK-cTv2VrHZ2ZnfeFr31BD6VucO7_DQ,268
13
13
  ddi_fw/ml/evaluation_helper.py,sha256=2-7CLSgGTqLEk4HkgCVIOt-GxfLAn6SBozJghAtHb5M,11581
@@ -29,7 +29,7 @@ ddi_fw/utils/__init__.py,sha256=WNxkQXk-694roG50D355TGLXstfdWVb_tUyr-PM-8rg,537
29
29
  ddi_fw/utils/categorical_data_encoding_checker.py,sha256=T1X70Rh4atucAuqyUZmz-iFULllY9dY0NRyV9-jTjJ0,3438
30
30
  ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
31
31
  ddi_fw/utils/json_helper.py,sha256=BVU6wmJgdXPxyqLPu3Ck_9Es5RrP1PDanKvE-OSj1D4,571
32
- ddi_fw/utils/kaggle.py,sha256=ZlKS4kZVsNV_JNYO4IvqY4GkhQCvl-_5saaz_bZzgrQ,2508
32
+ ddi_fw/utils/kaggle.py,sha256=itisQ5nffYMZz6gFYMdmbrpo2qaQvFVmLiRCC73MB1U,2604
33
33
  ddi_fw/utils/numpy_utils.py,sha256=gd1WNq5NpWD2MBEMTtFuS5I0h8B6FAUNcq6BVOlxdhY,797
34
34
  ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5I,1056
35
35
  ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
@@ -38,7 +38,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
38
38
  ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
39
39
  ddi_fw/vectorization/feature_vector_generation.py,sha256=92bhZw4Qxh0hqPK-bPHm9bUO7pg2p4cStQYtVrOtetE,7919
40
40
  ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
41
- ddi_fw-0.0.261.dist-info/METADATA,sha256=P0xexzVAJfggUMTjYeClVcqkap4mhjRzzLJCfDR86II,2623
42
- ddi_fw-0.0.261.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
43
- ddi_fw-0.0.261.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
44
- ddi_fw-0.0.261.dist-info/RECORD,,
41
+ ddi_fw-0.0.263.dist-info/METADATA,sha256=CIRb9UVKCTd-6aXXLaeK4I_LR1Wn2TghfQjqJ3CiWAU,2623
42
+ ddi_fw-0.0.263.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
43
+ ddi_fw-0.0.263.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
44
+ ddi_fw-0.0.263.dist-info/RECORD,,