ddi-fw 0.0.254__py3-none-any.whl → 0.0.256__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/core.py +1 -1
- ddi_fw/langchain/faiss_storage.py +6 -5
- ddi_fw/vectorization/feature_vector_generation.py +6 -1
- {ddi_fw-0.0.254.dist-info → ddi_fw-0.0.256.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.254.dist-info → ddi_fw-0.0.256.dist-info}/RECORD +7 -7
- {ddi_fw-0.0.254.dist-info → ddi_fw-0.0.256.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.254.dist-info → ddi_fw-0.0.256.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/core.py
CHANGED
@@ -379,7 +379,7 @@ class TextDatasetMixin(BaseModel):
|
|
379
379
|
if self.vector_store_manager is not None:
|
380
380
|
self.embedding_dict = self.vector_store_manager.initialize_embedding_dict()
|
381
381
|
# self.embedding_dict = self.__initialize_embedding_dict()
|
382
|
-
|
382
|
+
self.__calculate_embedding_size()
|
383
383
|
|
384
384
|
|
385
385
|
|
ddi_fw/langchain/faiss_storage.py
CHANGED
@@ -37,7 +37,7 @@ class BaseVectorStoreManager(BaseModel):
|
|
37
37
|
raise NotImplementedError("This method should be implemented by subclasses.")
|
38
38
|
|
39
39
|
class FaissVectorStoreManager(BaseVectorStoreManager):
|
40
|
-
persist_directory: str = Field(default="./embeddings")
|
40
|
+
persist_directory: str = Field(default="./embeddings/faiss")
|
41
41
|
index: Any = None
|
42
42
|
vector_store: Optional[FAISS] | None = None
|
43
43
|
class Config:
|
@@ -360,7 +360,7 @@ def generate_embeddings(
|
|
360
360
|
config_file:Optional[str],
|
361
361
|
new_model_names:Optional[List],
|
362
362
|
collections:Optional[Dict],
|
363
|
-
persist_directory="embeddings",
|
363
|
+
persist_directory="./embeddings",
|
364
364
|
):
|
365
365
|
"""
|
366
366
|
Generate embeddings for collections based on a configuration file.
|
@@ -390,7 +390,7 @@ def generate_embeddings(
|
|
390
390
|
partial_df_size = collection_config.get('partial_dataframe_size')
|
391
391
|
columns = collection_config.get('columns')
|
392
392
|
page_content_columns = collection_config.get('page_content_columns')
|
393
|
-
|
393
|
+
|
394
394
|
|
395
395
|
# Load embedding model
|
396
396
|
try:
|
@@ -417,10 +417,11 @@ def generate_embeddings(
|
|
417
417
|
text_splitters_suffixes.append(suffix)
|
418
418
|
except Exception as e:
|
419
419
|
raise Exception(f"Unknown text splitter: {text_splitter_type}") from e
|
420
|
-
|
420
|
+
|
421
421
|
for text_splitter, suffix in zip(text_splitters, text_splitters_suffixes):
|
422
422
|
print(f"{id}_{suffix}")
|
423
|
-
|
423
|
+
# persist_dir = f'{persist_directory}/{id}/{suffix}'
|
424
|
+
persist_dir = f'{persist_directory}/{suffix}'
|
424
425
|
# Prepare manager parameters
|
425
426
|
manager_params = {
|
426
427
|
"collection_name": f"{id}_{suffix}",
|
ddi_fw/vectorization/feature_vector_generation.py
CHANGED
@@ -3,7 +3,6 @@ import numpy as np
|
|
3
3
|
import pandas as pd
|
4
4
|
from scipy.spatial.distance import pdist, squareform
|
5
5
|
from sklearn.preprocessing import MultiLabelBinarizer
|
6
|
-
import cupy as cp
|
7
6
|
|
8
7
|
# todo pd.unique kullan
|
9
8
|
def find_distinct_elements(frame):
|
@@ -97,6 +96,12 @@ class SimilarityMatrixGenerator:
|
|
97
96
|
Returns:
|
98
97
|
cp.ndarray: A 2D CuPy array containing the pairwise Jaccard similarity.
|
99
98
|
"""
|
99
|
+
try:
|
100
|
+
import cupy as cp
|
101
|
+
except ImportError:
|
102
|
+
raise ImportError("cupy is required for GPU Jaccard similarity computation. Please install it with 'pip install cupy'.")
|
103
|
+
|
104
|
+
|
100
105
|
if not ((matrix == 0) | (matrix == 1)).all():
|
101
106
|
raise ValueError("Input matrix must be binary (contain only 0s and 1s).")
|
102
107
|
matrix = cp.asarray(matrix)
|
{ddi_fw-0.0.254.dist-info → ddi_fw-0.0.256.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
ddi_fw/datasets/__init__.py,sha256=NozQvXPYIS01U0srZmcKhiqJgRDkD-C-VXHL6sKrFSw,166
|
2
|
-
ddi_fw/datasets/core.py,sha256=
|
2
|
+
ddi_fw/datasets/core.py,sha256=UnbCDoWXdxeiAb0e0anhDqXiVFGUi02VA9sKl6NVBZU,17409
|
3
3
|
ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
|
4
4
|
ddi_fw/datasets/db_utils.py,sha256=xRj28U_uXTRPHcz3yIICczFUHXUPiAOZtAj5BM6kH44,6465
|
5
5
|
ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
|
6
6
|
ddi_fw/langchain/__init__.py,sha256=Kk2Yr7vemjy9MNB_ImAWET808zt1JkLsWqsgEXpVPJk,421
|
7
7
|
ddi_fw/langchain/chroma_storage.py,sha256=7LSUhdiPdQHZvKC_NapOeVbHtS71iE5ABZVTrI0YQ-A,15520
|
8
8
|
ddi_fw/langchain/embeddings.py,sha256=eEWy4okcjdhUJHi4N48Wd8XauPXyeaQVLUdNWEvtEcY,6754
|
9
|
-
ddi_fw/langchain/faiss_storage.py,sha256=
|
9
|
+
ddi_fw/langchain/faiss_storage.py,sha256=LG2cf6upCEHBBF4Jixnq_diA_t3A0OQZBqPc9DwmILc,17825
|
10
10
|
ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
|
11
11
|
ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
|
12
12
|
ddi_fw/ml/__init__.py,sha256=FteYEawCkVQOaK-cTv2VrHZ2ZnfeFr31BD6VucO7_DQ,268
|
@@ -36,9 +36,9 @@ ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,
|
|
36
36
|
ddi_fw/utils/utils.py,sha256=PY-zDawREKoXQfzX7lVkxBLVFQPkfvr9385kHCjaNXo,4391
|
37
37
|
ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
|
38
38
|
ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
|
39
|
-
ddi_fw/vectorization/feature_vector_generation.py,sha256=
|
39
|
+
ddi_fw/vectorization/feature_vector_generation.py,sha256=92bhZw4Qxh0hqPK-bPHm9bUO7pg2p4cStQYtVrOtetE,7919
|
40
40
|
ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
|
41
|
-
ddi_fw-0.0.
|
42
|
-
ddi_fw-0.0.
|
43
|
-
ddi_fw-0.0.
|
44
|
-
ddi_fw-0.0.
|
41
|
+
ddi_fw-0.0.256.dist-info/METADATA,sha256=4Gwtn1oiXo2cJ8osjOXRRu7JIQuP8i3PWjOsE7B56t0,2623
|
42
|
+
ddi_fw-0.0.256.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
43
|
+
ddi_fw-0.0.256.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
|
44
|
+
ddi_fw-0.0.256.dist-info/RECORD,,
|
File without changes
|
File without changes
|