ddi-fw 0.0.262__py3-none-any.whl → 0.0.264__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/core.py +1 -1
- ddi_fw/langchain/faiss_storage.py +47 -2
- ddi_fw/pipeline/multi_pipeline.py +3 -2
- ddi_fw/utils/kaggle.py +1 -0
- {ddi_fw-0.0.262.dist-info → ddi_fw-0.0.264.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.262.dist-info → ddi_fw-0.0.264.dist-info}/RECORD +8 -8
- {ddi_fw-0.0.262.dist-info → ddi_fw-0.0.264.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.262.dist-info → ddi_fw-0.0.264.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/core.py
CHANGED
@@ -94,7 +94,7 @@ class BaseDataset(BaseModel, abc.ABC):
|
|
94
94
|
|
95
95
|
train_data,test_data = np.stack(train_data.flatten().tolist()), np.stack(test_data.flatten().tolist())
|
96
96
|
column = self.columns[0] if self.columns else 'default'
|
97
|
-
items.append([f'
|
97
|
+
items.append([f'{column}', np.nan_to_num(train_data),
|
98
98
|
y_train_label, np.nan_to_num(test_data), y_test_label])
|
99
99
|
else:
|
100
100
|
for index, column in enumerate(self.columns):
|
@@ -414,8 +414,9 @@ def generate_embeddings(
|
|
414
414
|
# Load embedding model
|
415
415
|
try:
|
416
416
|
model_kwargs = collection_config.get('model_kwargs')
|
417
|
+
kwargs = {"model_kwargs":model_kwargs}
|
417
418
|
model = get_import(embedding_model_type)(
|
418
|
-
model_name=name, **
|
419
|
+
model_name=name, **kwargs)
|
419
420
|
except Exception as e:
|
420
421
|
raise Exception(f"Unknown embedding model: {embedding_model_type}") from e
|
421
422
|
|
@@ -459,4 +460,48 @@ def generate_embeddings(
|
|
459
460
|
vector_store_manager.generate_vector_store(docs)
|
460
461
|
|
461
462
|
# Optionally persist/save
|
462
|
-
vector_store_manager.save(persist_dir)
|
463
|
+
vector_store_manager.save(persist_dir)
|
464
|
+
|
465
|
+
|
466
|
+
import os
|
467
|
+
import json
|
468
|
+
|
469
|
+
def generate_embeddings_for_json_object(
|
470
|
+
obj_json: dict,
|
471
|
+
vector_store_manager_type: Type[BaseVectorStoreManager],
|
472
|
+
persist_root: str = "./embeddings",
|
473
|
+
new_model_names: Optional[List] = None,
|
474
|
+
docs=None
|
475
|
+
):
|
476
|
+
"""
|
477
|
+
Generate embeddings for all collections in the given JSON object, storing them in a container folder.
|
478
|
+
|
479
|
+
Args:
|
480
|
+
obj_json: JSON object with 'id', 'name', and 'collections' keys.
|
481
|
+
vector_store_manager_type: The vector store manager class to use.
|
482
|
+
persist_root: Root directory for all embeddings.
|
483
|
+
new_model_names: Optional list of model names to filter collections.
|
484
|
+
docs: Documents to embed (if needed).
|
485
|
+
"""
|
486
|
+
obj_id = obj_json.get("id")
|
487
|
+
obj_name = obj_json.get("name")
|
488
|
+
collections = obj_json.get("collections", [])
|
489
|
+
|
490
|
+
if not obj_id:
|
491
|
+
raise ValueError("JSON object must have an 'id' field.")
|
492
|
+
if not collections:
|
493
|
+
raise ValueError("No collections found in the given JSON object.")
|
494
|
+
|
495
|
+
# Create container directory for this object
|
496
|
+
container_dir = os.path.join(persist_root, str(obj_id))
|
497
|
+
os.makedirs(container_dir, exist_ok=True)
|
498
|
+
|
499
|
+
# Call your existing function
|
500
|
+
generate_embeddings(
|
501
|
+
docs=docs,
|
502
|
+
vector_store_manager_type=vector_store_manager_type,
|
503
|
+
config_file=None,
|
504
|
+
new_model_names=new_model_names,
|
505
|
+
collections=collections,
|
506
|
+
persist_directory=container_dir
|
507
|
+
)
|
@@ -149,8 +149,9 @@ class MultiPipeline():
|
|
149
149
|
|
150
150
|
# Default model configuration
|
151
151
|
default_model = config.get("default_model", {})
|
152
|
-
|
153
|
-
|
152
|
+
if default_model:
|
153
|
+
default_model_type = get_import(default_model.get("model_type"))
|
154
|
+
default_model_params = default_model.get("params", {})
|
154
155
|
|
155
156
|
multi_modal = config.get("multi_modal")
|
156
157
|
|
ddi_fw/utils/kaggle.py
CHANGED
@@ -37,6 +37,7 @@ def create_kaggle_dataset(base_path: str, collections: list):
|
|
37
37
|
|
38
38
|
# Ensure title is between 6 and 50 characters
|
39
39
|
if not (6 <= len(title) <= 50):
|
40
|
+
raise ValueError(f"Title length for {title} must be between 6 and 50 characters.")
|
40
41
|
continue # Skip if title length is out of the expected range
|
41
42
|
|
42
43
|
# Step 3: Define the metadata content
|
@@ -1,12 +1,12 @@
|
|
1
1
|
ddi_fw/datasets/__init__.py,sha256=NozQvXPYIS01U0srZmcKhiqJgRDkD-C-VXHL6sKrFSw,166
|
2
|
-
ddi_fw/datasets/core.py,sha256=
|
2
|
+
ddi_fw/datasets/core.py,sha256=Nc0OnaYi0hIYuGCdxOCyT2X4mdWK0wyVSxUw6836fKk,17410
|
3
3
|
ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
|
4
4
|
ddi_fw/datasets/db_utils.py,sha256=xRj28U_uXTRPHcz3yIICczFUHXUPiAOZtAj5BM6kH44,6465
|
5
5
|
ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
|
6
6
|
ddi_fw/langchain/__init__.py,sha256=97Y4lYuxShWqx5hfDbzf8VyV0HrM76fDlNp5xXusKQU,445
|
7
7
|
ddi_fw/langchain/chroma_storage.py,sha256=fOxoJoaqqyOKqtfUtlq2zJd-XY03rARTDvrPE_9nY2I,15855
|
8
8
|
ddi_fw/langchain/embeddings.py,sha256=eEWy4okcjdhUJHi4N48Wd8XauPXyeaQVLUdNWEvtEcY,6754
|
9
|
-
ddi_fw/langchain/faiss_storage.py,sha256=
|
9
|
+
ddi_fw/langchain/faiss_storage.py,sha256=M-pogVtmESi_sXsBCEcTItz1-NDILllCAB41Pg54kNo,20235
|
10
10
|
ddi_fw/langchain/sentence_splitter.py,sha256=NCcDdDWDnwZTZDqarg-5gSbcDFoAM_sxcgH9ZCu97IA,597
|
11
11
|
ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
|
12
12
|
ddi_fw/ml/__init__.py,sha256=FteYEawCkVQOaK-cTv2VrHZ2ZnfeFr31BD6VucO7_DQ,268
|
@@ -21,7 +21,7 @@ ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6
|
|
21
21
|
ddi_fw/ner/ner.py,sha256=FHyyX53Xwpdw8Hec261dyN88yD7Z9LmJua2mIrQLguI,17967
|
22
22
|
ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
|
23
23
|
ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
|
24
|
-
ddi_fw/pipeline/multi_pipeline.py,sha256=
|
24
|
+
ddi_fw/pipeline/multi_pipeline.py,sha256=ck6VhWF4dDTqeJu7Z0VYBYSxIcRUJLAYj01P6AplQgg,10241
|
25
25
|
ddi_fw/pipeline/multi_pipeline_org.py,sha256=AbErwu05-3YIPnCcXRsj-jxPJG8HG2H7cMZlGjzaYa8,9037
|
26
26
|
ddi_fw/pipeline/ner_pipeline.py,sha256=1gBk81LeZlU1rhjJ1qBgHbFt_HqOeJ5WLnJ4AkYku4s,8188
|
27
27
|
ddi_fw/pipeline/pipeline.py,sha256=m6pZrhoBK2lUr7PwpmJl6-WEpYcPGGc9N9C1LNJ78NQ,6974
|
@@ -29,7 +29,7 @@ ddi_fw/utils/__init__.py,sha256=WNxkQXk-694roG50D355TGLXstfdWVb_tUyr-PM-8rg,537
|
|
29
29
|
ddi_fw/utils/categorical_data_encoding_checker.py,sha256=T1X70Rh4atucAuqyUZmz-iFULllY9dY0NRyV9-jTjJ0,3438
|
30
30
|
ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
|
31
31
|
ddi_fw/utils/json_helper.py,sha256=BVU6wmJgdXPxyqLPu3Ck_9Es5RrP1PDanKvE-OSj1D4,571
|
32
|
-
ddi_fw/utils/kaggle.py,sha256=
|
32
|
+
ddi_fw/utils/kaggle.py,sha256=itisQ5nffYMZz6gFYMdmbrpo2qaQvFVmLiRCC73MB1U,2604
|
33
33
|
ddi_fw/utils/numpy_utils.py,sha256=gd1WNq5NpWD2MBEMTtFuS5I0h8B6FAUNcq6BVOlxdhY,797
|
34
34
|
ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5I,1056
|
35
35
|
ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
|
@@ -38,7 +38,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
|
|
38
38
|
ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
|
39
39
|
ddi_fw/vectorization/feature_vector_generation.py,sha256=92bhZw4Qxh0hqPK-bPHm9bUO7pg2p4cStQYtVrOtetE,7919
|
40
40
|
ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
|
41
|
-
ddi_fw-0.0.
|
42
|
-
ddi_fw-0.0.
|
43
|
-
ddi_fw-0.0.
|
44
|
-
ddi_fw-0.0.
|
41
|
+
ddi_fw-0.0.264.dist-info/METADATA,sha256=PKLhkkd6zsEA_YtV_4vVLh0K-pgLTzpf6IDH_ETlsek,2623
|
42
|
+
ddi_fw-0.0.264.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
43
|
+
ddi_fw-0.0.264.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
|
44
|
+
ddi_fw-0.0.264.dist-info/RECORD,,
|
File without changes
|
File without changes
|