ddi-fw 0.0.108__py3-none-any.whl → 0.0.109__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/langchain/storage.py +39 -2
- {ddi_fw-0.0.108.dist-info → ddi_fw-0.0.109.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.108.dist-info → ddi_fw-0.0.109.dist-info}/RECORD +5 -5
- {ddi_fw-0.0.108.dist-info → ddi_fw-0.0.109.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.108.dist-info → ddi_fw-0.0.109.dist-info}/top_level.txt +0 -0
ddi_fw/langchain/storage.py
CHANGED
@@ -57,6 +57,42 @@ def split_dataframe(df, min_size=512):
|
|
57
57
|
|
58
58
|
return partial_dfs
|
59
59
|
|
60
|
+
def split_dataframe_indices(df, min_size=512):
    """Return inclusive ``(start, end)`` index pairs that partition ``df``.

    Only ``len(df)`` is used, so any sized object (DataFrame, list, ...)
    is accepted. The chunks are contiguous, non-overlapping and cover
    every position from ``0`` to ``len(df) - 1``.

    Every chunk holds exactly ``min_size`` rows except the last one, which
    also absorbs the remainder and therefore holds between ``min_size``
    and ``2 * min_size - 1`` rows. An input with at most ``min_size`` rows
    is returned as a single chunk.

    Args:
        df: Sized object to partition.
        min_size: Minimum number of rows per chunk; must be at least 1.

    Returns:
        A list of ``(start, end)`` tuples where both bounds are inclusive,
        i.e. a chunk is obtained with ``df[start:end + 1]``. The list is
        empty when ``df`` is empty.

    Raises:
        ValueError: If ``min_size`` is smaller than 1.
    """
    if min_size < 1:
        # Fail with a clear message instead of an incidental
        # ZeroDivisionError (min_size == 0) or a silent single chunk
        # (negative min_size).
        raise ValueError(f"min_size must be at least 1, got {min_size}")

    total_size = len(df)

    # Nothing to split: do not report the invalid range (0, -1), which
    # would make callers process an empty batch.
    if total_size == 0:
        return []

    # An input no larger than min_size is a single chunk.
    if total_size <= min_size:
        return [(0, total_size - 1)]

    num_chunks = total_size // min_size

    # All chunks but the last contain exactly min_size rows.
    chunk_indices = [
        (i * min_size, (i + 1) * min_size - 1)
        for i in range(num_chunks - 1)
    ]
    # The last chunk takes every remaining row so that no chunk ends up
    # smaller than min_size.
    chunk_indices.append(((num_chunks - 1) * min_size, total_size - 1))

    return chunk_indices
|
95
|
+
|
60
96
|
class DataFrameToVectorDB:
|
61
97
|
def __init__(self,
|
62
98
|
collection_name,
|
@@ -78,8 +114,9 @@ class DataFrameToVectorDB:
|
|
78
114
|
return docs
|
79
115
|
|
80
116
|
def __split_list(self, input_list, batch_size):
    """Yield consecutive slices of ``input_list`` for batched processing.

    Chunk boundaries are computed by ``split_dataframe_indices`` with
    ``batch_size`` as the minimum chunk size: each batch holds
    ``batch_size`` items, except the last one, which also absorbs the
    remainder (up to ``2 * batch_size - 1`` items). An input shorter than
    ``batch_size`` is yielded as a single batch.

    Args:
        input_list: Sliceable, sized collection to split.
        batch_size: Minimum number of items per yielded batch.

    Yields:
        Slices of ``input_list`` that together cover it in order.
    """
    # Forward batch_size to the helper; without it the argument was
    # silently ignored and the helper's default of 512 was always used.
    for start, end in split_dataframe_indices(input_list, batch_size):
        # ``end`` is inclusive, so the slice must extend one past it.
        yield input_list[start:end + 1]
|
83
120
|
|
84
121
|
def store_documents(self, df, columns, page_content_columns):
|
85
122
|
"""
|
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
|
|
58
58
|
ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
|
59
59
|
ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
|
60
60
|
ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
|
61
|
-
ddi_fw/langchain/storage.py,sha256=
|
61
|
+
ddi_fw/langchain/storage.py,sha256=02cIFOrPxo2Nav5ufkKLOh8cfVY8QupxiO6rzRjNB7I,10399
|
62
62
|
ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
|
63
63
|
ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
|
64
64
|
ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
|
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
|
|
91
91
|
ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
|
92
92
|
ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
|
93
93
|
ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
|
94
|
-
ddi_fw-0.0.
|
95
|
-
ddi_fw-0.0.
|
96
|
-
ddi_fw-0.0.
|
97
|
-
ddi_fw-0.0.
|
94
|
+
ddi_fw-0.0.109.dist-info/METADATA,sha256=0kpJYBb0pe7lXlETKf2vSuK0_dOizijcPX0N4h4aBN8,1967
|
95
|
+
ddi_fw-0.0.109.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
96
|
+
ddi_fw-0.0.109.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
|
97
|
+
ddi_fw-0.0.109.dist-info/RECORD,,
|
File without changes
|
File without changes
|