ddi-fw 0.0.233__py3-none-any.whl → 0.0.235__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/vectorization/feature_vector_generation.py +41 -2
- {ddi_fw-0.0.233.dist-info → ddi_fw-0.0.235.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.233.dist-info → ddi_fw-0.0.235.dist-info}/RECORD +5 -5
- {ddi_fw-0.0.233.dist-info → ddi_fw-0.0.235.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.233.dist-info → ddi_fw-0.0.235.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,9 @@
|
|
1
|
+
import os
|
1
2
|
import numpy as np
|
2
3
|
import pandas as pd
|
3
4
|
from scipy.spatial.distance import pdist, squareform
|
4
5
|
from sklearn.preprocessing import MultiLabelBinarizer
|
6
|
+
import cupy as cp
|
5
7
|
|
6
8
|
# TODO: use pd.unique
|
7
9
|
def find_distinct_elements(frame):
|
@@ -28,7 +30,8 @@ def find_distinct_elements_count(frame):
|
|
28
30
|
|
29
31
|
class SimilarityMatrixGenerator:
|
30
32
|
def __init__(self):
|
31
|
-
|
33
|
+
# Check if GPU usage is enabled via an environment variable
|
34
|
+
self.use_gpu = os.getenv("SIMILARITY_MATRIX_USE_GPU", "false").lower() == "true"
|
32
35
|
|
33
36
|
def create_jaccard_similarity_matrices_ex_1(self, array):
|
34
37
|
jaccard_sim = 1 - pdist(array, metric='jaccard')
|
@@ -45,8 +48,16 @@ class SimilarityMatrixGenerator:
|
|
45
48
|
return np.nan_to_num(matrix, nan=0.0)
|
46
49
|
# return matrix
|
47
50
|
|
51
|
+
|
52
|
+
def create_jaccard_similarity_matrices(self, matrix: np.ndarray)->np.ndarray:
|
53
|
+
if self.use_gpu:
|
54
|
+
print("Using GPU for Jaccard similarity matrix computation.")
|
55
|
+
return self.__create_jaccard_similarity_matrices_gpu(matrix)
|
56
|
+
else:
|
57
|
+
return self.__create_jaccard_similarity_matrices(matrix)
|
58
|
+
|
48
59
|
"""produced from ChatGPT"""
|
49
|
-
def
|
60
|
+
def __create_jaccard_similarity_matrices(self, matrix: np.ndarray)->np.ndarray:
|
50
61
|
"""
|
51
62
|
Efficiently compute the Jaccard similarity between rows of a binary matrix using vectorized operations.
|
52
63
|
|
@@ -76,6 +87,34 @@ class SimilarityMatrixGenerator:
|
|
76
87
|
return similarity
|
77
88
|
|
78
89
|
|
90
|
+
def __create_jaccard_similarity_matrices_gpu(self,matrix: np.ndarray) -> np.ndarray:
|
91
|
+
"""
|
92
|
+
Efficiently compute the Jaccard similarity between rows of a binary matrix using GPU-accelerated CuPy.
|
93
|
+
|
94
|
+
Parameters:
|
95
|
+
matrix (np.ndarray): A 2D binary array (only 0s and 1s); it is moved to the GPU with cp.asarray before the computation.
|
96
|
+
|
97
|
+
Returns:
|
98
|
+
np.ndarray: A 2D NumPy array containing the pairwise Jaccard similarity (converted back from CuPy with cp.asnumpy).
|
99
|
+
"""
|
100
|
+
if not ((matrix == 0) | (matrix == 1)).all():
|
101
|
+
raise ValueError("Input matrix must be binary (contain only 0s and 1s).")
|
102
|
+
matrix = cp.asarray(matrix)
|
103
|
+
# Intersection: dot product (each pair of rows)
|
104
|
+
intersection = matrix @ matrix.T
|
105
|
+
|
106
|
+
# Row-wise sum (number of 1s per row)
|
107
|
+
row_sums = matrix.sum(axis=1, keepdims=True)
|
108
|
+
|
109
|
+
# Union: |A ∪ B| = |A| + |B| - |A ∩ B|
|
110
|
+
union = row_sums + row_sums.T - intersection
|
111
|
+
|
112
|
+
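# NOTE(review): intended to avoid division by zero, but the `where=union != 0` guard below is commented out, so pairs with an empty union (union == 0) are still divided — confirm the resulting values are handled by the caller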
|
113
|
+
similarity = cp.divide(intersection, union, out=cp.ones_like(intersection, dtype=cp.float64)) #, where=union != 0
|
114
|
+
|
115
|
+
return cp.asnumpy(similarity)
|
116
|
+
|
117
|
+
|
79
118
|
|
80
119
|
class VectorGenerator:
|
81
120
|
def __init__(self, df):
|
@@ -99,9 +99,9 @@ ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,
|
|
99
99
|
ddi_fw/utils/utils.py,sha256=PY-zDawREKoXQfzX7lVkxBLVFQPkfvr9385kHCjaNXo,4391
|
100
100
|
ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
|
101
101
|
ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
|
102
|
-
ddi_fw/vectorization/feature_vector_generation.py,sha256=
|
102
|
+
ddi_fw/vectorization/feature_vector_generation.py,sha256=QQQGhCti653BdU343Ag1bH_g1fzi2hlic7dgNy7otjE,7694
|
103
103
|
ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
|
104
|
-
ddi_fw-0.0.
|
105
|
-
ddi_fw-0.0.
|
106
|
-
ddi_fw-0.0.
|
107
|
-
ddi_fw-0.0.
|
104
|
+
ddi_fw-0.0.235.dist-info/METADATA,sha256=mww6smf4YRNE7iI-f_y9YcXJam05OJsfGX3XVMnmr5I,2632
|
105
|
+
ddi_fw-0.0.235.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
|
106
|
+
ddi_fw-0.0.235.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
|
107
|
+
ddi_fw-0.0.235.dist-info/RECORD,,
|
File without changes
|
File without changes
|