ddi-fw 0.0.232__py3-none-any.whl → 0.0.234__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -130,7 +130,7 @@ class NerParameterSearch(BaseModel):
130
130
 
131
131
  for item in group_items:
132
132
  item[0] = f"threshold_{item[0]}_{threshold}"
133
- self.datasets[item[0]] = dataset
133
+ # self.datasets[item[0]] = dataset
134
134
 
135
135
  self.items.extend(group_items)
136
136
 
@@ -1,3 +1,4 @@
1
+ import os
1
2
  import numpy as np
2
3
  import pandas as pd
3
4
  from scipy.spatial.distance import pdist, squareform
@@ -28,7 +29,8 @@ def find_distinct_elements_count(frame):
28
29
 
29
30
  class SimilarityMatrixGenerator:
30
31
  def __init__(self):
31
- pass
32
+ # Check if GPU usage is enabled via an environment variable
33
+ self.use_gpu = os.getenv("SIMILARITY_MATRIX_USE_GPU", "false").lower() == "true"
32
34
 
33
35
  def create_jaccard_similarity_matrices_ex_1(self, array):
34
36
  jaccard_sim = 1 - pdist(array, metric='jaccard')
@@ -45,8 +47,15 @@ class SimilarityMatrixGenerator:
45
47
  return np.nan_to_num(matrix, nan=0.0)
46
48
  # return matrix
47
49
 
50
+
51
+ def create_jaccard_similarity_matrices(self, matrix: np.ndarray)->np.ndarray:
52
+ if self.use_gpu:
53
+ return self.__create_jaccard_similarity_matrices_gpu(matrix)
54
+ else:
55
+ return self.__create_jaccard_similarity_matrices(matrix)
56
+
48
57
  """produced from ChatGPT"""
49
- def create_jaccard_similarity_matrices(self, matrix)->np.ndarray:
58
+ def __create_jaccard_similarity_matrices(self, matrix: np.ndarray)->np.ndarray:
50
59
  """
51
60
  Efficiently compute the Jaccard similarity between rows of a binary matrix using vectorized operations.
52
61
 
@@ -75,6 +84,35 @@ class SimilarityMatrixGenerator:
75
84
 
76
85
  return similarity
77
86
 
87
+ import cupy as cp
88
+
89
+ def __create_jaccard_similarity_matrices_gpu(self,matrix: np.ndarray) -> np.ndarray:
90
+ """
91
+ Efficiently compute the Jaccard similarity between rows of a binary matrix using GPU-accelerated CuPy.
92
+
93
+ Parameters:
94
+ matrix (np.ndarray): A 2D binary NumPy array (only 0s and 1s); it is copied to the GPU with cp.asarray before the computation.
95
+
96
+ Returns:
97
+ np.ndarray: A 2D NumPy array containing the pairwise Jaccard similarity (converted back from CuPy with cp.asnumpy).
98
+ """
99
+ if not ((matrix == 0) | (matrix == 1)).all():
100
+ raise ValueError("Input matrix must be binary (contain only 0s and 1s).")
101
+ matrix = cp.asarray(matrix)
102
+ # Intersection: dot product (each pair of rows)
103
+ intersection = matrix @ matrix.T
104
+
105
+ # Row-wise sum (number of 1s per row)
106
+ row_sums = matrix.sum(axis=1, keepdims=True)
107
+
108
+ # Union: |A ∪ B| = |A| + |B| - |A ∩ B|
109
+ union = row_sums + row_sums.T - intersection
110
+
111
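+ # NOTE(review): intended to avoid division by zero, but the `where=union != 0` guard below is commented out, so pairs of all-zero rows (0/0) presumably yield NaN instead of the 1.0 prefilled in `out` -- TODO confirm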
112
+ similarity = cp.divide(intersection, union, out=cp.ones_like(intersection, dtype=cp.float64)) #, where=union != 0
113
+
114
+ return cp.asnumpy(similarity)
115
+
78
116
 
79
117
 
80
118
  class VectorGenerator:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ddi_fw
3
- Version: 0.0.232
3
+ Version: 0.0.234
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -86,7 +86,7 @@ ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,2
86
86
  ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
87
87
  ddi_fw/pipeline/multi_pipeline.py,sha256=EjJnA3Vzd-WeEvUBaA2LDOy_iQ5-2eW2VhtxvvxDPfQ,9857
88
88
  ddi_fw/pipeline/multi_pipeline_org.py,sha256=AbErwu05-3YIPnCcXRsj-jxPJG8HG2H7cMZlGjzaYa8,9037
89
- ddi_fw/pipeline/ner_pipeline.py,sha256=BycxZvI7JRJ3s3HhYAgOxG2_lqrVnhv7ECOWSgVQhz4,8186
89
+ ddi_fw/pipeline/ner_pipeline.py,sha256=1gBk81LeZlU1rhjJ1qBgHbFt_HqOeJ5WLnJ4AkYku4s,8188
90
90
  ddi_fw/pipeline/pipeline.py,sha256=q1kMkW9-fOlrA4BOGUku40U_PuEYfcbtH2EvlRM4uTM,6243
91
91
  ddi_fw/utils/__init__.py,sha256=WNxkQXk-694roG50D355TGLXstfdWVb_tUyr-PM-8rg,537
92
92
  ddi_fw/utils/categorical_data_encoding_checker.py,sha256=T1X70Rh4atucAuqyUZmz-iFULllY9dY0NRyV9-jTjJ0,3438
@@ -99,9 +99,9 @@ ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,
99
99
  ddi_fw/utils/utils.py,sha256=PY-zDawREKoXQfzX7lVkxBLVFQPkfvr9385kHCjaNXo,4391
100
100
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
101
101
  ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
102
- ddi_fw/vectorization/feature_vector_generation.py,sha256=93G3QM28uoNlvlVz_BhV6ARxldpogiNJStxHdsgqTbU,6026
102
+ ddi_fw/vectorization/feature_vector_generation.py,sha256=-M8Y82mKZB6kwhJHvPgmQZSwUCheTLbb5zYuEtHZC1Y,7623
103
103
  ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
104
- ddi_fw-0.0.232.dist-info/METADATA,sha256=CBSE9xsWEc0vxlCxw9NCLxTJWXmXoFTegN5wgrHVGvA,2632
105
- ddi_fw-0.0.232.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
106
- ddi_fw-0.0.232.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
107
- ddi_fw-0.0.232.dist-info/RECORD,,
104
+ ddi_fw-0.0.234.dist-info/METADATA,sha256=DxrU3PdEv-7FKmZ3bJ4OAI465EsN-sKaxDnTQSTO19A,2632
105
+ ddi_fw-0.0.234.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
106
+ ddi_fw-0.0.234.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
107
+ ddi_fw-0.0.234.dist-info/RECORD,,