ddi-fw 0.0.227__py3-none-any.whl → 0.0.230__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- from .core import BaseDataset
1
+ from .core import BaseDataset,TextDatasetMixin
2
2
  from .ddi_mdl.base import DDIMDLDataset
3
3
  from .ddi_mdl_text.base import DDIMDLDatasetV2
4
4
  from .mdf_sa_ddi.base import MDFSADDIDataset
@@ -81,28 +81,34 @@ class NerParameterSearch(BaseModel):
81
81
  df = idf.to_dataframe()
82
82
  self.min_threshold_dict = {key: np.floor(df.describe()[key]["min"]) for key in df.describe().keys()}
83
83
  self.max_threshold_dict = {key: np.ceil(df.describe()[key]["max"]) for key in df.describe().keys()}
84
+
85
+ print("Minimum thresholds:", self.min_threshold_dict)
86
+ print("Maximum thresholds:", self.max_threshold_dict)
84
87
 
85
88
  # Generate datasets and items
86
89
  for column in self.columns:
87
90
  min_threshold = self.min_threshold_dict[column]
88
91
  max_threshold = self.max_threshold_dict[column]
89
- kwargs = {
92
+ thresholds = {
90
93
  "threshold_method": "idf",
91
- "tui_threshold": 0,
92
- "cui_threshold": 0,
93
- "entities_threshold": 0,
94
+ "tui": 0,
95
+ "cui": 0,
96
+ "entities": 0,
94
97
  }
95
98
  if self.dataset_additional_config:
96
- kwargs["additional_config"]= self.dataset_additional_config
99
+ kwargs= self.dataset_additional_config
97
100
 
98
101
  for threshold in np.arange(min_threshold, max_threshold, self.increase_step):
99
102
  if column.startswith("tui"):
100
- kwargs["tui_threshold"] = threshold
103
+ thresholds["tui"] = threshold
101
104
  if column.startswith("cui"):
102
- kwargs["cui_threshold"] = threshold
105
+ thresholds["cui"] = threshold
103
106
  if column.startswith("entities"):
104
- kwargs["entities_threshold"] = threshold
107
+ thresholds["entities"] = threshold
108
+ kwargs['ner']['thresholds'] = thresholds
105
109
 
110
+ print(f"Loading dataset for column: {column} with threshold: {threshold}")
111
+ # Create a new dataset instance for each threshold
106
112
  dataset = self.dataset_type(
107
113
  columns=[column],
108
114
  dataset_splitter_type=self.dataset_splitter_type,
@@ -1,6 +1,7 @@
1
1
  import numpy as np
2
2
  import pandas as pd
3
3
  from scipy.spatial.distance import pdist, squareform
4
+ from sklearn.preprocessing import MultiLabelBinarizer
4
5
 
5
6
  # todo pd.unique kullan
6
7
  def find_distinct_elements(frame):
@@ -29,13 +30,13 @@ class SimilarityMatrixGenerator:
29
30
  def __init__(self):
30
31
  pass
31
32
 
32
- def create_jaccard_similarity_matrices_ex(self, array):
33
+ def create_jaccard_similarity_matrices_ex_1(self, array):
33
34
  jaccard_sim = 1 - pdist(array, metric='jaccard')
34
35
  jaccard_sim_matrix = squareform(jaccard_sim)
35
36
  return jaccard_sim_matrix
36
37
 
37
38
  # https://github.com/YifanDengWHU/DDIMDL/blob/master/DDIMDL.py , def Jaccard(matrix):
38
- def create_jaccard_similarity_matrices(self, matrix)->np.ndarray:
39
+ def create_jaccard_similarity_matrices_ex_2(self, matrix)->np.ndarray:
39
40
  matrix = np.asmatrix(matrix)
40
41
  numerator = matrix * matrix.T
41
42
  denominator = np.ones(np.shape(matrix)) * matrix.T + \
@@ -43,6 +44,37 @@ class SimilarityMatrixGenerator:
43
44
  matrix = numerator / denominator
44
45
  return np.nan_to_num(matrix, nan=0.0)
45
46
  # return matrix
47
+
48
+ """produced from ChatGPT"""
49
+ def create_jaccard_similarity_matrices(self, matrix)->np.ndarray:
50
+ """
51
+ Efficiently compute the Jaccard similarity between rows of a binary matrix using vectorized operations.
52
+
53
+ Parameters:
54
+ matrix (np.ndarray): A 2D binary NumPy array (only 0s and 1s).
55
+
56
+ Returns:
57
+ np.ndarray: A 2D NumPy array containing the pairwise Jaccard similarity.
58
+ """
59
+ if not ((matrix == 0) | (matrix == 1)).all():
60
+ raise ValueError("Input matrix must be binary (contain only 0s and 1s).")
61
+
62
+ # Intersection: dot product (each pair of rows)
63
+ intersection = matrix @ matrix.T
64
+
65
+ # Row-wise sum (number of 1s per row)
66
+ row_sums = matrix.sum(axis=1, keepdims=True)
67
+
68
+ # Union: |A ∪ B| = |A| + |B| - |A ∩ B|
69
+ union = row_sums + row_sums.T - intersection
70
+
71
+ # Avoid division by zero
72
+ with np.errstate(divide='ignore', invalid='ignore'):
73
+ similarity = intersection / union
74
+ similarity[np.isnan(similarity)] = 1.0 # If both rows are all zeros, define similarity as 1
75
+
76
+ return similarity
77
+
46
78
 
47
79
 
48
80
  class VectorGenerator:
@@ -99,6 +131,7 @@ class VectorGenerator:
99
131
  bit_vectors.append(vector)
100
132
  print("array oluşturuldu")
101
133
  return np.array(bit_vectors)
134
+
102
135
 
103
136
  # def generate_feature_vector(self, column):
104
137
  # bit_vectors = []
@@ -120,6 +153,7 @@ class VectorGenerator:
120
153
  bit_vectors = self.generate_feature_vector(column)
121
154
  vectors[column] = bit_vectors
122
155
  return vectors
156
+
123
157
 
124
158
 
125
159
  # generate feature vector
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ddi_fw
3
- Version: 0.0.227
3
+ Version: 0.0.230
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -1,4 +1,4 @@
1
- ddi_fw/datasets/__init__.py,sha256=_I3iDHARwzmg7_EL5XKtB_TgG1yAkLSOVTujLL9Wz9Q,280
1
+ ddi_fw/datasets/__init__.py,sha256=qsiNp0JJz2djukwtBpcdryUPIXkBOxDA6whryu_jt04,297
2
2
  ddi_fw/datasets/core.py,sha256=p-e3wP5C_SCh0fMXioUHUXKvLVtyCrsQCFvKRnH4fjs,17008
3
3
  ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
4
4
  ddi_fw/datasets/db_utils.py,sha256=xRj28U_uXTRPHcz3yIICczFUHXUPiAOZtAj5BM6kH44,6465
@@ -86,7 +86,7 @@ ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,2
86
86
  ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
87
87
  ddi_fw/pipeline/multi_pipeline.py,sha256=EjJnA3Vzd-WeEvUBaA2LDOy_iQ5-2eW2VhtxvvxDPfQ,9857
88
88
  ddi_fw/pipeline/multi_pipeline_org.py,sha256=AbErwu05-3YIPnCcXRsj-jxPJG8HG2H7cMZlGjzaYa8,9037
89
- ddi_fw/pipeline/ner_pipeline.py,sha256=IVtmlBhQ73FeR0b26U33yWlNVwqiEqdvBAseTz6CVsk,6954
89
+ ddi_fw/pipeline/ner_pipeline.py,sha256=8PKTnpD2jw9AlpLScwmlvafCm73hOmcbzAkrxNNR7tE,7240
90
90
  ddi_fw/pipeline/pipeline.py,sha256=q1kMkW9-fOlrA4BOGUku40U_PuEYfcbtH2EvlRM4uTM,6243
91
91
  ddi_fw/utils/__init__.py,sha256=WNxkQXk-694roG50D355TGLXstfdWVb_tUyr-PM-8rg,537
92
92
  ddi_fw/utils/categorical_data_encoding_checker.py,sha256=T1X70Rh4atucAuqyUZmz-iFULllY9dY0NRyV9-jTjJ0,3438
@@ -99,9 +99,9 @@ ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,
99
99
  ddi_fw/utils/utils.py,sha256=PY-zDawREKoXQfzX7lVkxBLVFQPkfvr9385kHCjaNXo,4391
100
100
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
101
101
  ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
102
- ddi_fw/vectorization/feature_vector_generation.py,sha256=EBf-XAiwQwr68az91erEYNegfeqssBR29kVgrliIyac,4765
102
+ ddi_fw/vectorization/feature_vector_generation.py,sha256=93G3QM28uoNlvlVz_BhV6ARxldpogiNJStxHdsgqTbU,6026
103
103
  ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
104
- ddi_fw-0.0.227.dist-info/METADATA,sha256=yVVPcTBE4VRLFs4K7jWuOQWoLe-B_i8c8BV1YJCjI7U,2632
105
- ddi_fw-0.0.227.dist-info/WHEEL,sha256=lTU6B6eIfYoiQJTZNc-fyaR6BpL6ehTzU3xGYxn2n8k,91
106
- ddi_fw-0.0.227.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
107
- ddi_fw-0.0.227.dist-info/RECORD,,
104
+ ddi_fw-0.0.230.dist-info/METADATA,sha256=X5O1lK9xbfCA5wvuWOWGPWJ_Jkpad8b99cd2Wos1A6E,2632
105
+ ddi_fw-0.0.230.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
106
+ ddi_fw-0.0.230.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
107
+ ddi_fw-0.0.230.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.1)
2
+ Generator: setuptools (79.0.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5