metacountregressor 0.1.130__tar.gz → 0.1.133__tar.gz

Files changed (27)
  1. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/PKG-INFO +1 -1
  2. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/helperprocess.py +20 -4
  3. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor.egg-info/PKG-INFO +1 -1
  4. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/LICENSE.txt +0 -0
  5. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/README.rst +0 -0
  6. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/__init__.py +0 -0
  7. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/_device_cust.py +0 -0
  8. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/app_main.py +0 -0
  9. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/data_split_helper.py +0 -0
  10. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/halton.py +0 -0
  11. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/main.py +0 -0
  12. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/main_old.py +0 -0
  13. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/metaheuristics.py +0 -0
  14. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/pareto_file.py +0 -0
  15. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/pareto_logger__plot.py +0 -0
  16. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/setup.py +0 -0
  17. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/single_objective_finder.py +0 -0
  18. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/solution.py +0 -0
  19. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor/test_generated_paper2.py +0 -0
  20. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor.egg-info/SOURCES.txt +0 -0
  21. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor.egg-info/dependency_links.txt +0 -0
  22. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor.egg-info/not-zip-safe +0 -0
  23. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor.egg-info/requires.txt +0 -0
  24. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/metacountregressor.egg-info/top_level.txt +0 -0
  25. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/setup.cfg +0 -0
  26. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/setup.py +0 -0
  27. {metacountregressor-0.1.130 → metacountregressor-0.1.133}/tests/test.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: metacountregressor
-Version: 0.1.130
+Version: 0.1.133
 Summary: Extensions for a Python package for estimation of count models.
 Home-page: https://github.com/zahern/CountDataEstimation
 Author: Zeke Ahern
metacountregressor/helperprocess.py
@@ -183,15 +183,28 @@ config = {
 # Function to guess Low, Medium, High ranges
 def guess_low_medium_high(column_name, series):
     # Compute the tertiles (33rd and 66th percentiles)
-    print('why')
+    print('did it make it...')
     mode_value = np.mode(series)  # Get the most frequent value
+    print('good')
     series = pd.to_numeric(series, errors='coerce').fillna(mode_value)
     low_threshold = np.quantile(series, 0.33)
     high_threshold = np.quantile(series, 0.66)
 
     # Define the bins and labels
     bins = [np.min(series) - 1, low_threshold, high_threshold, np.max(series)]
-    labels = ['Low', 'Medium', 'High']
+    # Handle duplicate bins by adjusting labels
+    if len(set(bins)) < len(bins):  # Check for duplicate bin edges
+        if low_threshold == high_threshold:
+            # Collapse to two bins (Low and High)
+            bins = [np.min(series) - 1, low_threshold, np.max(series)]
+            labels = ['Low', 'High']
+        else:
+            # Collapse to three unique bins
+            bins = sorted(set(bins))  # Remove duplicate edges
+            labels = [f'Bin {i + 1}' for i in range(len(bins) - 1)]
+    else:
+        # Standard case: Low, Medium, High
+        labels = ['Low', 'Medium', 'High']
 
     return {
         'type': 'bin',
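As a rough sanity check of the new duplicate-edge branch, here is a minimal standalone sketch, not the package's guess_low_medium_high: the helper name is hypothetical, and it substitutes pandas' Series.mode() for the np.mode call shown in the diff.

import numpy as np
import pandas as pd

def bin_low_medium_high(series):
    # Hypothetical standalone version of the duplicate-edge handling above.
    series = pd.to_numeric(series, errors='coerce')
    series = series.fillna(series.mode()[0])  # most frequent value

    low_threshold = np.quantile(series, 0.33)
    high_threshold = np.quantile(series, 0.66)
    bins = [series.min() - 1, low_threshold, high_threshold, series.max()]

    if len(set(bins)) < len(bins):  # duplicate edges
        if low_threshold == high_threshold:
            bins = [series.min() - 1, low_threshold, series.max()]
            labels = ['Low', 'High']
        else:
            bins = sorted(set(bins))
            labels = [f'Bin {i + 1}' for i in range(len(bins) - 1)]
    else:
        labels = ['Low', 'Medium', 'High']

    return pd.cut(series, bins=bins, labels=labels, right=False)

# A near-constant column collapses the 33rd and 66th percentiles onto the same
# value, so the edges are deduplicated instead of being passed to pd.cut twice.
print(bin_low_medium_high(pd.Series([1, 1, 1, 1, 1, 1, 1, 50])).value_counts())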
@@ -210,7 +223,8 @@ def transform_dataframe(df, config):
             df[column],
             bins=settings['bins'],
             labels=settings['labels'],
-            right=False
+            right=False,
+
         )
         # One-hot encode the binned column
         binned_dummies = pd.get_dummies(binned, prefix=settings['prefix'])
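The change above only adds a trailing comma and a blank line inside the pd.cut call. For context, a hedged standalone sketch of the bin-then-one-hot pattern it sits in; the column name and settings are hypothetical, not the package's transform_dataframe.

import pandas as pd

df = pd.DataFrame({'AADT': [500, 1500, 8000, 12000]})  # hypothetical column
settings = {'bins': [0, 1000, 10000, 20000],
            'labels': ['Low', 'Medium', 'High'],
            'prefix': 'AADT'}

binned = pd.cut(
    df['AADT'],
    bins=settings['bins'],
    labels=settings['labels'],
    right=False,  # left-closed intervals, as in the diff
)
binned_dummies = pd.get_dummies(binned, prefix=settings['prefix'])
print(binned_dummies)  # columns: AADT_Low, AADT_Medium, AADT_High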
@@ -250,12 +264,14 @@ def guess_column_type(column_name, series):
         return {'type': 'one-hot', 'prefix': column_name}
     elif pd.api.types.is_numeric_dtype(series):
         unique_values = series.nunique()
+
         if unique_values < 5:
             return {'type': 'one-hot', 'prefix': column_name}
 
         elif np.max(series) - np.min(series) > 20:
+            print('made it through here')
             # If there are few unique values, assume binning with default bins
-            guess_low_medium_high(column_name, series)
+            return guess_low_medium_high(column_name, series)
         else:
             # # Otherwise, assume continuous data with normalization
             # Otherwise, fallback to continuous standardization
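The added return is the substantive fix in this hunk: in 0.1.130 the elif branch called guess_low_medium_high and discarded the result, so it never reached the caller. A tiny illustration with hypothetical names, not the package's code:

def _bin_config():
    # stand-in for guess_low_medium_high(...)
    return {'type': 'bin'}

def choose_without_return(spread):
    if spread > 20:
        _bin_config()              # value computed, then discarded (mimics the pre-fix call)
    return {'type': 'continuous'}  # caller only ever sees the fall-through result

def choose_with_return(spread):
    if spread > 20:
        return _bin_config()       # mimics the fixed call: the config is actually returned
    return {'type': 'continuous'}

print(choose_without_return(100))  # {'type': 'continuous'}
print(choose_with_return(100))     # {'type': 'bin'}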
metacountregressor.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: metacountregressor
-Version: 0.1.130
+Version: 0.1.133
 Summary: Extensions for a Python package for estimation of count models.
 Home-page: https://github.com/zahern/CountDataEstimation
 Author: Zeke Ahern