upgini 1.2.13a4__py3-none-any.whl → 1.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.13a4"
1
+ __version__ = "1.2.14"
upgini/autofe/binary.py CHANGED
@@ -140,13 +140,9 @@ class Distance(PandasOperand):
140
140
  has_symmetry_importance: bool = True
141
141
 
142
142
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
143
- # Handle None values by replacing them with 0 in the dot product and norm calculations
144
- left = left.apply(lambda x: np.array(x) if x is not None else np.zeros_like(right[0]))
145
- right = right.apply(lambda x: np.array(x) if x is not None else np.zeros_like(left[0]))
146
-
147
143
  return pd.Series(
148
144
  1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
149
- )
145
+ ).astype(np.float64)
150
146
 
151
147
  # row-wise dot product, handling None values
152
148
  def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
upgini/features_enricher.py CHANGED
@@ -1577,8 +1577,8 @@ class FeaturesEnricher(TransformerMixin):
1577
1577
  df = generator.generate(df)
1578
1578
  generated_features.extend(generator.generated_features)
1579
1579
 
1580
- normalizer = Normalizer(search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
1581
- df = normalizer.normalize(df)
1580
+ normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
1581
+ df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1582
1582
  columns_renaming = normalizer.columns_renaming
1583
1583
 
1584
1584
  df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
@@ -2017,10 +2017,8 @@ class FeaturesEnricher(TransformerMixin):
2017
2017
  df = generator.generate(df)
2018
2018
  generated_features.extend(generator.generated_features)
2019
2019
 
2020
- normalizer = Normalizer(
2021
- search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
2022
- )
2023
- df = normalizer.normalize(df)
2020
+ normalizer = Normalizer(self.bundle, self.logger, self.warning_counter, silent_mode)
2021
+ df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
2024
2022
  columns_renaming = normalizer.columns_renaming
2025
2023
 
2026
2024
  # Don't pass all features in backend on transform
@@ -2449,14 +2447,13 @@ class FeaturesEnricher(TransformerMixin):
2449
2447
  if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
2450
2448
  self._validate_PSI(df.sort_values(by=maybe_date_column))
2451
2449
 
2452
- self.__adjust_cv(df, maybe_date_column, self.model_task_type)
2453
-
2454
- normalizer = Normalizer(
2455
- self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
2450
+ normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
2451
+ df, self.fit_search_keys, self.fit_generated_features = normalizer.normalize(
2452
+ df, self.fit_search_keys, self.fit_generated_features
2456
2453
  )
2457
- df = normalizer.normalize(df)
2458
- columns_renaming = normalizer.columns_renaming
2459
- self.fit_columns_renaming = columns_renaming
2454
+ self.fit_columns_renaming = normalizer.columns_renaming
2455
+
2456
+ self.__adjust_cv(df)
2460
2457
 
2461
2458
  df = remove_fintech_duplicates(
2462
2459
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
@@ -2470,7 +2467,7 @@ class FeaturesEnricher(TransformerMixin):
2470
2467
  self.df_with_original_index = df.copy()
2471
2468
  # TODO check maybe need to drop _time column from df_with_original_index
2472
2469
 
2473
- df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
2470
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
2474
2471
 
2475
2472
  # Convert EMAIL to HEM after unnesting to do it only with one column
2476
2473
  email_column = self._get_email_column(self.fit_search_keys)
@@ -2480,7 +2477,7 @@ class FeaturesEnricher(TransformerMixin):
2480
2477
  email_column,
2481
2478
  hem_column,
2482
2479
  self.fit_search_keys,
2483
- columns_renaming,
2480
+ self.fit_columns_renaming,
2484
2481
  list(unnest_search_keys.keys()),
2485
2482
  self.logger,
2486
2483
  )
@@ -2491,7 +2488,7 @@ class FeaturesEnricher(TransformerMixin):
2491
2488
  converter = IpSearchKeyConverter(
2492
2489
  ip_column,
2493
2490
  self.fit_search_keys,
2494
- columns_renaming,
2491
+ self.fit_columns_renaming,
2495
2492
  list(unnest_search_keys.keys()),
2496
2493
  self.bundle,
2497
2494
  self.logger,
@@ -2522,7 +2519,7 @@ class FeaturesEnricher(TransformerMixin):
2522
2519
  features_columns = [c for c in df.columns if c not in non_feature_columns]
2523
2520
 
2524
2521
  features_to_drop = FeaturesValidator(self.logger).validate(
2525
- df, features_columns, self.generate_features, self.warning_counter, columns_renaming
2522
+ df, features_columns, self.generate_features, self.warning_counter, self.fit_columns_renaming
2526
2523
  )
2527
2524
  self.fit_dropped_features.update(features_to_drop)
2528
2525
  df = df.drop(columns=features_to_drop)
@@ -2563,7 +2560,7 @@ class FeaturesEnricher(TransformerMixin):
2563
2560
  rest_client=self.rest_client,
2564
2561
  logger=self.logger,
2565
2562
  )
2566
- dataset.columns_renaming = columns_renaming
2563
+ dataset.columns_renaming = self.fit_columns_renaming
2567
2564
 
2568
2565
  self.passed_features = [
2569
2566
  column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
@@ -2710,24 +2707,24 @@ class FeaturesEnricher(TransformerMixin):
2710
2707
  if not self.warning_counter.has_warnings():
2711
2708
  self.__display_support_link(self.bundle.get("all_ok_community_invite"))
2712
2709
 
2713
- def __adjust_cv(self, df: pd.DataFrame, date_column: pd.Series, model_task_type: ModelTaskType):
2710
+ def __adjust_cv(self, df: pd.DataFrame):
2711
+ date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2714
2712
  # Check Multivariate time series
2715
2713
  if (
2716
2714
  self.cv is None
2717
2715
  and date_column
2718
- and model_task_type == ModelTaskType.REGRESSION
2716
+ and self.model_task_type == ModelTaskType.REGRESSION
2719
2717
  and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
2720
2718
  and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
2721
2719
  ):
2722
2720
  msg = self.bundle.get("multivariate_timeseries_detected")
2723
2721
  self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
2724
- elif (
2725
- self.cv is None
2726
- and model_task_type != ModelTaskType.REGRESSION
2727
- and self._get_group_columns(df, self.fit_search_keys)
2728
- ):
2722
+ elif self.cv is None and self.model_task_type != ModelTaskType.REGRESSION:
2729
2723
  msg = self.bundle.get("group_k_fold_in_classification")
2730
2724
  self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
2725
+ group_columns = self._get_group_columns(df, self.fit_search_keys)
2726
+ self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
2727
+ self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
2731
2728
 
2732
2729
  def __override_cv(self, cv: CVType, msg: str, print_warning: bool = True):
2733
2730
  if print_warning:
upgini/normalizer/normalize_utils.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import hashlib
2
2
  from logging import Logger, getLogger
3
- from typing import Dict, List
3
+ from typing import Dict, List, Tuple
4
4
 
5
5
  import numpy as np
6
6
  import pandas as pd
@@ -35,22 +35,25 @@ class Normalizer:
35
35
 
36
36
  def __init__(
37
37
  self,
38
- search_keys: Dict[str, SearchKey],
39
- generated_features: List[str],
40
38
  bundle: ResourceBundle = None,
41
39
  logger: Logger = None,
42
40
  warnings_counter: WarningCounter = None,
43
41
  silent_mode=False,
44
42
  ):
45
- self.search_keys = search_keys
46
- self.generated_features = generated_features
47
43
  self.bundle = bundle or get_custom_bundle()
48
44
  self.logger = logger or getLogger()
49
45
  self.warnings_counter = warnings_counter or WarningCounter()
50
46
  self.silent_mode = silent_mode
51
47
  self.columns_renaming = {}
48
+ self.search_keys = {}
49
+ self.generated_features = []
50
+
51
+ def normalize(
52
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
53
+ ) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
54
+ self.search_keys = search_keys.copy()
55
+ self.generated_features = generated_features.copy()
52
56
 
53
- def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
54
57
  df = df.copy()
55
58
  df = self._rename_columns(df)
56
59
 
@@ -68,21 +71,25 @@ class Normalizer:
68
71
 
69
72
  df = self.__convert_features_types(df)
70
73
 
71
- return df
74
+ return df, self.search_keys, self.generated_features
72
75
 
73
76
  def _rename_columns(self, df: pd.DataFrame):
74
77
  # logger.info("Replace restricted symbols in column names")
75
78
  new_columns = []
76
79
  dup_counter = 0
77
80
  for column in df.columns:
78
- if column in [
79
- TARGET,
80
- EVAL_SET_INDEX,
81
- SYSTEM_RECORD_ID,
82
- ENTITY_SYSTEM_RECORD_ID,
83
- SEARCH_KEY_UNNEST,
84
- DateTimeSearchKeyConverter.DATETIME_COL,
85
- ] + self.generated_features:
81
+ if (
82
+ column
83
+ in [
84
+ TARGET,
85
+ EVAL_SET_INDEX,
86
+ SYSTEM_RECORD_ID,
87
+ ENTITY_SYSTEM_RECORD_ID,
88
+ SEARCH_KEY_UNNEST,
89
+ DateTimeSearchKeyConverter.DATETIME_COL,
90
+ ]
91
+ + self.generated_features
92
+ ):
86
93
  self.columns_renaming[column] = column
87
94
  new_columns.append(column)
88
95
  continue
{upgini-1.2.13a4.dist-info → upgini-1.2.14.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.13a4
3
+ Version: 1.2.14
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
{upgini-1.2.13a4.dist-info → upgini-1.2.14.dist-info}/RECORD CHANGED
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=FCz2XUQlXzrSJIZwpE7MVdBagpn7lwlAw754ujNNr2Q,25
1
+ upgini/__about__.py,sha256=mxzjGyB-ihJR05pHYcBiUe_XT5X9wj6cBHLxOXBhAeM,23
2
2
  upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=HJJZbZScVrl6ugDBQE71m7om5-ahvMyEnAqZNw-OEJ0,188058
6
+ upgini/features_enricher.py,sha256=vRC7g6n6XQxSrvzXk6NJjP0ZytDQhWR4sTAo4Hp7gmA,188319
7
7
  upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
@@ -15,7 +15,7 @@ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9Jvf
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
18
- upgini/autofe/binary.py,sha256=i2Y0uAOXVORt-RgnkO0gM7jZz2l5j3jqYz_yBOT2gxk,7927
18
+ upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
19
19
  upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
20
20
  upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
21
21
  upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
@@ -27,7 +27,7 @@ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lY
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
28
28
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
29
29
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
30
+ upgini/normalizer/normalize_utils.py,sha256=Lv75lq7M46z9cAIutwkdKZtPZkWblgoRzToAJ1BwY8A,7709
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
33
  upgini/resource_bundle/strings.properties,sha256=9kvmcUrsSFUCrzOiN0Ozf-lQ2H8Igz5gATUPoHMOaU4,26456
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.2.13a4.dist-info/METADATA,sha256=WqeEXF0ava0eEP1wD566T8AIgMX4V74H404kcc7k0DY,48579
61
- upgini-1.2.13a4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.2.13a4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.2.13a4.dist-info/RECORD,,
60
+ upgini-1.2.14.dist-info/METADATA,sha256=xIbSnwYAnie1HKKRe_6MEUGG0BaZRtvWAsRlc2vzlBw,48577
61
+ upgini-1.2.14.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
62
+ upgini-1.2.14.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.2.14.dist-info/RECORD,,
{upgini-1.2.13a4.dist-info → upgini-1.2.14.dist-info}/WHEEL CHANGED
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any