upgini 1.1.269__py3-none-any.whl → 1.1.273__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/autofe/all_operands.py CHANGED
@@ -1,5 +1,5 @@
1
1
  from typing import Dict
2
- from upgini.autofe.date import DateDiff, DateDiffType2
2
+ from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
3
3
  from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
4
4
  from upgini.autofe.operand import Operand
5
5
  from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
@@ -38,6 +38,16 @@ ALL_OPERANDS: Dict[str, Operand] = {
38
38
  Sim(),
39
39
  DateDiff(),
40
40
  DateDiffType2(),
41
+ DateListDiff(aggregation="min"),
42
+ DateListDiff(aggregation="max"),
43
+ DateListDiff(aggregation="mean"),
44
+ DateListDiff(aggregation="nunique"),
45
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
46
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
47
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
48
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
49
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
50
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
41
51
  ]
42
52
  }
43
53
 
upgini/autofe/date.py CHANGED
@@ -1,11 +1,12 @@
1
- from typing import Optional, Union
1
+ from typing import Any, Optional, Union
2
2
  import numpy as np
3
3
  import pandas as pd
4
+ from pydantic import BaseModel
4
5
 
5
6
  from upgini.autofe.operand import PandasOperand
6
7
 
7
8
 
8
- class DateDiffMixin:
9
+ class DateDiffMixin(BaseModel):
9
10
  diff_unit: str = "D"
10
11
  left_unit: Optional[str] = None
11
12
  right_unit: Optional[str] = None
@@ -38,7 +39,6 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
38
39
  name = "date_diff_type2"
39
40
  is_binary = True
40
41
  has_symmetry_importance = True
41
- is_vectorizable = False
42
42
 
43
43
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
44
44
  left = self._convert_to_date(left, self.left_unit)
@@ -51,3 +51,60 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
51
51
  diff = (future - left) / np.timedelta64(1, self.diff_unit)
52
52
 
53
53
  return diff
54
+
55
+
56
+ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
57
+
58
+
59
+ class DateListDiff(PandasOperand, DateDiffMixin):
60
+ is_binary = True
61
+ has_symmetry_importance = True
62
+ aggregation: str
63
+
64
+ def __init__(self, **data: Any) -> None:
65
+ if "name" not in data:
66
+ data["name"] = f"date_diff_{data.get('aggregation')}"
67
+ super().__init__(**data)
68
+
69
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
70
+ left = self._convert_to_date(left, self.left_unit)
71
+ right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
72
+
73
+ return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
74
+
75
+ def _diff(self, x):
76
+ x = x / np.timedelta64(1, self.diff_unit)
77
+ return x[x > 0]
78
+
79
+ def _agg(self, x):
80
+ method = getattr(np, self.aggregation, None)
81
+ default = np.nan
82
+ if method is None and self.aggregation in _ext_aggregations:
83
+ method, default = _ext_aggregations[self.aggregation]
84
+ elif not callable(method):
85
+ raise ValueError(f"Unsupported aggregation: {self.aggregation}")
86
+
87
+ return method(x) if len(x) > 0 else default
88
+
89
+
90
+ class DateListDiffBounded(DateListDiff):
91
+ lower_bound: Optional[int]
92
+ upper_bound: Optional[int]
93
+
94
+ def __init__(self, **data: Any) -> None:
95
+ if "name" not in data:
96
+ lower_bound = data.get("lower_bound")
97
+ upper_bound = data.get("upper_bound")
98
+ components = [
99
+ "date_diff",
100
+ data.get("diff_unit"),
101
+ str(lower_bound if lower_bound is not None else "minusinf"),
102
+ str(upper_bound if upper_bound is not None else "plusinf"),
103
+ ]
104
+ components.append(data.get("aggregation"))
105
+ data["name"] = "_".join(components)
106
+ super().__init__(**data)
107
+
108
+ def _agg(self, x):
109
+ x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
110
+ return super()._agg(x)
upgini/data_source/data_source_publisher.py CHANGED
@@ -72,8 +72,8 @@ class DataSourcePublisher:
72
72
  )
73
73
  if search_keys is None or len(search_keys) == 0:
74
74
  raise ValidationError("Empty search keys")
75
- if SearchKey.DATE in search_keys.values() and date_format is None:
76
- raise ValidationError("date_format is required for DATE search key")
75
+ # if SearchKey.DATE in search_keys.values() and date_format is None:
76
+ # raise ValidationError("date_format is required for DATE search key")
77
77
  if update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
78
78
  raise ValidationError(
79
79
  f"Invalid update frequency: {update_frequency}. "
@@ -85,11 +85,19 @@ class DataSourcePublisher:
85
85
  or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
86
86
  ) and sort_column is None:
87
87
  raise ValidationError("Sort column is required for passed search keys")
88
+ if (
89
+ set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
90
+ and snapshot_frequency_days is None
91
+ and join_date_abs_limit_days is None
92
+ ):
93
+ raise ValidationError(
94
+ "With MSISDN and DATE keys one of the snapshot_frequency_days or"
95
+ " join_date_abs_limit_days parameters is required"
96
+ )
88
97
 
89
98
  request = {
90
99
  "dataTableUri": data_table_uri,
91
100
  "searchKeys": {k: v.value.value for k, v in search_keys.items()},
92
- "dateFormat": date_format,
93
101
  "excludeColumns": exclude_columns,
94
102
  "hashFeatureNames": str(hash_feature_names).lower(),
95
103
  "snapshotFrequencyDays": snapshot_frequency_days,
@@ -98,6 +106,8 @@ class DataSourcePublisher:
98
106
  "featuresForEmbeddings": features_for_embeddings,
99
107
  "forceGeneration": str(_force_generation).lower(),
100
108
  }
109
+ if date_format is not None:
110
+ request["dateFormat"] = date_format
101
111
  if secondary_search_keys is not None:
102
112
  request["secondarySearchKeys"] = {k: v.value.value for k, v in secondary_search_keys.items()}
103
113
  if sort_column is not None:
upgini/features_enricher.py CHANGED
@@ -424,7 +424,7 @@ class FeaturesEnricher(TransformerMixin):
424
424
  self.X = X
425
425
  self.y = y
426
426
  self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
427
- self.dump_input(trace_id, X, y, eval_set)
427
+ self.dump_input(trace_id, X, y, self.eval_set)
428
428
  self.__inner_fit(
429
429
  trace_id,
430
430
  X,
@@ -563,7 +563,7 @@ class FeaturesEnricher(TransformerMixin):
563
563
  self.X = X
564
564
  self.y = y
565
565
  self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
566
- self.dump_input(trace_id, X, y, eval_set)
566
+ self.dump_input(trace_id, X, y, self.eval_set)
567
567
 
568
568
  if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
569
569
  raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS))
@@ -823,12 +823,16 @@ class FeaturesEnricher(TransformerMixin):
823
823
  print(msg)
824
824
 
825
825
  self.__validate_search_keys(self.search_keys, self.search_id)
826
+ effective_X = X if X is not None else self.X
827
+ effective_y = y if y is not None else self.y
828
+ effective_eval_set = eval_set if eval_set is not None else self.eval_set
829
+ effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
826
830
 
827
831
  try:
828
832
  self.__log_debug_information(
829
- X if X is not None else self.X,
830
- y if y is not None else self.y,
831
- eval_set if eval_set is not None else self.eval_set,
833
+ effective_X,
834
+ effective_y,
835
+ effective_eval_set,
832
836
  exclude_features_sources=exclude_features_sources,
833
837
  cv=cv if cv is not None else self.cv,
834
838
  importance_threshold=importance_threshold,
@@ -842,17 +846,14 @@ class FeaturesEnricher(TransformerMixin):
842
846
  self._search_task is None
843
847
  or self._search_task.provider_metadata_v2 is None
844
848
  or len(self._search_task.provider_metadata_v2) == 0
845
- or (self.X is None and X is None)
846
- or (self.y is None and y is None)
849
+ or effective_X is None
850
+ or effective_y is None
847
851
  ):
848
852
  raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
849
853
 
850
854
  if X is not None and y is None:
851
855
  raise ValidationError("X passed without y")
852
856
 
853
- effective_X = X if X is not None else self.X
854
- effective_eval_set = eval_set if eval_set is not None else self.eval_set
855
-
856
857
  validate_scoring_argument(scoring)
857
858
 
858
859
  self._validate_baseline_score(effective_X, effective_eval_set)
@@ -872,8 +873,7 @@ class FeaturesEnricher(TransformerMixin):
872
873
  ):
873
874
  cat_features = estimator.get_param("cat_features")
874
875
  if len(cat_features) > 0 and isinstance(cat_features[0], int):
875
- effectiveX = X or self.X
876
- cat_features = [effectiveX.columns[i] for i in cat_features]
876
+ cat_features = [effective_X.columns[i] for i in cat_features]
877
877
  for cat_feature in cat_features:
878
878
  if cat_feature in self.search_keys:
879
879
  if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
@@ -883,9 +883,9 @@ class FeaturesEnricher(TransformerMixin):
883
883
 
884
884
  prepared_data = self._prepare_data_for_metrics(
885
885
  trace_id=trace_id,
886
- X=X,
887
- y=y,
888
- eval_set=eval_set,
886
+ X=effective_X,
887
+ y=effective_y,
888
+ eval_set=effective_eval_set,
889
889
  exclude_features_sources=exclude_features_sources,
890
890
  importance_threshold=importance_threshold,
891
891
  max_features=max_features,
@@ -995,8 +995,6 @@ class FeaturesEnricher(TransformerMixin):
995
995
  enriched_metric = None
996
996
  uplift = None
997
997
 
998
- effective_X = X if X is not None else self.X
999
- effective_y = y if y is not None else self.y
1000
998
  train_metrics = {
1001
999
  self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
1002
1000
  "quality_metrics_train_segment"
@@ -2823,6 +2821,7 @@ class FeaturesEnricher(TransformerMixin):
2823
2821
 
2824
2822
  maybe_date_col = self._get_date_column(self.search_keys)
2825
2823
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
2824
+ # TODO cast date column to single dtype
2826
2825
  min_date = X[maybe_date_col].min()
2827
2826
  max_date = X[maybe_date_col].max()
2828
2827
  self.logger.info(f"Dates interval is ({min_date}, {max_date})")
upgini/resource_bundle/strings.properties CHANGED
@@ -203,7 +203,7 @@ phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`.
203
203
  target_type_detected=\nDetected task type: {}\n
204
204
  # all_ok_community_invite=Chat with us in Slack community:
205
205
  all_ok_community_invite=❓ Support request
206
- too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
206
+ too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
207
207
  imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
208
208
  loss_selection_info=Using loss `{}` for feature selection
209
209
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
upgini-1.1.269.dist-info/METADATA → upgini-1.1.273.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.269
3
+ Version: 1.1.273
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
upgini-1.1.269.dist-info/RECORD → upgini-1.1.273.dist-info/RECORD CHANGED
@@ -2,7 +2,7 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
3
  upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
4
4
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
5
- upgini/features_enricher.py,sha256=hdI3dRDyg9rKMGK3IyRTMTDxESEbF1xmtH6dp8k3srw,174132
5
+ upgini/features_enricher.py,sha256=LPYSCGq89WLaL5iQNikTyhICUs_APtqEvhn5XRENn1U,174105
6
6
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
7
7
  upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
8
8
  upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
@@ -13,23 +13,23 @@ upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1
13
13
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
14
14
  upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0PAduvetU,2646
15
15
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- upgini/autofe/all_operands.py,sha256=Nb7Fu4owDNy9gKbJN88c1DxODNtEEGAhiLT1-Eoc9yI,1587
16
+ upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
17
17
  upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
18
- upgini/autofe/date.py,sha256=lrZ5xpQO2L0c2bPta3EMdd1v5czDH_WY08Ww1s50t4w,1824
18
+ upgini/autofe/date.py,sha256=cc0GMAJR0QZOI_Qp2V5UDklaXLNS_79O1GhU6GlOYzg,3895
19
19
  upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
20
20
  upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
21
21
  upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
22
22
  upgini/autofe/unary.py,sha256=YRTzQLttbDdOnkogWBPnBexpu7uHWSLSFAxSCu3iFdY,3145
23
23
  upgini/autofe/vector.py,sha256=5qhI_bdwaWM1l7fgCkx1tMt9R9gxWzoYCl-7WO4KiOs,604
24
24
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
- upgini/data_source/data_source_publisher.py,sha256=QASEDhJ9SxJKcWxoN2vUPxrM_HTlwKQOPa92L7EQneA,15962
25
+ upgini/data_source/data_source_publisher.py,sha256=J2lrpPuysUHPeqTSfoybBtPRTBCFu7R5KzaakhjaRDc,16485
26
26
  upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
27
27
  upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
28
28
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
29
  upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
30
30
  upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
31
31
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
32
- upgini/resource_bundle/strings.properties,sha256=00KNv1A3rxXioktqB9o_V_zX0etC2LZO7NBIEsCoNNQ,26087
32
+ upgini/resource_bundle/strings.properties,sha256=TM9OykiEXNpcgFN3DpqBGbQs4N9m4mzHBn-k6aazc30,26111
33
33
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
34
34
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
35
  upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
@@ -56,8 +56,8 @@ upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,4
56
56
  upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
57
57
  upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
58
58
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
59
- upgini-1.1.269.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
60
- upgini-1.1.269.dist-info/METADATA,sha256=eabO8mMQA4qAV37lMnBhxe2gpllcmOWFI65Hhb7b5Ec,48156
61
- upgini-1.1.269.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
62
- upgini-1.1.269.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
63
- upgini-1.1.269.dist-info/RECORD,,
59
+ upgini-1.1.273.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
60
+ upgini-1.1.273.dist-info/METADATA,sha256=Omoz12LfHouVHSu4OlfpbPbHZJ4ZXW5K1bTUo3jFswg,48156
61
+ upgini-1.1.273.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
62
+ upgini-1.1.273.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
63
+ upgini-1.1.273.dist-info/RECORD,,
upgini-1.1.269.dist-info/WHEEL → upgini-1.1.273.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: bdist_wheel (0.42.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5